test_common/harness/ThreadPool.cpp - external/github.com/KhronosGroup/OpenCL-CTS - Git at Google

 //
 // Copyright (c) 2017 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 #include "ThreadPool.h"
 #include "errorHelpers.h"
 #include "fpcontrol.h"
 #include <stdio.h>
 #include <stdlib.h>

 #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
 // or any other POSIX system

 #if defined(_WIN32)
 #include <windows.h>
 #if defined(_MSC_VER)
 #include <intrin.h>
 #endif
 #include "mingw_compat.h"
 #include <process.h>
 #else // !_WIN32
 #include <pthread.h>
 #include <unistd.h>
 #include <sys/errno.h>
 #ifdef __linux__
 #include <sched.h>
 #endif
 #endif // !_WIN32

 // declarations
 #ifdef _WIN32
 void ThreadPool_WorkerFunc(void *p);
 #else
 void *ThreadPool_WorkerFunc(void *p);
 #endif
 void ThreadPool_Init(void);
 void ThreadPool_Exit(void);

 #if defined(__MINGW32__)
 // Mutex for implementing super heavy atomic operations if you don't have GCC or
 // MSVC
 CRITICAL_SECTION gAtomicLock;
 #elif defined(__GNUC__) || defined(_MSC_VER)
 #else
 pthread_mutex_t gAtomicLock;
 #endif

 // Atomic add operator with mem barrier.  Mem barrier needed to protect state
 // modified by the worker functions.
 cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
 {
 #if defined(__MINGW32__)
     // No atomics on Mingw32
     EnterCriticalSection(&gAtomicLock);
     cl_int old = *a;
     *a = old + b;
     LeaveCriticalSection(&gAtomicLock);
     return old;
 #elif defined(__GNUC__)
     // GCC extension:
     // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
     return __sync_fetch_and_add(a, b);
     // do we need __sync_synchronize() here, too?  GCC docs are unclear whether
     // __sync_fetch_and_add does a synchronize
 #elif defined(_MSC_VER)
     return (cl_int)_InterlockedExchangeAdd((volatile LONG *)a, (LONG)b);
 #else
 #warning Please add a atomic add implementation here, with memory barrier.  Fallback code is slow.
     if (pthread_mutex_lock(&gAtomicLock))
         log_error("Atomic operation failed. pthread_mutex_lock(&gAtomicLock) "
                   "returned an error\n");
     cl_int old = *a;
     *a = old + b;
     if (pthread_mutex_unlock(&gAtomicLock))
         log_error("Failed to release gAtomicLock. Further atomic operations "
                   "may deadlock!\n");
     return old;
 #endif
 }

 #if defined(_WIN32)
 // Uncomment the following line if Windows XP support is not required.
 // #define HAS_INIT_ONCE_EXECUTE_ONCE 1

 #if defined(HAS_INIT_ONCE_EXECUTE_ONCE)
 #define _INIT_ONCE INIT_ONCE
 #define _PINIT_ONCE PINIT_ONCE
 #define _InitOnceExecuteOnce InitOnceExecuteOnce
 #else // !HAS_INIT_ONCE_EXECUTE_ONCE

 typedef volatile LONG _INIT_ONCE;
 typedef _INIT_ONCE *_PINIT_ONCE;
 typedef BOOL(CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *);

 #define _INIT_ONCE_UNINITIALIZED 0
 #define _INIT_ONCE_IN_PROGRESS 1
 #define _INIT_ONCE_DONE 2

 static BOOL _InitOnceExecuteOnce(_PINIT_ONCE InitOnce, _PINIT_ONCE_FN InitFn,
                                  PVOID Parameter, LPVOID *Context)
 {
     while (*InitOnce != _INIT_ONCE_DONE)
     {
         if (*InitOnce != _INIT_ONCE_IN_PROGRESS
             && _InterlockedCompareExchange(InitOnce, _INIT_ONCE_IN_PROGRESS,
                                            _INIT_ONCE_UNINITIALIZED)
                 == _INIT_ONCE_UNINITIALIZED)
         {
             InitFn(InitOnce, Parameter, Context);
             *InitOnce = _INIT_ONCE_DONE;
             return TRUE;
         }
         Sleep(1);
     }
     return TRUE;
 }
 #endif // !HAS_INIT_ONCE_EXECUTE_ONCE

 // Uncomment the following line if Windows XP support is not required.
 // #define HAS_CONDITION_VARIABLE 1

 #if defined(HAS_CONDITION_VARIABLE)
 #define _CONDITION_VARIABLE CONDITION_VARIABLE
 #define _InitializeConditionVariable InitializeConditionVariable
 #define _SleepConditionVariableCS SleepConditionVariableCS
 #define _WakeAllConditionVariable WakeAllConditionVariable
 #else // !HAS_CONDITION_VARIABLE
 typedef struct
 {
     HANDLE mEvent; // Used to park the thread.
     // Used to protect mWaiters, mGeneration and mReleaseCount:
     CRITICAL_SECTION mLock[1];
     volatile cl_int mWaiters; // Number of threads waiting on this cond var.
     volatile cl_int mGeneration; // Wait generation count.
     volatile cl_int mReleaseCount; // Number of releases to execute before
                                    // reseting the event.
 } _CONDITION_VARIABLE;

 typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE;

 static void _InitializeConditionVariable(_PCONDITION_VARIABLE cond_var)
 {
     cond_var->mEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
     InitializeCriticalSection(cond_var->mLock);
     cond_var->mWaiters = 0;
     cond_var->mGeneration = 0;
 #if !defined(NDEBUG)
     cond_var->mReleaseCount = 0;
 #endif // !NDEBUG
 }

 static void _SleepConditionVariableCS(_PCONDITION_VARIABLE cond_var,
                                       PCRITICAL_SECTION cond_lock,
                                       DWORD ignored)
 {
     EnterCriticalSection(cond_var->mLock);
     cl_int generation = cond_var->mGeneration;
     ++cond_var->mWaiters;
     LeaveCriticalSection(cond_var->mLock);
     LeaveCriticalSection(cond_lock);

     while (TRUE)
     {
         WaitForSingleObject(cond_var->mEvent, INFINITE);
         EnterCriticalSection(cond_var->mLock);
         BOOL done =
             cond_var->mReleaseCount > 0 && cond_var->mGeneration != generation;
         LeaveCriticalSection(cond_var->mLock);
         if (done)
         {
             break;
         }
     }

     EnterCriticalSection(cond_lock);
     EnterCriticalSection(cond_var->mLock);
     if (--cond_var->mReleaseCount == 0)
     {
         ResetEvent(cond_var->mEvent);
     }
     --cond_var->mWaiters;
     LeaveCriticalSection(cond_var->mLock);
 }

 static void _WakeAllConditionVariable(_PCONDITION_VARIABLE cond_var)
 {
     EnterCriticalSection(cond_var->mLock);
     if (cond_var->mWaiters > 0)
     {
         ++cond_var->mGeneration;
         cond_var->mReleaseCount = cond_var->mWaiters;
         SetEvent(cond_var->mEvent);
     }
     LeaveCriticalSection(cond_var->mLock);
 }
 #endif // !HAS_CONDITION_VARIABLE
 #endif // _WIN32

 #define MAX_COUNT (1 << 29)

 // Global state to coordinate whether the threads have been launched
 // successfully or not
 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
 static _INIT_ONCE threadpool_init_control;
 #elif defined(_WIN32) // MingW of XP
 static int threadpool_init_control;
 #else // Posix platforms
 pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT;
 #endif
 cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch

 // critical region lock around ThreadPool_Do.  We can only run one ThreadPool_Do
 // at a time, because we are too lazy to set up a queue here, and don't expect
 // to need one.
 #if defined(_WIN32)
 CRITICAL_SECTION gThreadPoolLock[1];
 #else // !_WIN32
 pthread_mutex_t gThreadPoolLock;
 #endif // !_WIN32

 // Condition variable to park ThreadPool threads when not working
 #if defined(_WIN32)
 CRITICAL_SECTION cond_lock[1];
 _CONDITION_VARIABLE cond_var[1];
 #else // !_WIN32
 pthread_mutex_t cond_lock;
 pthread_cond_t cond_var;
 #endif // !_WIN32

 // Condition variable state. How many iterations on the function left to run,
 // set to CL_INT_MAX to cause worker threads to exit. Note: this value might
 // go negative.
 volatile cl_int gRunCount = 0;

 // State that only changes when the threadpool is not working.
 volatile TPFuncPtr gFunc_ptr = NULL;
 volatile void *gUserInfo = NULL;
 volatile cl_int gJobCount = 0;

 // State that may change while the thread pool is working
 volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole

 // Condition variable to park caller while waiting
 #if defined(_WIN32)
 HANDLE caller_event;
 #else // !_WIN32
 pthread_mutex_t caller_cond_lock;
 pthread_cond_t caller_cond_var;
 #endif // !_WIN32

 // # of threads intended to be running. Running threads will decrement this
 // as they discover they've run out of work to do.
 volatile cl_int gRunning = 0;

 // The total number of threads launched.
 volatile cl_int gThreadCount = 0;
 #ifdef _WIN32
 void ThreadPool_WorkerFunc(void *p)
 #else
 void *ThreadPool_WorkerFunc(void *p)
 #endif
 {
     cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1);
     cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1);
     // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );

     while (MAX_COUNT > item)
     {
         cl_int err;

         // check for more work to do
         if (0 >= item)
         {
             // log_info("Thread %d has run out of work.\n", threadID);

             // No work to do. Attempt to block waiting for work
 #if defined(_WIN32)
             EnterCriticalSection(cond_lock);
 #else // !_WIN32
             if ((err = pthread_mutex_lock(&cond_lock)))
             {
                 log_error(
                     "Error %d from pthread_mutex_lock. Worker %d unable to "
                     "block waiting for work. ThreadPool_WorkerFunc failed.\n",
                     err, threadID);
                 goto exit;
             }
 #endif // !_WIN32

             cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1);
             // log_info("ThreadPool_WorkerFunc: gRunning = %d\n",
             //          remaining - 1);
             if (1 == remaining)
             { // last thread out signal the main thread to wake up
 #if defined(_WIN32)
                 SetEvent(caller_event);
 #else // !_WIN32
                 if ((err = pthread_mutex_lock(&caller_cond_lock)))
                 {
                     log_error("Error %d from pthread_mutex_lock. Unable to "
                               "wake caller.\n",
                               err);
                     goto exit;
                 }
                 if ((err = pthread_cond_broadcast(&caller_cond_var)))
                 {
                     log_error(
                         "Error %d from pthread_cond_broadcast. Unable to wake "
                         "up main thread. ThreadPool_WorkerFunc failed.\n",
                         err);
                     goto exit;
                 }
                 if ((err = pthread_mutex_unlock(&caller_cond_lock)))
                 {
                     log_error("Error %d from pthread_mutex_lock. Unable to "
                               "wake caller.\n",
                               err);
                     goto exit;
                 }
 #endif // !_WIN32
             }

             // loop in case we are woken only to discover that some other thread
             // already did all the work
             while (0 >= item)
             {
 #if defined(_WIN32)
                 _SleepConditionVariableCS(cond_var, cond_lock, INFINITE);
 #else // !_WIN32
                 if ((err = pthread_cond_wait(&cond_var, &cond_lock)))
                 {
                     log_error(
                         "Error %d from pthread_cond_wait. Unable to block for "
                         "waiting for work. ThreadPool_WorkerFunc failed.\n",
                         err);
                     pthread_mutex_unlock(&cond_lock);
                     goto exit;
                 }
 #endif // !_WIN32

                 // try again to get a valid item id
                 item = ThreadPool_AtomicAdd(&gRunCount, -1);
                 if (MAX_COUNT <= item) // exit if we are done
                 {
 #if defined(_WIN32)
                     LeaveCriticalSection(cond_lock);
 #else // !_WIN32
                     pthread_mutex_unlock(&cond_lock);
 #endif // !_WIN32
                     goto exit;
                 }
             }

             ThreadPool_AtomicAdd(&gRunning, 1);
             // log_info("Thread %d has found work.\n", threadID);

 #if defined(_WIN32)
             LeaveCriticalSection(cond_lock);
 #else // !_WIN32
             if ((err = pthread_mutex_unlock(&cond_lock)))
             {
                 log_error(
                     "Error %d from pthread_mutex_unlock. Unable to block for "
                     "waiting for work. ThreadPool_WorkerFunc failed.\n",
                     err);
                 goto exit;
             }
 #endif // !_WIN32
         }

         // we have a valid item, so do the work
         // but only if we haven't already encountered an error
         if (CL_SUCCESS == jobError)
         {
             // log_info("Thread %d doing job %d\n", threadID, item - 1);

 #if defined(__APPLE__) && defined(__arm__)
             // On most platforms which support denorm, default is FTZ off.
             // However, on some hardware where the reference is computed,
             // default might be flush denorms to zero e.g. arm. This creates
             // issues in result verification. Since spec allows the
             // implementation to either flush or not flush denorms to zero, an
             // implementation may choose not be flush i.e. return denorm result
             // whereas reference result may be zero (flushed denorm). Hence we
             // need to disable denorm flushing on host side where reference is
             // being computed to make sure we get non-flushed reference result.
             // If implementation returns flushed result, we correctly take care
             // of that in verification code.
             FPU_mode_type oldMode;
             DisableFTZ(&oldMode);
 #endif

             // Call the user's function with this item ID
             err = gFunc_ptr(item - 1, threadID, (void *)gUserInfo);
 #if defined(__APPLE__) && defined(__arm__)
             // Restore FP state
             RestoreFPState(&oldMode);
 #endif

             if (err)
             {
 #if (__MINGW32__)
                 EnterCriticalSection(&gAtomicLock);
                 if (jobError == CL_SUCCESS) jobError = err;
                 gRunCount = 0;
                 LeaveCriticalSection(&gAtomicLock);
 #elif defined(__GNUC__)
                 // GCC extension:
                 // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
                 // set the new error if we are the first one there.
                 __sync_val_compare_and_swap(&jobError, CL_SUCCESS, err);

                 // drop run count to 0
                 gRunCount = 0;
                 __sync_synchronize();
 #elif defined(_MSC_VER)
                 // set the new error if we are the first one there.
                 _InterlockedCompareExchange((volatile LONG *)&jobError, err,
                                             CL_SUCCESS);

                 // drop run count to 0
                 gRunCount = 0;
                 _mm_mfence();
 #else
                 if (pthread_mutex_lock(&gAtomicLock))
                     log_error(
                         "Atomic operation failed. "
                         "pthread_mutex_lock(&gAtomicLock) returned an error\n");
                 if (jobError == CL_SUCCESS) jobError = err;
                 gRunCount = 0;
                 if (pthread_mutex_unlock(&gAtomicLock))
                     log_error("Failed to release gAtomicLock. Further atomic "
                               "operations may deadlock\n");
 #endif
             }
         }

         // get the next item
         item = ThreadPool_AtomicAdd(&gRunCount, -1);
     }

 exit:
     log_info("ThreadPool: thread %d exiting.\n", threadID);
     ThreadPool_AtomicAdd(&gThreadCount, -1);
 #if !defined(_WIN32)
     return NULL;
 #endif
 }

 // SetThreadCount() may be used to artifically set the number of worker threads
 // If the value is 0 (the default) the number of threads will be determined
 // based on the number of CPU cores.  If it is a unicore machine, then 2 will be
 // used, so that we still get some testing for thread safety.
 //
 // If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set then
 // the code will run single threaded, but will report an error to indicate that
 // the test is invalid.  This option is intended for debugging purposes only. It
 // is suggested as a convention that test apps set the thread count to 1 in
 // response to the -m flag.
 //
 // SetThreadCount() must be called before the first call to GetThreadCount() or
 // ThreadPool_Do(), otherwise the behavior is indefined.
 void SetThreadCount(int count)
 {
     if (threadPoolInitErr == CL_SUCCESS)
     {
         log_error("Error: It is illegal to set the thread count after the "
                   "first call to ThreadPool_Do or GetThreadCount\n");
         abort();
     }

     gThreadCount = count;
 }

 void ThreadPool_Init(void)
 {
     cl_int i;
     int err;
     volatile cl_uint threadID = 0;

     // Check for manual override of multithreading code. We add this for better
     // debuggability.
     if (getenv("CL_TEST_SINGLE_THREADED"))
     {
         log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. "
                   "Running single threaded.\n*** TEST IS INVALID! ***\n");
         gThreadCount = 1;
         return;
     }

     // Figure out how many threads to run -- check first for non-zero to give
     // the implementation the chance
     if (0 == gThreadCount)
     {
 #if defined(_MSC_VER) || defined(__MINGW64__)
         PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
         DWORD length = 0;

         GetLogicalProcessorInformation(NULL, &length);
         buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
         if (buffer != NULL)
         {
             if (GetLogicalProcessorInformation(buffer, &length) == TRUE)
             {
                 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
                 while (
                     ptr
                     < &buffer[length
                               / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)])
                 {
                     if (ptr->Relationship == RelationProcessorCore)
                     {
                         // Count the number of bits in ProcessorMask (number of
                         // logical cores)
                         ULONG mask = ptr->ProcessorMask;
                         while (mask)
                         {
                             ++gThreadCount;
                             mask &= mask - 1; // Remove 1 bit at a time
                         }
                     }
                     ++ptr;
                 }
             }
             free(buffer);
         }
 #elif defined(__MINGW32__)
         {
 #warning How about this, instead of hard coding it to 2?
             SYSTEM_INFO sysinfo;
             GetSystemInfo(&sysinfo);
             gThreadCount = sysinfo.dwNumberOfProcessors;
         }
 #elif defined(__linux__) && !defined(__ANDROID__)
         cpu_set_t affinity;
         if (0 == sched_getaffinity(0, sizeof(cpu_set_t), &affinity))
         {
 #if !(defined(CPU_COUNT))
             gThreadCount = 1;
 #else
             gThreadCount = CPU_COUNT(&affinity);
 #endif
         }
         else
         {
             // Hopefully your system returns logical cpus here, as does MacOS X
             gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
         }
 #else /* !_WIN32 */
         // Hopefully your system returns logical cpus here, as does MacOS X
         gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
 #endif // !_WIN32

         // Multithreaded tests are required to run multithreaded even on unicore
         // systems so as to test thread safety
         if (1 == gThreadCount) gThreadCount = 2;
     }

 // When working in 32 bit limit the thread number to 12
 // This fix was made due to memory issues in integer_ops test
 // When running integer_ops, the test opens as many threads as the
 // machine has and each thread allocates a fixed amount of memory
 // When running this test on dual socket machine in 32-bit, the
 // process memory is not sufficient and the test fails
 #if defined(_WIN32) && !defined(_M_X64)
     if (gThreadCount > 12)
     {
         gThreadCount = 12;
     }
 #endif

     // Allow the app to set thread count to <0 for debugging purposes.
     // This will cause the test to run single threaded.
     if (gThreadCount < 2)
     {
         log_error("ERROR: Running single threaded because thread count < 2. "
                   "\n*** TEST IS INVALID! ***\n");
         gThreadCount = 1;
         return;
     }

 #if defined(_WIN32)
     InitializeCriticalSection(gThreadPoolLock);
     InitializeCriticalSection(cond_lock);
     _InitializeConditionVariable(cond_var);
     caller_event = CreateEvent(NULL, FALSE, FALSE, NULL);
 #elif defined(__GNUC__)
     // Dont rely on PTHREAD_MUTEX_INITIALIZER for intialization of a mutex since
     // it might cause problem with some flavors of gcc compilers.
     pthread_cond_init(&cond_var, NULL);
     pthread_mutex_init(&cond_lock, NULL);
     pthread_cond_init(&caller_cond_var, NULL);
     pthread_mutex_init(&caller_cond_lock, NULL);
     pthread_mutex_init(&gThreadPoolLock, NULL);
 #endif

 #if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__))
     pthread_mutex_initialize(gAtomicLock);
 #elif defined(__MINGW32__)
     InitializeCriticalSection(&gAtomicLock);
 #endif
     // Make sure the last thread done in the work pool doesn't signal us to wake
     // before we get to the point where we are supposed to wait
     //  That would cause a deadlock.
 #if !defined(_WIN32)
     if ((err = pthread_mutex_lock(&caller_cond_lock)))
     {
         log_error("Error %d from pthread_mutex_lock. Unable to block for work "
                   "to finish. ThreadPool_Init failed.\n",
                   err);
         gThreadCount = 1;
         return;
     }
 #endif // !_WIN32

     gRunning = gThreadCount;
     // init threads
     for (i = 0; i < gThreadCount; i++)
     {
 #if defined(_WIN32)
         uintptr_t handle =
             _beginthread(ThreadPool_WorkerFunc, 0, (void *)&threadID);
         err = (handle == 0);
 #else // !_WIN32
         pthread_t tid = 0;
         err = pthread_create(&tid, NULL, ThreadPool_WorkerFunc,
                              (void *)&threadID);
 #endif // !_WIN32
         if (err)
         {
             log_error("Error %d launching thread %d\n", err, i);
             threadPoolInitErr = err;
             gThreadCount = i;
             break;
         }
     }

     atexit(ThreadPool_Exit);

     // block until they are done launching.
     do
     {
 #if defined(_WIN32)
         WaitForSingleObject(caller_event, INFINITE);
 #else // !_WIN32
         if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
         {
             log_error("Error %d from pthread_cond_wait. Unable to block for "
                       "work to finish. ThreadPool_Init failed.\n",
                       err);
             pthread_mutex_unlock(&caller_cond_lock);
             return;
         }
 #endif // !_WIN32
     } while (gRunCount != -gThreadCount);
 #if !defined(_WIN32)
     if ((err = pthread_mutex_unlock(&caller_cond_lock)))
     {
         log_error("Error %d from pthread_mutex_unlock. Unable to block for "
                   "work to finish. ThreadPool_Init failed.\n",
                   err);
         return;
     }
 #endif // !_WIN32

     threadPoolInitErr = CL_SUCCESS;
 }

 #if defined(_MSC_VER)
 static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter,
                                       PVOID *lpContex)
 {
     ThreadPool_Init();
     return TRUE;
 }
 #endif

 void ThreadPool_Exit(void)
 {
     int err, count;
     gRunCount = CL_INT_MAX;

 #if defined(__GNUC__)
     // GCC extension:
     // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
     __sync_synchronize();
 #elif defined(_MSC_VER)
     _mm_mfence();
 #else
 #warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
 #endif

     // spin waiting for threads to die
     for (count = 0; 0 != gThreadCount && count < 1000; count++)
     {
 #if defined(_WIN32)
         _WakeAllConditionVariable(cond_var);
         Sleep(1);
 #else // !_WIN32
         if ((err = pthread_cond_broadcast(&cond_var)))
         {
             log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
                       "work threads. ThreadPool_Exit failed.\n",
                       err);
             break;
         }
         usleep(1000);
 #endif // !_WIN32
     }

     if (gThreadCount)
         log_error("Error: Thread pool timed out after 1 second with %d threads "
                   "still active.\n",
                   gThreadCount);
     else
         log_info("Thread pool exited in a orderly fashion.\n");
 }


 // Blocking API that farms out count jobs to a thread pool.
 // It may return with some work undone if func_ptr() returns a non-zero
 // result.
 //
 // This function obviously has its shortcommings. Only one call to ThreadPool_Do
 // can be running at a time. It is not intended for general purpose use.
 // If clEnqueueNativeKernelFn, out of order queues and a CL_DEVICE_TYPE_CPU were
 // all available then it would make more sense to use those features.
 cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
 {
     cl_int newErr;
     cl_int err = 0;
     // Lazily set up our threads
 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
     err = !_InitOnceExecuteOnce(&threadpool_init_control, _ThreadPool_Init,
                                 NULL, NULL);
 #elif defined(_WIN32)
     if (threadpool_init_control == 0)
     {
 #warning This is buggy and race prone.  Find a better way.
         ThreadPool_Init();
         threadpool_init_control = 1;
     }
 #else // posix platform
     err = pthread_once(&threadpool_init_control, ThreadPool_Init);
     if (err)
     {
         log_error("Error %d from pthread_once. Unable to init threads. "
                   "ThreadPool_Do failed.\n",
                   err);
         return err;
     }
 #endif
     // Single threaded code to handle case where threadpool wasn't allocated or
     // was disabled by environment variable
     if (threadPoolInitErr)
     {
         cl_uint currentJob = 0;
         cl_int result = CL_SUCCESS;

 #if defined(__APPLE__) && defined(__arm__)
         // On most platforms which support denorm, default is FTZ off. However,
         // on some hardware where the reference is computed, default might be
         // flush denorms to zero e.g. arm. This creates issues in result
         // verification. Since spec allows the implementation to either flush or
         // not flush denorms to zero, an implementation may choose not be flush
         // i.e. return denorm result whereas reference result may be zero
         // (flushed denorm). Hence we need to disable denorm flushing on host
         // side where reference is being computed to make sure we get
         // non-flushed reference result. If implementation returns flushed
         // result, we correctly take care of that in verification code.
         FPU_mode_type oldMode;
         DisableFTZ(&oldMode);
 #endif
         for (currentJob = 0; currentJob < count; currentJob++)
             if ((result = func_ptr(currentJob, 0, userInfo)))
             {
 #if defined(__APPLE__) && defined(__arm__)
                 // Restore FP state before leaving
                 RestoreFPState(&oldMode);
 #endif
                 return result;
             }

 #if defined(__APPLE__) && defined(__arm__)
         // Restore FP state before leaving
         RestoreFPState(&oldMode);
 #endif

         return CL_SUCCESS;
     }

     if (count >= MAX_COUNT)
     {
         log_error(
             "Error: ThreadPool_Do count %d >= max threadpool count of %d\n",
             count, MAX_COUNT);
         return -1;
     }

     // Enter critical region
 #if defined(_WIN32)
     EnterCriticalSection(gThreadPoolLock);
 #else // !_WIN32
     if ((err = pthread_mutex_lock(&gThreadPoolLock)))
     {
         switch (err)
         {
             case EDEADLK:
                 log_error(
                     "Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do "
                     "is not designed to work recursively!\n");
                 break;
             case EINVAL:
                 log_error("Error EINVAL returned in ThreadPool_Do(). How did "
                           "we end up with an invalid gThreadPoolLock?\n");
                 break;
             default: break;
         }
         return err;
     }
 #endif // !_WIN32

     // Start modifying the job state observable by worker threads
 #if defined(_WIN32)
     EnterCriticalSection(cond_lock);
 #else // !_WIN32
     if ((err = pthread_mutex_lock(&cond_lock)))
     {
         log_error("Error %d from pthread_mutex_lock. Unable to wake up work "
                   "threads. ThreadPool_Do failed.\n",
                   err);
         goto exit;
     }
 #endif // !_WIN32

     // Make sure the last thread done in the work pool doesn't signal us to wake
     // before we get to the point where we are supposed to wait
     //  That would cause a deadlock.
 #if !defined(_WIN32)
     if ((err = pthread_mutex_lock(&caller_cond_lock)))
     {
         log_error("Error %d from pthread_mutex_lock. Unable to block for work "
                   "to finish. ThreadPool_Do failed.\n",
                   err);
         goto exit;
     }
 #endif // !_WIN32

     // Prime the worker threads to get going
     jobError = CL_SUCCESS;
     gRunCount = gJobCount = count;
     gFunc_ptr = func_ptr;
     gUserInfo = userInfo;

 #if defined(_WIN32)
     ResetEvent(caller_event);
     _WakeAllConditionVariable(cond_var);
     LeaveCriticalSection(cond_lock);
 #else // !_WIN32
     if ((err = pthread_cond_broadcast(&cond_var)))
     {
         log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
                   "work threads. ThreadPool_Do failed.\n",
                   err);
         goto exit;
     }
     if ((err = pthread_mutex_unlock(&cond_lock)))
     {
         log_error("Error %d from pthread_mutex_unlock. Unable to wake up work "
                   "threads. ThreadPool_Do failed.\n",
                   err);
         goto exit;
     }
 #endif // !_WIN32

     // block until they are done.  It would be slightly more efficient to do
     // some of the work here though.
     do
     {
 #if defined(_WIN32)
         WaitForSingleObject(caller_event, INFINITE);
 #else // !_WIN32
         if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
         {
             log_error("Error %d from pthread_cond_wait. Unable to block for "
                       "work to finish. ThreadPool_Do failed.\n",
                       err);
             pthread_mutex_unlock(&caller_cond_lock);
             goto exit;
         }
 #endif // !_WIN32
     } while (gRunning);
 #if !defined(_WIN32)
     if ((err = pthread_mutex_unlock(&caller_cond_lock)))
     {
         log_error("Error %d from pthread_mutex_unlock. Unable to block for "
                   "work to finish. ThreadPool_Do failed.\n",
                   err);
         goto exit;
     }
 #endif // !_WIN32

     err = jobError;

 exit:
     // exit critical region
 #if defined(_WIN32)
     LeaveCriticalSection(gThreadPoolLock);
 #else // !_WIN32
     newErr = pthread_mutex_unlock(&gThreadPoolLock);
     if (newErr)
     {
         log_error("Error %d from pthread_mutex_unlock. Unable to exit critical "
                   "region. ThreadPool_Do failed.\n",
                   newErr);
         return err;
     }
 #endif // !_WIN32

     return err;
 }

 cl_uint GetThreadCount(void)
 {
     // Lazily set up our threads
 #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
     cl_int err = !_InitOnceExecuteOnce(&threadpool_init_control,
                                        _ThreadPool_Init, NULL, NULL);
 #elif defined(_WIN32)
     if (threadpool_init_control == 0)
     {
 #warning This is buggy and race prone.  Find a better way.
         ThreadPool_Init();
         threadpool_init_control = 1;
     }
 #else
     cl_int err = pthread_once(&threadpool_init_control, ThreadPool_Init);
     if (err)
     {
         log_error("Error %d from pthread_once. Unable to init threads. "
                   "ThreadPool_Do failed.\n",
                   err);
         return err;
     }
 #endif // !_WIN32

     if (gThreadCount < 1) return 1;

     return gThreadCount;
 }

 #else

 #ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
 #error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section.
 #endif
 //
 // We require multithreading in parts of the test as a means of simultaneously
 // testing reentrancy requirements of OpenCL API, while also checking
 //
 // A sample single threaded implementation follows, for documentation /
 // bootstrapping purposes. It is not okay to use this for conformance testing!!!
 //
 // Exception:  If your operating system does not support multithreaded execution
 // of any kind, then you may use this code.
 //

 cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
 {
     cl_uint r = *a;

     // since this fallback code path is not multithreaded, we just do a regular
     // add here. If your operating system supports memory-barrier-atomics, use
     // those here.
     *a = r + b;

     return r;
 }

 // Blocking API that farms out count jobs to a thread pool.
 // It may return with some work undone if func_ptr() returns a non-zero
 // result.
 cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
 {
     cl_uint currentJob = 0;
     cl_int result = CL_SUCCESS;

 #ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
     // THIS FUNCTION IS NOT INTENDED FOR USE!!
     log_error("ERROR:  Test must be multithreaded!\n");
     exit(-1);
 #else
     static int spewCount = 0;

     if (0 == spewCount)
     {
         log_info("\nWARNING:  The operating system is claimed not to support "
                  "threads of any sort. Running single threaded.\n");
         spewCount = 1;
     }
 #endif

     // The multithreaded code should mimic this behavior:
     for (currentJob = 0; currentJob < count; currentJob++)
         if ((result = func_ptr(currentJob, 0, userInfo))) return result;

     return CL_SUCCESS;
 }

 cl_uint GetThreadCount(void) { return 1; }

 void SetThreadCount(int count)
 {
     if (count > 1) log_info("WARNING: SetThreadCount(%d) ignored\n", count);
 }

 #endif