| // |
| // Copyright (c) 2017 The Khronos Group Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| #include "ThreadPool.h" |
| #include "errorHelpers.h" |
| #include "fpcontrol.h" |
| #include <stdio.h> |
| #include <stdlib.h> |
| |
| #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32) |
| // or any other POSIX system |
| |
| #if defined(_WIN32) |
| #include <windows.h> |
| #if defined(_MSC_VER) |
| #include <intrin.h> |
| #endif |
| #include "mingw_compat.h" |
| #include <process.h> |
| #else // !_WIN32 |
| #include <pthread.h> |
| #include <unistd.h> |
| #include <sys/errno.h> |
| #ifdef __linux__ |
| #include <sched.h> |
| #endif |
| #endif // !_WIN32 |
| |
| // declarations |
| #ifdef _WIN32 |
| void ThreadPool_WorkerFunc(void *p); |
| #else |
| void *ThreadPool_WorkerFunc(void *p); |
| #endif |
| void ThreadPool_Init(void); |
| void ThreadPool_Exit(void); |
| |
| #if defined(__MINGW32__) |
| // Mutex for implementing super heavy atomic operations if you don't have GCC or |
| // MSVC |
| CRITICAL_SECTION gAtomicLock; |
| #elif defined(__GNUC__) || defined(_MSC_VER) |
| #else |
| pthread_mutex_t gAtomicLock; |
| #endif |
| |
| // Atomic add operator with mem barrier. Mem barrier needed to protect state |
| // modified by the worker functions. |
| cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b) |
| { |
| #if defined(__MINGW32__) |
| // No atomics on Mingw32 |
| EnterCriticalSection(&gAtomicLock); |
| cl_int old = *a; |
| *a = old + b; |
| LeaveCriticalSection(&gAtomicLock); |
| return old; |
| #elif defined(__GNUC__) |
| // GCC extension: |
| // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins |
| return __sync_fetch_and_add(a, b); |
| // do we need __sync_synchronize() here, too? GCC docs are unclear whether |
| // __sync_fetch_and_add does a synchronize |
| #elif defined(_MSC_VER) |
| return (cl_int)_InterlockedExchangeAdd((volatile LONG *)a, (LONG)b); |
| #else |
| #warning Please add a atomic add implementation here, with memory barrier. Fallback code is slow. |
| if (pthread_mutex_lock(&gAtomicLock)) |
| log_error("Atomic operation failed. pthread_mutex_lock(&gAtomicLock) " |
| "returned an error\n"); |
| cl_int old = *a; |
| *a = old + b; |
| if (pthread_mutex_unlock(&gAtomicLock)) |
| log_error("Failed to release gAtomicLock. Further atomic operations " |
| "may deadlock!\n"); |
| return old; |
| #endif |
| } |
| |
| #if defined(_WIN32) |
| // Uncomment the following line if Windows XP support is not required. |
| // #define HAS_INIT_ONCE_EXECUTE_ONCE 1 |
| |
| #if defined(HAS_INIT_ONCE_EXECUTE_ONCE) |
| #define _INIT_ONCE INIT_ONCE |
| #define _PINIT_ONCE PINIT_ONCE |
| #define _InitOnceExecuteOnce InitOnceExecuteOnce |
| #else // !HAS_INIT_ONCE_EXECUTE_ONCE |
| |
| typedef volatile LONG _INIT_ONCE; |
| typedef _INIT_ONCE *_PINIT_ONCE; |
| typedef BOOL(CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *); |
| |
| #define _INIT_ONCE_UNINITIALIZED 0 |
| #define _INIT_ONCE_IN_PROGRESS 1 |
| #define _INIT_ONCE_DONE 2 |
| |
| static BOOL _InitOnceExecuteOnce(_PINIT_ONCE InitOnce, _PINIT_ONCE_FN InitFn, |
| PVOID Parameter, LPVOID *Context) |
| { |
| while (*InitOnce != _INIT_ONCE_DONE) |
| { |
| if (*InitOnce != _INIT_ONCE_IN_PROGRESS |
| && _InterlockedCompareExchange(InitOnce, _INIT_ONCE_IN_PROGRESS, |
| _INIT_ONCE_UNINITIALIZED) |
| == _INIT_ONCE_UNINITIALIZED) |
| { |
| InitFn(InitOnce, Parameter, Context); |
| *InitOnce = _INIT_ONCE_DONE; |
| return TRUE; |
| } |
| Sleep(1); |
| } |
| return TRUE; |
| } |
| #endif // !HAS_INIT_ONCE_EXECUTE_ONCE |
| |
| // Uncomment the following line if Windows XP support is not required. |
| // #define HAS_CONDITION_VARIABLE 1 |
| |
| #if defined(HAS_CONDITION_VARIABLE) |
| #define _CONDITION_VARIABLE CONDITION_VARIABLE |
| #define _InitializeConditionVariable InitializeConditionVariable |
| #define _SleepConditionVariableCS SleepConditionVariableCS |
| #define _WakeAllConditionVariable WakeAllConditionVariable |
| #else // !HAS_CONDITION_VARIABLE |
| typedef struct |
| { |
| HANDLE mEvent; // Used to park the thread. |
| // Used to protect mWaiters, mGeneration and mReleaseCount: |
| CRITICAL_SECTION mLock[1]; |
| volatile cl_int mWaiters; // Number of threads waiting on this cond var. |
| volatile cl_int mGeneration; // Wait generation count. |
| volatile cl_int mReleaseCount; // Number of releases to execute before |
| // reseting the event. |
| } _CONDITION_VARIABLE; |
| |
| typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE; |
| |
| static void _InitializeConditionVariable(_PCONDITION_VARIABLE cond_var) |
| { |
| cond_var->mEvent = CreateEvent(NULL, TRUE, FALSE, NULL); |
| InitializeCriticalSection(cond_var->mLock); |
| cond_var->mWaiters = 0; |
| cond_var->mGeneration = 0; |
| #if !defined(NDEBUG) |
| cond_var->mReleaseCount = 0; |
| #endif // !NDEBUG |
| } |
| |
| static void _SleepConditionVariableCS(_PCONDITION_VARIABLE cond_var, |
| PCRITICAL_SECTION cond_lock, |
| DWORD ignored) |
| { |
| EnterCriticalSection(cond_var->mLock); |
| cl_int generation = cond_var->mGeneration; |
| ++cond_var->mWaiters; |
| LeaveCriticalSection(cond_var->mLock); |
| LeaveCriticalSection(cond_lock); |
| |
| while (TRUE) |
| { |
| WaitForSingleObject(cond_var->mEvent, INFINITE); |
| EnterCriticalSection(cond_var->mLock); |
| BOOL done = |
| cond_var->mReleaseCount > 0 && cond_var->mGeneration != generation; |
| LeaveCriticalSection(cond_var->mLock); |
| if (done) |
| { |
| break; |
| } |
| } |
| |
| EnterCriticalSection(cond_lock); |
| EnterCriticalSection(cond_var->mLock); |
| if (--cond_var->mReleaseCount == 0) |
| { |
| ResetEvent(cond_var->mEvent); |
| } |
| --cond_var->mWaiters; |
| LeaveCriticalSection(cond_var->mLock); |
| } |
| |
| static void _WakeAllConditionVariable(_PCONDITION_VARIABLE cond_var) |
| { |
| EnterCriticalSection(cond_var->mLock); |
| if (cond_var->mWaiters > 0) |
| { |
| ++cond_var->mGeneration; |
| cond_var->mReleaseCount = cond_var->mWaiters; |
| SetEvent(cond_var->mEvent); |
| } |
| LeaveCriticalSection(cond_var->mLock); |
| } |
| #endif // !HAS_CONDITION_VARIABLE |
| #endif // _WIN32 |
| |
| #define MAX_COUNT (1 << 29) |
| |
| // Global state to coordinate whether the threads have been launched |
| // successfully or not |
| #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600) |
| static _INIT_ONCE threadpool_init_control; |
| #elif defined(_WIN32) // MingW of XP |
| static int threadpool_init_control; |
| #else // Posix platforms |
| pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT; |
| #endif |
| cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch |
| |
| // critical region lock around ThreadPool_Do. We can only run one ThreadPool_Do |
| // at a time, because we are too lazy to set up a queue here, and don't expect |
| // to need one. |
| #if defined(_WIN32) |
| CRITICAL_SECTION gThreadPoolLock[1]; |
| #else // !_WIN32 |
| pthread_mutex_t gThreadPoolLock; |
| #endif // !_WIN32 |
| |
| // Condition variable to park ThreadPool threads when not working |
| #if defined(_WIN32) |
| CRITICAL_SECTION cond_lock[1]; |
| _CONDITION_VARIABLE cond_var[1]; |
| #else // !_WIN32 |
| pthread_mutex_t cond_lock; |
| pthread_cond_t cond_var; |
| #endif // !_WIN32 |
| |
| // Condition variable state. How many iterations on the function left to run, |
| // set to CL_INT_MAX to cause worker threads to exit. Note: this value might |
| // go negative. |
| volatile cl_int gRunCount = 0; |
| |
| // State that only changes when the threadpool is not working. |
| volatile TPFuncPtr gFunc_ptr = NULL; |
| volatile void *gUserInfo = NULL; |
| volatile cl_int gJobCount = 0; |
| |
| // State that may change while the thread pool is working |
| volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole |
| |
| // Condition variable to park caller while waiting |
| #if defined(_WIN32) |
| HANDLE caller_event; |
| #else // !_WIN32 |
| pthread_mutex_t caller_cond_lock; |
| pthread_cond_t caller_cond_var; |
| #endif // !_WIN32 |
| |
| // # of threads intended to be running. Running threads will decrement this |
| // as they discover they've run out of work to do. |
| volatile cl_int gRunning = 0; |
| |
| // The total number of threads launched. |
| volatile cl_int gThreadCount = 0; |
| #ifdef _WIN32 |
| void ThreadPool_WorkerFunc(void *p) |
| #else |
| void *ThreadPool_WorkerFunc(void *p) |
| #endif |
| { |
| cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1); |
| cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1); |
| // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning ); |
| |
| while (MAX_COUNT > item) |
| { |
| cl_int err; |
| |
| // check for more work to do |
| if (0 >= item) |
| { |
| // log_info("Thread %d has run out of work.\n", threadID); |
| |
| // No work to do. Attempt to block waiting for work |
| #if defined(_WIN32) |
| EnterCriticalSection(cond_lock); |
| #else // !_WIN32 |
| if ((err = pthread_mutex_lock(&cond_lock))) |
| { |
| log_error( |
| "Error %d from pthread_mutex_lock. Worker %d unable to " |
| "block waiting for work. ThreadPool_WorkerFunc failed.\n", |
| err, threadID); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| |
| cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1); |
| // log_info("ThreadPool_WorkerFunc: gRunning = %d\n", |
| // remaining - 1); |
| if (1 == remaining) |
| { // last thread out signal the main thread to wake up |
| #if defined(_WIN32) |
| SetEvent(caller_event); |
| #else // !_WIN32 |
| if ((err = pthread_mutex_lock(&caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_lock. Unable to " |
| "wake caller.\n", |
| err); |
| goto exit; |
| } |
| if ((err = pthread_cond_broadcast(&caller_cond_var))) |
| { |
| log_error( |
| "Error %d from pthread_cond_broadcast. Unable to wake " |
| "up main thread. ThreadPool_WorkerFunc failed.\n", |
| err); |
| goto exit; |
| } |
| if ((err = pthread_mutex_unlock(&caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_lock. Unable to " |
| "wake caller.\n", |
| err); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| } |
| |
| // loop in case we are woken only to discover that some other thread |
| // already did all the work |
| while (0 >= item) |
| { |
| #if defined(_WIN32) |
| _SleepConditionVariableCS(cond_var, cond_lock, INFINITE); |
| #else // !_WIN32 |
| if ((err = pthread_cond_wait(&cond_var, &cond_lock))) |
| { |
| log_error( |
| "Error %d from pthread_cond_wait. Unable to block for " |
| "waiting for work. ThreadPool_WorkerFunc failed.\n", |
| err); |
| pthread_mutex_unlock(&cond_lock); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| |
| // try again to get a valid item id |
| item = ThreadPool_AtomicAdd(&gRunCount, -1); |
| if (MAX_COUNT <= item) // exit if we are done |
| { |
| #if defined(_WIN32) |
| LeaveCriticalSection(cond_lock); |
| #else // !_WIN32 |
| pthread_mutex_unlock(&cond_lock); |
| #endif // !_WIN32 |
| goto exit; |
| } |
| } |
| |
| ThreadPool_AtomicAdd(&gRunning, 1); |
| // log_info("Thread %d has found work.\n", threadID); |
| |
| #if defined(_WIN32) |
| LeaveCriticalSection(cond_lock); |
| #else // !_WIN32 |
| if ((err = pthread_mutex_unlock(&cond_lock))) |
| { |
| log_error( |
| "Error %d from pthread_mutex_unlock. Unable to block for " |
| "waiting for work. ThreadPool_WorkerFunc failed.\n", |
| err); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| } |
| |
| // we have a valid item, so do the work |
| // but only if we haven't already encountered an error |
| if (CL_SUCCESS == jobError) |
| { |
| // log_info("Thread %d doing job %d\n", threadID, item - 1); |
| |
| #if defined(__APPLE__) && defined(__arm__) |
| // On most platforms which support denorm, default is FTZ off. |
| // However, on some hardware where the reference is computed, |
| // default might be flush denorms to zero e.g. arm. This creates |
| // issues in result verification. Since spec allows the |
| // implementation to either flush or not flush denorms to zero, an |
| // implementation may choose not be flush i.e. return denorm result |
| // whereas reference result may be zero (flushed denorm). Hence we |
| // need to disable denorm flushing on host side where reference is |
| // being computed to make sure we get non-flushed reference result. |
| // If implementation returns flushed result, we correctly take care |
| // of that in verification code. |
| FPU_mode_type oldMode; |
| DisableFTZ(&oldMode); |
| #endif |
| |
| // Call the user's function with this item ID |
| err = gFunc_ptr(item - 1, threadID, (void *)gUserInfo); |
| #if defined(__APPLE__) && defined(__arm__) |
| // Restore FP state |
| RestoreFPState(&oldMode); |
| #endif |
| |
| if (err) |
| { |
| #if (__MINGW32__) |
| EnterCriticalSection(&gAtomicLock); |
| if (jobError == CL_SUCCESS) jobError = err; |
| gRunCount = 0; |
| LeaveCriticalSection(&gAtomicLock); |
| #elif defined(__GNUC__) |
| // GCC extension: |
| // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins |
| // set the new error if we are the first one there. |
| __sync_val_compare_and_swap(&jobError, CL_SUCCESS, err); |
| |
| // drop run count to 0 |
| gRunCount = 0; |
| __sync_synchronize(); |
| #elif defined(_MSC_VER) |
| // set the new error if we are the first one there. |
| _InterlockedCompareExchange((volatile LONG *)&jobError, err, |
| CL_SUCCESS); |
| |
| // drop run count to 0 |
| gRunCount = 0; |
| _mm_mfence(); |
| #else |
| if (pthread_mutex_lock(&gAtomicLock)) |
| log_error( |
| "Atomic operation failed. " |
| "pthread_mutex_lock(&gAtomicLock) returned an error\n"); |
| if (jobError == CL_SUCCESS) jobError = err; |
| gRunCount = 0; |
| if (pthread_mutex_unlock(&gAtomicLock)) |
| log_error("Failed to release gAtomicLock. Further atomic " |
| "operations may deadlock\n"); |
| #endif |
| } |
| } |
| |
| // get the next item |
| item = ThreadPool_AtomicAdd(&gRunCount, -1); |
| } |
| |
| exit: |
| log_info("ThreadPool: thread %d exiting.\n", threadID); |
| ThreadPool_AtomicAdd(&gThreadCount, -1); |
| #if !defined(_WIN32) |
| return NULL; |
| #endif |
| } |
| |
| // SetThreadCount() may be used to artifically set the number of worker threads |
| // If the value is 0 (the default) the number of threads will be determined |
| // based on the number of CPU cores. If it is a unicore machine, then 2 will be |
| // used, so that we still get some testing for thread safety. |
| // |
| // If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set then |
| // the code will run single threaded, but will report an error to indicate that |
| // the test is invalid. This option is intended for debugging purposes only. It |
| // is suggested as a convention that test apps set the thread count to 1 in |
| // response to the -m flag. |
| // |
| // SetThreadCount() must be called before the first call to GetThreadCount() or |
| // ThreadPool_Do(), otherwise the behavior is indefined. |
| void SetThreadCount(int count) |
| { |
| if (threadPoolInitErr == CL_SUCCESS) |
| { |
| log_error("Error: It is illegal to set the thread count after the " |
| "first call to ThreadPool_Do or GetThreadCount\n"); |
| abort(); |
| } |
| |
| gThreadCount = count; |
| } |
| |
| void ThreadPool_Init(void) |
| { |
| cl_int i; |
| int err; |
| volatile cl_uint threadID = 0; |
| |
| // Check for manual override of multithreading code. We add this for better |
| // debuggability. |
| if (getenv("CL_TEST_SINGLE_THREADED")) |
| { |
| log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. " |
| "Running single threaded.\n*** TEST IS INVALID! ***\n"); |
| gThreadCount = 1; |
| return; |
| } |
| |
| // Figure out how many threads to run -- check first for non-zero to give |
| // the implementation the chance |
| if (0 == gThreadCount) |
| { |
| #if defined(_MSC_VER) || defined(__MINGW64__) |
| PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL; |
| DWORD length = 0; |
| |
| GetLogicalProcessorInformation(NULL, &length); |
| buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length); |
| if (buffer != NULL) |
| { |
| if (GetLogicalProcessorInformation(buffer, &length) == TRUE) |
| { |
| PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; |
| while ( |
| ptr |
| < &buffer[length |
| / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)]) |
| { |
| if (ptr->Relationship == RelationProcessorCore) |
| { |
| // Count the number of bits in ProcessorMask (number of |
| // logical cores) |
| ULONG mask = ptr->ProcessorMask; |
| while (mask) |
| { |
| ++gThreadCount; |
| mask &= mask - 1; // Remove 1 bit at a time |
| } |
| } |
| ++ptr; |
| } |
| } |
| free(buffer); |
| } |
| #elif defined(__MINGW32__) |
| { |
| #warning How about this, instead of hard coding it to 2? |
| SYSTEM_INFO sysinfo; |
| GetSystemInfo(&sysinfo); |
| gThreadCount = sysinfo.dwNumberOfProcessors; |
| } |
| #elif defined(__linux__) && !defined(__ANDROID__) |
| cpu_set_t affinity; |
| if (0 == sched_getaffinity(0, sizeof(cpu_set_t), &affinity)) |
| { |
| #if !(defined(CPU_COUNT)) |
| gThreadCount = 1; |
| #else |
| gThreadCount = CPU_COUNT(&affinity); |
| #endif |
| } |
| else |
| { |
| // Hopefully your system returns logical cpus here, as does MacOS X |
| gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF); |
| } |
| #else /* !_WIN32 */ |
| // Hopefully your system returns logical cpus here, as does MacOS X |
| gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF); |
| #endif // !_WIN32 |
| |
| // Multithreaded tests are required to run multithreaded even on unicore |
| // systems so as to test thread safety |
| if (1 == gThreadCount) gThreadCount = 2; |
| } |
| |
| // When working in 32 bit limit the thread number to 12 |
| // This fix was made due to memory issues in integer_ops test |
| // When running integer_ops, the test opens as many threads as the |
| // machine has and each thread allocates a fixed amount of memory |
| // When running this test on dual socket machine in 32-bit, the |
| // process memory is not sufficient and the test fails |
| #if defined(_WIN32) && !defined(_M_X64) |
| if (gThreadCount > 12) |
| { |
| gThreadCount = 12; |
| } |
| #endif |
| |
| // Allow the app to set thread count to <0 for debugging purposes. |
| // This will cause the test to run single threaded. |
| if (gThreadCount < 2) |
| { |
| log_error("ERROR: Running single threaded because thread count < 2. " |
| "\n*** TEST IS INVALID! ***\n"); |
| gThreadCount = 1; |
| return; |
| } |
| |
| #if defined(_WIN32) |
| InitializeCriticalSection(gThreadPoolLock); |
| InitializeCriticalSection(cond_lock); |
| _InitializeConditionVariable(cond_var); |
| caller_event = CreateEvent(NULL, FALSE, FALSE, NULL); |
| #elif defined(__GNUC__) |
| // Dont rely on PTHREAD_MUTEX_INITIALIZER for intialization of a mutex since |
| // it might cause problem with some flavors of gcc compilers. |
| pthread_cond_init(&cond_var, NULL); |
| pthread_mutex_init(&cond_lock, NULL); |
| pthread_cond_init(&caller_cond_var, NULL); |
| pthread_mutex_init(&caller_cond_lock, NULL); |
| pthread_mutex_init(&gThreadPoolLock, NULL); |
| #endif |
| |
| #if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__)) |
| pthread_mutex_initialize(gAtomicLock); |
| #elif defined(__MINGW32__) |
| InitializeCriticalSection(&gAtomicLock); |
| #endif |
| // Make sure the last thread done in the work pool doesn't signal us to wake |
| // before we get to the point where we are supposed to wait |
| // That would cause a deadlock. |
| #if !defined(_WIN32) |
| if ((err = pthread_mutex_lock(&caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_lock. Unable to block for work " |
| "to finish. ThreadPool_Init failed.\n", |
| err); |
| gThreadCount = 1; |
| return; |
| } |
| #endif // !_WIN32 |
| |
| gRunning = gThreadCount; |
| // init threads |
| for (i = 0; i < gThreadCount; i++) |
| { |
| #if defined(_WIN32) |
| uintptr_t handle = |
| _beginthread(ThreadPool_WorkerFunc, 0, (void *)&threadID); |
| err = (handle == 0); |
| #else // !_WIN32 |
| pthread_t tid = 0; |
| err = pthread_create(&tid, NULL, ThreadPool_WorkerFunc, |
| (void *)&threadID); |
| #endif // !_WIN32 |
| if (err) |
| { |
| log_error("Error %d launching thread %d\n", err, i); |
| threadPoolInitErr = err; |
| gThreadCount = i; |
| break; |
| } |
| } |
| |
| atexit(ThreadPool_Exit); |
| |
| // block until they are done launching. |
| do |
| { |
| #if defined(_WIN32) |
| WaitForSingleObject(caller_event, INFINITE); |
| #else // !_WIN32 |
| if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_cond_wait. Unable to block for " |
| "work to finish. ThreadPool_Init failed.\n", |
| err); |
| pthread_mutex_unlock(&caller_cond_lock); |
| return; |
| } |
| #endif // !_WIN32 |
| } while (gRunCount != -gThreadCount); |
| #if !defined(_WIN32) |
| if ((err = pthread_mutex_unlock(&caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_unlock. Unable to block for " |
| "work to finish. ThreadPool_Init failed.\n", |
| err); |
| return; |
| } |
| #endif // !_WIN32 |
| |
| threadPoolInitErr = CL_SUCCESS; |
| } |
| |
#if defined(_MSC_VER)
// Adapter that gives ThreadPool_Init the callback signature expected by
// _InitOnceExecuteOnce. None of the callback arguments are used and the
// callback always reports success.
static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter,
                                      PVOID *lpContex)
{
    (void)InitOnce;
    (void)Parameter;
    (void)lpContex;

    ThreadPool_Init();

    return TRUE;
}
#endif
| |
| void ThreadPool_Exit(void) |
| { |
| int err, count; |
| gRunCount = CL_INT_MAX; |
| |
| #if defined(__GNUC__) |
| // GCC extension: |
| // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins |
| __sync_synchronize(); |
| #elif defined(_MSC_VER) |
| _mm_mfence(); |
| #else |
| #warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed |
| #endif |
| |
| // spin waiting for threads to die |
| for (count = 0; 0 != gThreadCount && count < 1000; count++) |
| { |
| #if defined(_WIN32) |
| _WakeAllConditionVariable(cond_var); |
| Sleep(1); |
| #else // !_WIN32 |
| if ((err = pthread_cond_broadcast(&cond_var))) |
| { |
| log_error("Error %d from pthread_cond_broadcast. Unable to wake up " |
| "work threads. ThreadPool_Exit failed.\n", |
| err); |
| break; |
| } |
| usleep(1000); |
| #endif // !_WIN32 |
| } |
| |
| if (gThreadCount) |
| log_error("Error: Thread pool timed out after 1 second with %d threads " |
| "still active.\n", |
| gThreadCount); |
| else |
| log_info("Thread pool exited in a orderly fashion.\n"); |
| } |
| |
| |
| // Blocking API that farms out count jobs to a thread pool. |
| // It may return with some work undone if func_ptr() returns a non-zero |
| // result. |
| // |
| // This function obviously has its shortcommings. Only one call to ThreadPool_Do |
| // can be running at a time. It is not intended for general purpose use. |
| // If clEnqueueNativeKernelFn, out of order queues and a CL_DEVICE_TYPE_CPU were |
| // all available then it would make more sense to use those features. |
| cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo) |
| { |
| cl_int newErr; |
| cl_int err = 0; |
| // Lazily set up our threads |
| #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600) |
| err = !_InitOnceExecuteOnce(&threadpool_init_control, _ThreadPool_Init, |
| NULL, NULL); |
| #elif defined(_WIN32) |
| if (threadpool_init_control == 0) |
| { |
| #warning This is buggy and race prone. Find a better way. |
| ThreadPool_Init(); |
| threadpool_init_control = 1; |
| } |
| #else // posix platform |
| err = pthread_once(&threadpool_init_control, ThreadPool_Init); |
| if (err) |
| { |
| log_error("Error %d from pthread_once. Unable to init threads. " |
| "ThreadPool_Do failed.\n", |
| err); |
| return err; |
| } |
| #endif |
| // Single threaded code to handle case where threadpool wasn't allocated or |
| // was disabled by environment variable |
| if (threadPoolInitErr) |
| { |
| cl_uint currentJob = 0; |
| cl_int result = CL_SUCCESS; |
| |
| #if defined(__APPLE__) && defined(__arm__) |
| // On most platforms which support denorm, default is FTZ off. However, |
| // on some hardware where the reference is computed, default might be |
| // flush denorms to zero e.g. arm. This creates issues in result |
| // verification. Since spec allows the implementation to either flush or |
| // not flush denorms to zero, an implementation may choose not be flush |
| // i.e. return denorm result whereas reference result may be zero |
| // (flushed denorm). Hence we need to disable denorm flushing on host |
| // side where reference is being computed to make sure we get |
| // non-flushed reference result. If implementation returns flushed |
| // result, we correctly take care of that in verification code. |
| FPU_mode_type oldMode; |
| DisableFTZ(&oldMode); |
| #endif |
| for (currentJob = 0; currentJob < count; currentJob++) |
| if ((result = func_ptr(currentJob, 0, userInfo))) |
| { |
| #if defined(__APPLE__) && defined(__arm__) |
| // Restore FP state before leaving |
| RestoreFPState(&oldMode); |
| #endif |
| return result; |
| } |
| |
| #if defined(__APPLE__) && defined(__arm__) |
| // Restore FP state before leaving |
| RestoreFPState(&oldMode); |
| #endif |
| |
| return CL_SUCCESS; |
| } |
| |
| if (count >= MAX_COUNT) |
| { |
| log_error( |
| "Error: ThreadPool_Do count %d >= max threadpool count of %d\n", |
| count, MAX_COUNT); |
| return -1; |
| } |
| |
| // Enter critical region |
| #if defined(_WIN32) |
| EnterCriticalSection(gThreadPoolLock); |
| #else // !_WIN32 |
| if ((err = pthread_mutex_lock(&gThreadPoolLock))) |
| { |
| switch (err) |
| { |
| case EDEADLK: |
| log_error( |
| "Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do " |
| "is not designed to work recursively!\n"); |
| break; |
| case EINVAL: |
| log_error("Error EINVAL returned in ThreadPool_Do(). How did " |
| "we end up with an invalid gThreadPoolLock?\n"); |
| break; |
| default: break; |
| } |
| return err; |
| } |
| #endif // !_WIN32 |
| |
| // Start modifying the job state observable by worker threads |
| #if defined(_WIN32) |
| EnterCriticalSection(cond_lock); |
| #else // !_WIN32 |
| if ((err = pthread_mutex_lock(&cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_lock. Unable to wake up work " |
| "threads. ThreadPool_Do failed.\n", |
| err); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| |
| // Make sure the last thread done in the work pool doesn't signal us to wake |
| // before we get to the point where we are supposed to wait |
| // That would cause a deadlock. |
| #if !defined(_WIN32) |
| if ((err = pthread_mutex_lock(&caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_lock. Unable to block for work " |
| "to finish. ThreadPool_Do failed.\n", |
| err); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| |
| // Prime the worker threads to get going |
| jobError = CL_SUCCESS; |
| gRunCount = gJobCount = count; |
| gFunc_ptr = func_ptr; |
| gUserInfo = userInfo; |
| |
| #if defined(_WIN32) |
| ResetEvent(caller_event); |
| _WakeAllConditionVariable(cond_var); |
| LeaveCriticalSection(cond_lock); |
| #else // !_WIN32 |
| if ((err = pthread_cond_broadcast(&cond_var))) |
| { |
| log_error("Error %d from pthread_cond_broadcast. Unable to wake up " |
| "work threads. ThreadPool_Do failed.\n", |
| err); |
| goto exit; |
| } |
| if ((err = pthread_mutex_unlock(&cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_unlock. Unable to wake up work " |
| "threads. ThreadPool_Do failed.\n", |
| err); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| |
| // block until they are done. It would be slightly more efficient to do |
| // some of the work here though. |
| do |
| { |
| #if defined(_WIN32) |
| WaitForSingleObject(caller_event, INFINITE); |
| #else // !_WIN32 |
| if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_cond_wait. Unable to block for " |
| "work to finish. ThreadPool_Do failed.\n", |
| err); |
| pthread_mutex_unlock(&caller_cond_lock); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| } while (gRunning); |
| #if !defined(_WIN32) |
| if ((err = pthread_mutex_unlock(&caller_cond_lock))) |
| { |
| log_error("Error %d from pthread_mutex_unlock. Unable to block for " |
| "work to finish. ThreadPool_Do failed.\n", |
| err); |
| goto exit; |
| } |
| #endif // !_WIN32 |
| |
| err = jobError; |
| |
| exit: |
| // exit critical region |
| #if defined(_WIN32) |
| LeaveCriticalSection(gThreadPoolLock); |
| #else // !_WIN32 |
| newErr = pthread_mutex_unlock(&gThreadPoolLock); |
| if (newErr) |
| { |
| log_error("Error %d from pthread_mutex_unlock. Unable to exit critical " |
| "region. ThreadPool_Do failed.\n", |
| newErr); |
| return err; |
| } |
| #endif // !_WIN32 |
| |
| return err; |
| } |
| |
| cl_uint GetThreadCount(void) |
| { |
| // Lazily set up our threads |
| #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600) |
| cl_int err = !_InitOnceExecuteOnce(&threadpool_init_control, |
| _ThreadPool_Init, NULL, NULL); |
| #elif defined(_WIN32) |
| if (threadpool_init_control == 0) |
| { |
| #warning This is buggy and race prone. Find a better way. |
| ThreadPool_Init(); |
| threadpool_init_control = 1; |
| } |
| #else |
| cl_int err = pthread_once(&threadpool_init_control, ThreadPool_Init); |
| if (err) |
| { |
| log_error("Error %d from pthread_once. Unable to init threads. " |
| "ThreadPool_Do failed.\n", |
| err); |
| return err; |
| } |
| #endif // !_WIN32 |
| |
| if (gThreadCount < 1) return 1; |
| |
| return gThreadCount; |
| } |
| |
| #else |
| |
| #ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS |
| #error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section. |
| #endif |
| // |
| // We require multithreading in parts of the test as a means of simultaneously |
| // testing reentrancy requirements of OpenCL API, while also checking |
| // |
| // A sample single threaded implementation follows, for documentation / |
| // bootstrapping purposes. It is not okay to use this for conformance testing!!! |
| // |
| // Exception: If your operating system does not support multithreaded execution |
| // of any kind, then you may use this code. |
| // |
| |
| cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b) |
| { |
| cl_uint r = *a; |
| |
| // since this fallback code path is not multithreaded, we just do a regular |
| // add here. If your operating system supports memory-barrier-atomics, use |
| // those here. |
| *a = r + b; |
| |
| return r; |
| } |
| |
| // Blocking API that farms out count jobs to a thread pool. |
| // It may return with some work undone if func_ptr() returns a non-zero |
| // result. |
| cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo) |
| { |
| cl_uint currentJob = 0; |
| cl_int result = CL_SUCCESS; |
| |
| #ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS |
| // THIS FUNCTION IS NOT INTENDED FOR USE!! |
| log_error("ERROR: Test must be multithreaded!\n"); |
| exit(-1); |
| #else |
| static int spewCount = 0; |
| |
| if (0 == spewCount) |
| { |
| log_info("\nWARNING: The operating system is claimed not to support " |
| "threads of any sort. Running single threaded.\n"); |
| spewCount = 1; |
| } |
| #endif |
| |
| // The multithreaded code should mimic this behavior: |
| for (currentJob = 0; currentJob < count; currentJob++) |
| if ((result = func_ptr(currentJob, 0, userInfo))) return result; |
| |
| return CL_SUCCESS; |
| } |
| |
| cl_uint GetThreadCount(void) { return 1; } |
| |
// The single threaded fallback cannot change its thread count; a request for
// more than one thread is only logged.
void SetThreadCount(int count)
{
    if (count <= 1)
    {
        return;
    }

    log_info("WARNING: SetThreadCount(%d) ignored\n", count);
}
| |
| #endif |