include/pthreadpool.h - external/github.com/google/pthreadpool - Git at Google

 // Copyright (c) 2017 Facebook Inc.
 // Copyright (c) 2015-2017 Georgia Institute of Technology
 // All rights reserved.
 //
 // Copyright 2019 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.

 #ifndef __PTHREADPOOL_INCLUDE_PTHREADPOOL_H_
 #define __PTHREADPOOL_INCLUDE_PTHREADPOOL_H_

 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>

 /// Opaque handle to a thread pool object, as returned by pthreadpool_create()
 /// and pthreadpool_create_v2().
 typedef struct pthreadpool* pthreadpool_t;

 /**
  * Task callback signatures accepted by the `pthreadpool_parallelize_*`
  * functions.
  *
  * The leading `void*` of every signature receives the `context` pointer that
  * was handed to the parallelization function. The calls documented for the
  * 1D and 2D dispatchers declared in this header are:
  *
  *   pthreadpool_task_1d_t:                 function(context, i)
  *   pthreadpool_task_1d_with_thread_t:     function(context, thread_index, i)
  *   pthreadpool_task_1d_tile_1d_t:         function(context, i, tile_size)
  *   pthreadpool_task_1d_tile_1d_dynamic_t: function(context, i, count)
  *   pthreadpool_task_2d_t:                 function(context, i, j)
  *   pthreadpool_task_2d_with_thread_t:     function(context, thread_index, i, j)
  *   pthreadpool_task_2d_tile_1d_t:         function(context, i, j, tile_size_j)
  *
  * NOTE(review): the remaining signatures presumably follow the same layout
  * (optional thread index, then one index per grid dimension, then one size
  * per tiled dimension) -- confirm against the documentation of the matching
  * `pthreadpool_parallelize_*` function.
  */
 typedef void (*pthreadpool_task_1d_t)(void*, size_t);
 typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_task_1d_tile_1d_dynamic_t)(void*, size_t, size_t);
 // Passed to pthreadpool_parallelize_1d_tile_1d_dynamic_with_thread(), which
 // documents the call as function(context, thread_id, i, count).
 typedef void (*pthreadpool_task_1d_tile_1d_dynamic_with_id_t)(void*, uint32_t,
                                                               size_t, size_t);
 typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t,
                                                   size_t);
 typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_2d_tile_1d_dynamic_t)(void*, size_t, size_t,
                                                       size_t);
 typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t,
                                               size_t);
 typedef void (*pthreadpool_task_2d_tile_2d_dynamic_t)(void*, size_t, size_t,
                                                       size_t, size_t);
 typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t,
                                               size_t);
 typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t,
                                                           size_t, size_t,
                                                           size_t);
 typedef void (*pthreadpool_task_3d_tile_1d_dynamic_t)(void*, size_t, size_t,
                                                       size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t,
                                               size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_2d_dynamic_t)(void*, size_t, size_t,
                                                       size_t, size_t, size_t);
 typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t,
                                               size_t, size_t);
 typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t,
                                               size_t, size_t, size_t);
 typedef void (*pthreadpool_task_4d_tile_2d_dynamic_t)(void*, size_t, size_t,
                                                       size_t, size_t, size_t,
                                                       size_t);
 typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t,
                                       size_t);
 typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t,
                                               size_t, size_t, size_t);
 typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t,
                                               size_t, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t,
                                       size_t, size_t);
 typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t,
                                               size_t, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t,
                                               size_t, size_t, size_t, size_t,
                                               size_t);

 /**
  * Task callback signatures that take an extra `uint32_t` id immediately after
  * the context pointer.
  *
  * The `*_with_uarch` dispatchers declared in this header pass the selected
  * microarchitecture index in that slot:
  *
  *   pthreadpool_task_1d_with_id_t:
  *     function(context, uarch_index, i)
  *   pthreadpool_task_2d_tile_1d_with_id_t:
  *     function(context, uarch_index, i, j, tile_size_j)
  *
  * NOTE(review): the `uint32_t` slot is not always a microarchitecture index
  * (pthreadpool_parallelize_1d_tile_1d_dynamic_with_thread() passes a thread id
  * through a `*_with_id_t` callback), so check the documentation of the
  * dispatcher a callback is handed to before relying on its meaning.
  */
 typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t);
 typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t,
                                                       size_t, size_t);
 typedef void (*pthreadpool_task_2d_tile_1d_dynamic_with_id_t)(void*, uint32_t,
                                                               size_t, size_t,
                                                               size_t);
 typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t,
                                                       size_t, size_t, size_t);
 typedef void (*pthreadpool_task_2d_tile_2d_dynamic_with_id_t)(void*, uint32_t,
                                                               size_t, size_t,
                                                               size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t,
                                                       size_t, size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_1d_dynamic_with_id_t)(void*, uint32_t,
                                                               size_t, size_t,
                                                               size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t,
                                                       size_t, size_t, size_t,
                                                       size_t);
 typedef void (*pthreadpool_task_3d_tile_2d_dynamic_with_id_t)(void*, uint32_t,
                                                               size_t, size_t,
                                                               size_t, size_t,
                                                               size_t);
 typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t,
                                                       size_t, size_t, size_t,
                                                       size_t, size_t);
 typedef void (*pthreadpool_task_4d_tile_2d_dynamic_with_id_t)(void*, uint32_t,
                                                               size_t, size_t,
                                                               size_t, size_t,
                                                               size_t, size_t);

 /**
  * Task callback signatures that take both a `uint32_t` id and a thread id
  * ahead of the grid indices.
  *
  * pthreadpool_parallelize_1d_tile_1d_dynamic_with_uarch_with_thread()
  * documents its call as function(context, uarch_index, thread_id, i, count).
  *
  * NOTE(review): the other signatures presumably share the
  * (context, uarch_index, thread_index, ...) prefix -- confirm against the
  * documentation of the matching `pthreadpool_parallelize_*` function.
  */
 typedef void (*pthreadpool_task_1d_tile_1d_dynamic_with_id_with_thread_t)(
     void*, uint32_t, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)(
     void*, uint32_t, size_t, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_2d_tile_1d_dynamic_with_id_with_thread_t)(
     void*, uint32_t, size_t, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)(
     void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
 typedef void (*pthreadpool_task_3d_tile_1d_dynamic_with_id_with_thread_t)(
     void*, uint32_t, size_t, size_t, size_t, size_t, size_t);

 /**
  * Disable support for denormalized numbers to the maximum extent possible for
  * the duration of the computation.
  *
  * Handling denormalized floating-point numbers is often implemented in
  * microcode, and incurs significant performance degradation. This hint
  * instructs the thread pool to disable support for denormalized numbers before
  * running the computation by manipulating architecture-specific control
  * registers, and to restore the initial value of the control registers after
  * the computation is complete. The thread pool temporarily disables
  * denormalized numbers on all threads involved in the computation (i.e. the
  * caller thread, and potentially worker threads).
  *
  * Disabling denormalized numbers may have a small negative effect on results'
  * accuracy. As various architectures differ in capabilities to control
  * processing of denormalized numbers, using this flag may also hurt results'
  * reproducibility across different instruction set architectures.
  *
  * Pass this value in the `flags` argument of a `pthreadpool_parallelize_*`
  * function.
  */
 #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001

 /**
  * Yield worker threads to the system scheduler after the operation is finished.
  *
  * Force workers to use kernel wait (instead of the default active spin-wait)
  * for new commands after this command is processed. This flag affects only the
  * immediate next operation on this thread pool. To make the thread pool always
  * use kernel wait, pass this flag to all parallelization functions.
  *
  * Note: This flag is currently ignored, because yielding the worker threads
  * after a fixed number of spin-wait iterations is now the default behaviour.
  */
 #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002

 /**
  * If worker threads are provided by an external @a pthreadpool_executor,
  * release them back to the executor instead of spinning for a fixed number of
  * iterations first.
  *
  * A thread pool is attached to an executor via pthreadpool_create_v2(). Pass
  * this value in the `flags` argument of a `pthreadpool_parallelize_*`
  * function.
  */
 #define PTHREADPOOL_FLAG_DONT_SPIN_WORKERS 0x00000004

 #ifdef __cplusplus
 extern "C" {
 #endif

 /// An abstract interface of a parallel task executor.
 ///
 /// Both callbacks receive, as `executor_context`, the context pointer that was
 /// supplied together with the executor (see pthreadpool_create_v2() and
 /// pthreadpool_update_executor()).
 struct pthreadpool_executor {
   /// Get the number of tasks that can be executed concurrently.
   ///
   /// The thread pool combines this value with its own maximum thread count to
   /// decide how many tasks to schedule per `pthreadpool_parallelize_*` call.
   int (*num_threads)(void* executor_context);

   /// Schedule `task` to be called, with `context` as its argument.
   ///
   /// A scheduled `task` may keep running for the entire duration of the
   /// `pthreadpool_parallelize_*` call that triggered it.
   void (*schedule)(void* executor_context, void* context,
                    void (*task)(void* context));
 };

 /**
  * Create a thread pool with the specified number of threads.
  *
  * Use pthreadpool_create_v2() instead to create a thread pool whose worker
  * threads are provided by an external @a pthreadpool_executor.
  *
  * @param  threads_count  the number of threads in the thread pool.
  *    A value of 0 has special interpretation: it creates a thread pool with as
  *    many threads as there are logical processors in the system.
  *
  * @return  A pointer to an opaque thread pool object if the call is
  *    successful, or a NULL pointer if the call failed.
  */
 pthreadpool_t pthreadpool_create(size_t threads_count);

 /**
  * Create a thread pool with a given @a pthreadpool_executor and a maximum
  * specified number of threads.
  *
  * For each call to a `pthreadpool_parallelize_*` function, the minimum of @a
  * max_num_threads and @a executor->num_threads(executor_context) calls to @a
  * executor->schedule(executor_context, ...) will be executed, potentially
  * lasting for the entire duration of the `pthreadpool_parallelize_*` call.
  *
  * @param executor          A pointer to a @a pthreadpool_executor object that
  *                          will be used to determine the number of extra
  *                          threads (plus the calling thread), and provide the
  *                          threads themselves, for each call to a
  *                          `pthreadpool_parallelize_*` function.
  * @param executor_context  A pointer to the context that will be passed to the
  *                          functions in the @a executor object.
  * @param max_num_threads   The maximum number of threads in the thread pool.
  *                          A value of 0 has special interpretation: it creates
  *                          a thread pool with as many threads as there are
  *                          logical processors in the system.
  *
  * @return  A pointer to an opaque thread pool object if the call is
  *    successful, or a NULL pointer if the call failed.
  */
 pthreadpool_t pthreadpool_create_v2(struct pthreadpool_executor* executor,
                                     void* executor_context,
                                     size_t max_num_threads);

 /**
  * Query the number of threads in a thread pool.
  *
  * The value can be changed with pthreadpool_set_threads_count().
  *
  * @param  threadpool  the thread pool to query.
  *
  * @return  The number of threads in the thread pool.
  */
 size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);

 /**
  * Try to set the number of threads in a thread pool.
  *
  * The number of threads can be at most the number of threads with which the @a
  * threadpool was created, or the number of threads provided by the @a
  * pthreadpool_executor if the threadpool was created with @a
  * pthreadpool_create_v2.
  *
  * Trying to set a larger value will set and return the maximum possible value.
  *
  * @param  threadpool   The thread pool to modify.
  * @param  num_threads  The desired number of threads. A value of 0 sets the
  *                      number of threads to the maximum available, i.e. the
  *                      value used when the @a threadpool was created.
  *
  * @return  The updated number of threads in the thread pool.
  */
 size_t pthreadpool_set_threads_count(pthreadpool_t threadpool,
                                      size_t num_threads);

 /**
  * Release any threads borrowed from an @a pthreadpool_executor.
  *
  * If the @a threadpool was created with @a pthreadpool_create_v2, this function
  * returns any threads acquired during execution to the associated @a
  * pthreadpool_executor.
  *
  * Threads will be re-acquired as needed on the next call to a
  * `pthreadpool_parallelize_*` function.
  *
  * If the @a threadpool was _not_ created with @a pthreadpool_create_v2, then
  * this function does nothing.
  *
  * @param  threadpool   the thread pool on which to release the executor
  *                      threads. The parameter is spelled
  *                      `struct pthreadpool*`, which is the same type as
  *                      `pthreadpool_t`.
  */
 void pthreadpool_release_executor_threads(struct pthreadpool* threadpool);

 /**
  * Updates a thread pool with a given @a pthreadpool_executor.
  *
  * @param threadpool        The thread pool in which to replace the executor.
  * @param executor          A pointer to a @a pthreadpool_executor object that
  *                          will be used to determine the number of extra
  *                          threads (plus the calling thread), and provide the
  *                          threads themselves, for each call to a
  *                          `pthreadpool_parallelize_*` function.
  * @param executor_context  A pointer to the context that will be passed to the
  *                          functions in the @a executor object.
  *
  * @return  @c true if the @a executor was successfully swapped, and @c false if
  * it was not, e.g. because the current and new @a executor and @a
  * executor_context are identical.
  */
 bool pthreadpool_update_executor(pthreadpool_t threadpool,
                                  struct pthreadpool_executor* executor,
                                  void* executor_context);

 /**
  * Process items on a 1D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range; i++)
  *     function(context, i);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item, invoked as
  *    function(context, i).
  * @param context     the first argument passed to the specified function.
  * @param range       the number of items on the 1D grid to process. The
  *    specified function will be called once for each item.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_1d(pthreadpool_t threadpool,
                                 pthreadpool_task_1d_t function, void* context,
                                 size_t range, uint32_t flags);

 /**
  * Process items on a 1D grid passing along the current thread id.
  *
  * The function implements a parallel version of the following snippet, where
  * `thread_index` stands for the id of the thread that makes the call:
  *
  *   for (size_t i = 0; i < range; i++)
  *     function(context, thread_index, i);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item, invoked as
  *    function(context, thread_index, i).
  * @param context     the first argument passed to the specified function.
  * @param range       the number of items on the 1D grid to process. The
  *    specified function will be called once for each item.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_1d_with_thread(
     pthreadpool_t threadpool, pthreadpool_task_1d_with_thread_t function,
     void* context, size_t range, uint32_t flags);

 /**
  * Process items on a 1D grid using a microarchitecture-aware task function.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range; i++)
  *     function(context, uarch_index, i);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *    threadpool is NULL, all items are processed serially on the calling
  *    thread.
  * @param function             the function to call for each item, invoked as
  *    function(context, uarch_index, i).
  * @param context              the first argument passed to the specified
  *    function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range                the number of items on the 1D grid to process.
  *    The specified function will be called once for each item.
  * @param flags                a bitwise combination of zero or more optional
  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_1d_with_uarch(
     pthreadpool_t threadpool, pthreadpool_task_1d_with_id_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range, uint32_t flags);

 /**
  * Process items on a 1D grid with specified maximum tile size.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range; i += tile)
  *     function(context, i, min(range - i, tile));
  *
  * As the snippet shows, the last tile is shorter than @a tile when @a range is
  * not a multiple of @a tile.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range       the number of items on the 1D grid to process.
  * @param tile        the maximum number of items on the 1D grid to process in
  *    one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool,
                                         pthreadpool_task_1d_tile_1d_t function,
                                         void* context, size_t range,
                                         size_t tile, uint32_t flags);

 /**
  * Process items on a 1D grid with specified preferred tile size.
  *
  * The function repeatedly calls
  *
  *   function(context, i, count)
  *
  * in parallel where `i` is in the range `[0, range)` and a multiple of the
  * provided @a tile and `count` is an integer multiple of @a tile unless `i
  * + count == range`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param range       the number of items on the 1D grid to process.
  * @param tile        the preferred multiple number of items on the 1D grid to
  *     process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_1d_tile_1d_dynamic(
     pthreadpool_t threadpool, pthreadpool_task_1d_tile_1d_dynamic_t function,
     void* context, size_t range, size_t tile, uint32_t flags);

 /**
  * Process items on a 1D grid with specified preferred tile size, passing along
  * the current thread id.
  *
  * The function repeatedly calls
  *
  *   function(context, thread_id, i, count)
  *
  * in parallel where `i` is in the range `[0, range)` and a multiple of the
  * provided @a tile and `count` is an integer multiple of @a tile unless `i
  * + count == range`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each interval of the given range.
  *    Its type is pthreadpool_task_1d_tile_1d_dynamic_with_id_t; the
  *    `uint32_t` argument of that signature receives `thread_id`.
  * @param context     the first argument passed to the specified function.
  * @param range       the number of items on the 1D grid to process.
  * @param tile        the preferred multiple number of items on the 1D grid to
  *     process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_1d_tile_1d_dynamic_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_1d_tile_1d_dynamic_with_id_t function, void* context,
     size_t range, size_t tile, uint32_t flags);

 /**
  * Process items on a 1D grid with specified preferred tile size, passing along
  * the current uarch index and thread id.
  *
  * The function repeatedly calls
  *
  *   function(context, uarch_index, thread_id, i, count)
  *
  * in parallel where `i` is in the range `[0, range)` and a multiple of the
  * provided @a tile and `count` is an integer multiple of @a tile unless `i
  * + count == range`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param default_uarch_index  the fallback microarchitecture index.
  *    NOTE(review): presumably selected by the same rule that
  *    pthreadpool_parallelize_1d_with_uarch() documents (used when cpuinfo is
  *    unavailable or reports an index above max_uarch_index) -- confirm.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. NOTE(review): presumably the same semantics as in
  *    pthreadpool_parallelize_1d_with_uarch() -- confirm.
  * @param range       the number of items on the 1D grid to process.
  * @param tile        the preferred multiple number of items on the 1D grid to
  *     process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_1d_tile_1d_dynamic_with_uarch_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_1d_tile_1d_dynamic_with_id_with_thread_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range, size_t tile, uint32_t flags);

 /**
  * Process items on a 2D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       function(context, i, j);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item, invoked as
  *    function(context, i, j).
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d(pthreadpool_t threadpool,
                                 pthreadpool_task_2d_t function, void* context,
                                 size_t range_i, size_t range_j, uint32_t flags);

 /**
  * Process items on a 2D grid passing along the current thread id.
  *
  * The function implements a parallel version of the following snippet, where
  * `thread_index` stands for the id of the thread that makes the call:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       function(context, thread_index, i, j);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item, invoked as
  *    function(context, thread_index, i, j).
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_with_thread(
     pthreadpool_t threadpool, pthreadpool_task_2d_with_thread_t function,
     void* context, size_t range_i, size_t range_j, uint32_t flags);

 /**
  * Process items on a 2D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       function(context, i, j, min(range_j - j, tile_j));
  *
  * As the snippet shows, the last tile of each row is shorter than @a tile_j
  * when @a range_j is not a multiple of @a tile_j.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param tile_j      the maximum number of items along the second dimension of
  *    the 2D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_1d(pthreadpool_t threadpool,
                                         pthreadpool_task_2d_tile_1d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t tile_j,
                                         uint32_t flags);

 /**
  * Process items on a 2D grid with the specified maximum tile size along the
  * last grid dimension using a microarchitecture-aware task function.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       function(context, uarch_index, i, j, min(range_j - j, tile_j));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *    threadpool is NULL, all items are processed serially on the calling
  *    thread.
  * @param function             the function to call for each tile.
  * @param context              the first argument passed to the specified
  *    function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or the index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items to process along the first
  *    dimension of the 2D grid.
  * @param range_j              the number of items to process along the second
  *    dimension of the 2D grid.
  * @param tile_j               the maximum number of items along the second
  *    dimension of the 2D grid to process in one function call.
  * @param flags                a bitwise combination of zero or more optional
  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_1d_with_uarch(
     pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_with_id_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range_i, size_t range_j, size_t tile_j, uint32_t flags);

 /**
  * Process items on a 2D grid with the specified maximum tile size along the
  * last grid dimension using a microarchitecture-aware task function and passing
  * along the current thread id.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       function(context, uarch_index, thread_index, i, j,
  *         min(range_j - j, tile_j));
  *
  * In the snippet above, `thread_index` denotes the current thread id that is
  * passed along to the task function.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param tile_j      the maximum number of items along the second dimension of
  *    the 2D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_2d_tile_1d_with_id_with_thread_t function, void* context,
     uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
     size_t range_j, size_t tile_j, uint32_t flags);

 /**
  * Process items on a 2D grid with specified preferred tile size along the
  * last grid dimension.
  *
  * The function repeatedly calls
  *
  *   function(context, i, j, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range
  * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an
  * integer multiple of @a tile_j unless `j + count_j == range_j`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param range_i       the number of items on the first dimension of the 2D
  *     grid to process.
  * @param range_j       the number of items on the second dimension of the 2D
  *     grid to process.
  * @param tile_j        the preferred multiple number of items on the second
  *     dimension of the 2D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_1d_dynamic(
     pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_dynamic_t function,
     void* context, size_t range_i, size_t range_j, size_t tile_j,
     uint32_t flags);

 /**
  * Process items on a 2D grid with specified preferred tile size along the
  * last grid dimension, passing along the current thread id.
  *
  * The function repeatedly calls
  *
  *   function(context, thread_id, i, j, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range
  * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an
  * integer multiple of @a tile_j unless `j + count_j == range_j`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param range_i       the number of items on the first dimension of the 2D
  *     grid to process.
  * @param range_j       the number of items on the second dimension of the 2D
  *     grid to process.
  * @param tile_j        the preferred multiple number of items on the second
  *     dimension of the 2D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_1d_dynamic_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_2d_tile_1d_dynamic_with_id_t function, void* context,
     size_t range_i, size_t range_j, size_t tile_j, uint32_t flags);

 /**
  * Process items on a 2D grid with specified preferred tile size along the
  * last grid dimension, passing along the current uarch index and thread id.
  *
  * The function repeatedly calls
  *
  *   function(context, uarch_index, thread_id, i, j, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range
  * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an
  * integer multiple of @a tile_j unless `j + count_j == range_j`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range_i       the number of items on the first dimension of the 2D
  *     grid to process.
  * @param range_j       the number of items on the second dimension of the 2D
  *     grid to process.
  * @param tile_j        the preferred multiple number of items on the second
  *     dimension of the 2D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_1d_dynamic_with_uarch_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_2d_tile_1d_dynamic_with_id_with_thread_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range_i, size_t range_j, size_t tile_j, uint32_t flags);

 /**
  * Process items on a 2D grid with the specified maximum tile size along each
  * grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i += tile_i)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       function(context, i, j,
  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param tile_i      the maximum number of items along the first dimension of
  *    the 2D grid to process in one function call.
  * @param tile_j      the maximum number of items along the second dimension of
  *    the 2D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_2d(pthreadpool_t threadpool,
                                         pthreadpool_task_2d_tile_2d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t tile_i,
                                         size_t tile_j, uint32_t flags);

 /**
  * Process items on a 2D grid with specified preferred tile size along each
  * grid dimension.
  *
  * The function repeatedly calls
  *
  *   function(context, i, j, count_i, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)` and a multiple of the
  * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the
  * provided @a tile_j, and `count_i` and `count_j` are integer multiples of
  * @a tile_i and @a tile_j, unless `i + count_i == range_i` or
  * `j + count_j == range_j`, respectively.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *                    is NULL, all items are processed serially on the calling
  *                    thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items on the first dimension of the 2D
  *                    grid to process.
  * @param range_j     the number of items on the second dimension of the 2D
  *                    grid to process.
  * @param tile_i      the preferred multiple number of items on the first
  *                    dimension of the 2D grid to process in each function call.
  * @param tile_j      the preferred multiple number of items on the second
  *                    dimension of the 2D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *                    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_2d_dynamic(
     pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_dynamic_t function,
     void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j,
     uint32_t flags);

 /**
  * Process items on a 2D grid with specified preferred tile size along each
  * grid dimension using a microarchitecture-aware task function.
  *
  * The function repeatedly calls
  *
  *   function(context, uarch_index, i, j, count_i, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)` and a multiple of the
  * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the
  * provided @a tile_j, and `count_i` and `count_j` are integer multiples of
  * @a tile_i and @a tile_j, unless `i + count_i == range_i` or
  * `j + count_j == range_j`, respectively.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param default_uarch_index  the microarchitecture index to use when
  *                             pthreadpool is configured without cpuinfo,
  *                             cpuinfo initialization failed, or index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds
  *                             the max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected
  *                             by the specified function. If the index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds this
  *                             value, default_uarch_index will be used instead.
  *                             default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items on the first dimension of the
  *                             2D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 2D grid to process.
  * @param tile_i               the preferred multiple number of items on the
  *                             first dimension of the 2D grid to process in each
  *                             function call.
  * @param tile_j               the preferred multiple number of items on the
  *                             second dimension of the 2D grid to process in
  *                             each function call.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch(
     pthreadpool_t threadpool,
     pthreadpool_task_2d_tile_2d_dynamic_with_id_t function, void* context,
     uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
     size_t range_j, size_t tile_i, size_t tile_j, uint32_t flags);

 /**
  * Process items on a 2D grid with specified preferred tile size along each
  * grid dimension, passing along the current thread id.
  *
  * The function repeatedly calls
  *
  *   function(context, thread_id, i, j, count_i, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)` and a multiple of the
  * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the
  * provided @a tile_j, and `count_i` and `count_j` are integer multiples of
  * @a tile_i and @a tile_j, unless `i + count_i == range_i` or
  * `j + count_j == range_j`, respectively.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param range_i              the number of items on the first dimension of the
  *                             2D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 2D grid to process.
  * @param tile_i               the preferred multiple number of items on the
  *                             first dimension of the 2D grid to process in each
  *                             function call.
  * @param tile_j               the preferred multiple number of items on the
  *                             second dimension of the 2D grid to process in
  *                             each function call.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_2d_dynamic_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_2d_tile_2d_dynamic_with_id_t function, void* context,
     size_t range_i, size_t range_j, size_t tile_i, size_t tile_j,
     uint32_t flags);

 /**
  * Process items on a 2D grid with the specified maximum tile size along each
  * grid dimension using a microarchitecture-aware task function.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range_i; i += tile_i)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       function(context, uarch_index, i, j,
  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *    threadpool is NULL, all items are processed serially on the calling
  *    thread.
  * @param function             the function to call for each tile.
  * @param context              the first argument passed to the specified
  *    function.
  * @param default_uarch_index  the microarchitecture index to use when
  *                             pthreadpool is configured without cpuinfo,
  *                             cpuinfo initialization failed, or index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds
  *                             the max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected
  *                             by the specified function. If the index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds this
  *                             value, default_uarch_index will be used instead.
  *                             default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items to process along the first
  *    dimension of the 2D grid.
  * @param range_j              the number of items to process along the second
  *    dimension of the 2D grid.
  * @param tile_i               the maximum number of items along the first
  *    dimension of the 2D grid to process in one function call.
  * @param tile_j               the maximum number of items along the second
  *    dimension of the 2D grid to process in one function call.
  * @param flags                a bitwise combination of zero or more optional
  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_2d_tile_2d_with_uarch(
     pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_with_id_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range_i, size_t range_j, size_t tile_i, size_t tile_j,
     uint32_t flags);

 /**
  * Process items on a 3D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         function(context, i, j, k);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 3D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 3D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 3D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d(pthreadpool_t threadpool,
                                 pthreadpool_task_3d_t function, void* context,
                                 size_t range_i, size_t range_j, size_t range_k,
                                 uint32_t flags);

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         function(context, i, j, k, min(range_k - k, tile_k));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 3D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 3D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 3D grid.
  * @param tile_k      the maximum number of items along the third dimension of
  *    the 3D grid to process in one function call. The last tile along this
  *    dimension is smaller when range_k is not a multiple of tile_k.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_1d(pthreadpool_t threadpool,
                                         pthreadpool_task_3d_tile_1d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t tile_k, uint32_t flags);

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last grid dimension and passing along the current thread id.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         function(context, thread_index, i, j, k, min(range_k - k, tile_k));
  *
  * In the snippet above, `thread_index` denotes the current thread id that is
  * passed along to the task function.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 3D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 3D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 3D grid.
  * @param tile_k      the maximum number of items along the third dimension of
  *    the 3D grid to process in one function call. The last tile along this
  *    dimension is smaller when range_k is not a multiple of tile_k.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_1d_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_3d_tile_1d_with_thread_t function, void* context,
     size_t range_i, size_t range_j, size_t range_k, size_t tile_k,
     uint32_t flags);

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last grid dimension using a microarchitecture-aware task function.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         function(context, uarch_index, i, j, k, min(range_k - k, tile_k));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *    threadpool is NULL, all items are processed serially on the calling
  *    thread.
  * @param function             the function to call for each tile.
  * @param context              the first argument passed to the specified
  *    function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items to process along the first
  *    dimension of the 3D grid.
  * @param range_j              the number of items to process along the second
  *    dimension of the 3D grid.
  * @param range_k              the number of items to process along the third
  *    dimension of the 3D grid.
  * @param tile_k               the maximum number of items along the third
  *    dimension of the 3D grid to process in one function call. The last tile
  *    along this dimension is smaller when range_k is not a multiple of tile_k.
  * @param flags                a bitwise combination of zero or more optional
  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_1d_with_uarch(
     pthreadpool_t threadpool, pthreadpool_task_3d_tile_1d_with_id_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range_i, size_t range_j, size_t range_k, size_t tile_k,
     uint32_t flags);

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last grid dimension using a microarchitecture-aware task function and passing
  * along the current thread id.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         function(context, uarch_index, thread_index, i, j, k,
  *           min(range_k - k, tile_k));
  *
  * In the snippet above, `thread_index` denotes the current thread id that is
  * passed along to the task function.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *    threadpool is NULL, all items are processed serially on the calling
  *    thread.
  * @param function             the function to call for each tile.
  * @param context              the first argument passed to the specified
  *    function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items to process along the first
  *    dimension of the 3D grid.
  * @param range_j              the number of items to process along the second
  *    dimension of the 3D grid.
  * @param range_k              the number of items to process along the third
  *    dimension of the 3D grid.
  * @param tile_k               the maximum number of items along the third
  *    dimension of the 3D grid to process in one function call.
  * @param flags                a bitwise combination of zero or more optional
  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_3d_tile_1d_with_id_with_thread_t function, void* context,
     uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
     size_t range_j, size_t range_k, size_t tile_k, uint32_t flags);

 /**
  * Process items on a 3D grid with the specified preferred tile size along the
  * last grid dimension.
  *
  * The function repeatedly calls
  *
  *   function(context, i, j, k, count_k)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)`,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `count_k` is an integer multiple of @a tile_k, unless `k + count_k ==
  *    range_k`.
  *
  * The `count_k` values are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param range_i              the number of items on the first dimension of the
  *                             3D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 3D grid to process.
  * @param range_k              the number of items on the third dimension of the
  *                             3D grid to process.
  * @param tile_k               the preferred tile size along the third dimension
  *                             of the 3D grid; each function call processes a
  *                             multiple of this many items, except possibly at
  *                             the end of the range.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_1d_dynamic(
     pthreadpool_t threadpool,
     pthreadpool_task_3d_tile_1d_dynamic_t function, void* context,
     size_t range_i, size_t range_j, size_t range_k, size_t tile_k,
     uint32_t flags);

 /**
  * Process items on a 3D grid with the specified preferred tile size along the
  * last grid dimension, passing along the thread ID.
  *
  * The function repeatedly calls
  *
  *   function(context, thread_id, i, j, k, count_k)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)`,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `count_k` is an integer multiple of @a tile_k, unless `k + count_k ==
  *    range_k`.
  *
  * The `count_k` values are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param range_i              the number of items on the first dimension of the
  *                             3D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 3D grid to process.
  * @param range_k              the number of items on the third dimension of the
  *                             3D grid to process.
  * @param tile_k               the preferred tile size along the third dimension
  *                             of the 3D grid; each function call processes a
  *                             multiple of this many items, except possibly at
  *                             the end of the range.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_1d_dynamic_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_3d_tile_1d_dynamic_with_id_t function, void* context,
     size_t range_i, size_t range_j, size_t range_k, size_t tile_k,
     uint32_t flags);

 /**
  * Process items on a 3D grid with the specified preferred tile size along the
  * last grid dimension, using a microarchitecture-aware task function and
  * passing along the thread ID.
  *
  * The function repeatedly calls
  *
  *   function(context, uarch_index, thread_index, i, j, k, count_k)
  *
  * in parallel where:
  *  - `uarch_index` is the index returned by
  *    cpuinfo_get_current_uarch_index(), or @a default_uarch_index in the cases
  *    described for that parameter below,
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)`,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `count_k` is an integer multiple of @a tile_k, unless `k + count_k ==
  *    range_k`.
  *
  * The `count_k` values are chosen so as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param default_uarch_index  the microarchitecture index to use when
  *                             pthreadpool is configured without cpuinfo,
  *                             cpuinfo initialization failed, or index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds the
  *                             max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *                             the specified function. If the index returned by
  *                             cpuinfo_get_current_uarch_index() exceeds this
  *                             value, default_uarch_index will be used instead.
  *                             default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items on the first dimension of the
  *                             3D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 3D grid to process.
  * @param range_k              the number of items on the third dimension of the
  *                             3D grid to process.
  * @param tile_k               the preferred tile size along the third dimension
  *                             of the 3D grid; each function call processes a
  *                             multiple of this many items, except possibly at
  *                             the end of the range.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_1d_dynamic_with_uarch_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_3d_tile_1d_dynamic_with_id_with_thread_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range_i, size_t range_j, size_t range_k, size_t tile_k,
     uint32_t flags);

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         function(context, i, j, k,
  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
  *
  * The last tile along a tiled dimension is smaller than the tile size when the
  * corresponding range is not a multiple of it.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 3D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 3D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 3D grid.
  * @param tile_j      the maximum number of items along the second dimension of
  *    the 3D grid to process in one function call.
  * @param tile_k      the maximum number of items along the third dimension of
  *    the 3D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_2d(pthreadpool_t threadpool,
                                         pthreadpool_task_3d_tile_2d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t tile_j, size_t tile_k,
                                         uint32_t flags);

 /**
  * Process items on a 3D grid with the specified preferred tile size along the
  * last two grid dimensions.
  *
  * The function repeatedly calls
  *
  *   function(context, i, j, k, count_j, count_k)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)` and a multiple of the provided @a
  *    tile_j,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `count_j` is an integer multiple of @a tile_j, unless `j + count_j ==
  *    range_j`,
  *  - `count_k` is an integer multiple of @a tile_k, unless `k + count_k ==
  *    range_k`.
  *
  * The `count_j` and `count_k` values are chosen so as to minimize the number
  * of calls to @a function while keeping the computation load balanced across
  * all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *                    is NULL, all items are processed serially on the calling
  *                    thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items on the first dimension of the 3D
  *                    grid to process.
  * @param range_j     the number of items on the second dimension of the 3D
  *                    grid to process.
  * @param range_k     the number of items on the third dimension of the 3D
  *                    grid to process.
  * @param tile_j      the preferred tile size along the second dimension of the
  *                    3D grid; each function call processes a multiple of this
  *                    many items, except possibly at the end of the range.
  * @param tile_k      the preferred tile size along the third dimension of the
  *                    3D grid; each function call processes a multiple of this
  *                    many items, except possibly at the end of the range.
  * @param flags       a bitwise combination of zero or more optional flags
  *                    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_2d_dynamic(
     pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_dynamic_t function,
     void* context, size_t range_i, size_t range_j, size_t range_k,
     size_t tile_j, size_t tile_k, uint32_t flags);

 /**
  * Process items on a 3D grid with the specified preferred tile size along the
  * last two grid dimensions using a microarchitecture-aware task function.
  *
  * The function repeatedly calls
  *
  *   function(context, uarch_index, i, j, k, count_j, count_k)
  *
  * in parallel where:
  *  - `uarch_index` is the index returned by
  *    cpuinfo_get_current_uarch_index(), or @a default_uarch_index in the cases
  *    described for that parameter below,
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)` and a multiple of the provided @a
  *    tile_j,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `count_j` is an integer multiple of @a tile_j, unless `j + count_j ==
  *    range_j`,
  *  - `count_k` is an integer multiple of @a tile_k, unless `k + count_k ==
  *    range_k`.
  *
  * The `count_j` and `count_k` values are chosen so as to minimize the number
  * of calls to @a function while keeping the computation load balanced across
  * all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param default_uarch_index  the microarchitecture index to use when
  *                             pthreadpool is configured without cpuinfo,
  *                             cpuinfo initialization failed, or index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds
  *                             the max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected
  *                             by the specified function. If the index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds this
  *                             value, default_uarch_index will be used instead.
  *                             default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items on the first dimension of the
  *                             3D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 3D grid to process.
  * @param range_k              the number of items on the third dimension of the
  *                             3D grid to process.
  * @param tile_j               the preferred tile size along the second
  *                             dimension of the 3D grid; each function call
  *                             processes a multiple of this many items, except
  *                             possibly at the end of the range.
  * @param tile_k               the preferred tile size along the third dimension
  *                             of the 3D grid; each function call processes a
  *                             multiple of this many items, except possibly at
  *                             the end of the range.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch(
     pthreadpool_t threadpool,
     pthreadpool_task_3d_tile_2d_dynamic_with_id_t function, void* context,
     uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
     size_t range_j, size_t range_k, size_t tile_j, size_t tile_k,
     uint32_t flags);

 /**
  * Process items on a 3D grid with the specified preferred tile size along the
  * last two grid dimensions, passing along the thread ID.
  *
  * The function repeatedly calls
  *
  *   function(context, thread_id, i, j, k, count_j, count_k)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)` and a multiple of the provided @a
  *    tile_j,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `count_j` is an integer multiple of @a tile_j, unless `j + count_j ==
  *    range_j`,
  *  - `count_k` is an integer multiple of @a tile_k, unless `k + count_k ==
  *    range_k`.
  *
  * The `count_j` and `count_k` values are chosen so as to minimize the number
  * of calls to @a function while keeping the computation load balanced across
  * all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param range_i              the number of items on the first dimension of the
  *                             3D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 3D grid to process.
  * @param range_k              the number of items on the third dimension of the
  *                             3D grid to process.
  * @param tile_j               the preferred tile size along the second
  *                             dimension of the 3D grid; each function call
  *                             processes a multiple of this many items, except
  *                             possibly at the end of the range.
  * @param tile_k               the preferred tile size along the third dimension
  *                             of the 3D grid; each function call processes a
  *                             multiple of this many items, except possibly at
  *                             the end of the range.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_2d_dynamic_with_thread(
     pthreadpool_t threadpool,
     pthreadpool_task_3d_tile_2d_dynamic_with_id_t function, void* context,
     size_t range_i, size_t range_j, size_t range_k, size_t tile_j,
     size_t tile_k, uint32_t flags);

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last two grid dimensions using a microarchitecture-aware task function.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         function(context, uarch_index, i, j, k,
  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
  *
  * The last tile along a tiled dimension is smaller than the tile size when the
  * corresponding range is not a multiple of it.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *    threadpool is NULL, all items are processed serially on the calling
  *    thread.
  * @param function             the function to call for each tile.
  * @param context              the first argument passed to the specified
  *    function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items to process along the first
  *    dimension of the 3D grid.
  * @param range_j              the number of items to process along the second
  *    dimension of the 3D grid.
  * @param range_k              the number of items to process along the third
  *    dimension of the 3D grid.
  * @param tile_j               the maximum number of items along the second
  *    dimension of the 3D grid to process in one function call.
  * @param tile_k               the maximum number of items along the third
  *    dimension of the 3D grid to process in one function call.
  * @param flags                a bitwise combination of zero or more optional
  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_3d_tile_2d_with_uarch(
     pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_with_id_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range_i, size_t range_j, size_t range_k, size_t tile_j,
     size_t tile_k, uint32_t flags);

 /**
  * Process items on a 4D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           function(context, i, j, k, l);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 4D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 4D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 4D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 4D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_4d(pthreadpool_t threadpool,
                                 pthreadpool_task_4d_t function, void* context,
                                 size_t range_i, size_t range_j, size_t range_k,
                                 size_t range_l, uint32_t flags);

 /**
  * Process items on a 4D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l += tile_l)
  *           function(context, i, j, k, l, min(range_l - l, tile_l));
  *
  * The last tile along the fourth dimension is smaller than @a tile_l when
  * @a range_l is not a multiple of it.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 4D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 4D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 4D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 4D grid.
  * @param tile_l      the maximum number of items along the fourth dimension of
  *    the 4D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_4d_tile_1d(pthreadpool_t threadpool,
                                         pthreadpool_task_4d_tile_1d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t range_l, size_t tile_l,
                                         uint32_t flags);

 /**
  * Process items on a 4D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         for (size_t l = 0; l < range_l; l += tile_l)
  *           function(context, i, j, k, l,
  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
  *
  * The last tile along a tiled dimension is smaller than the tile size when the
  * corresponding range is not a multiple of it.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 4D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 4D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 4D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 4D grid.
  * @param tile_k      the maximum number of items along the third dimension of
  *    the 4D grid to process in one function call.
  * @param tile_l      the maximum number of items along the fourth dimension of
  *    the 4D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_4d_tile_2d(pthreadpool_t threadpool,
                                         pthreadpool_task_4d_tile_2d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t range_l, size_t tile_k,
                                         size_t tile_l, uint32_t flags);

 /**
  * Process items on a 4D grid with the specified maximum tile size along the
  * last two grid dimensions using a microarchitecture-aware task function.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   uint32_t uarch_index = cpuinfo_initialize() ?
  *       cpuinfo_get_current_uarch_index() : default_uarch_index;
  *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         for (size_t l = 0; l < range_l; l += tile_l)
  *           function(context, uarch_index, i, j, k, l,
  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
  *
  * The last tile along a tiled dimension is smaller than the tile size when the
  * corresponding range is not a multiple of it.
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *    threadpool is NULL, all items are processed serially on the calling
  *    thread.
  * @param function             the function to call for each tile.
  * @param context              the first argument passed to the specified
  *    function.
  * @param default_uarch_index  the microarchitecture index to use when
  *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
  *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
  *    max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected by
  *    the specified function. If the index returned by
  *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
  *    will be used instead. default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items to process along the first
  *    dimension of the 4D grid.
  * @param range_j              the number of items to process along the second
  *    dimension of the 4D grid.
  * @param range_k              the number of items to process along the third
  *    dimension of the 4D grid.
  * @param range_l              the number of items to process along the fourth
  *    dimension of the 4D grid.
  * @param tile_k               the maximum number of items along the third
  *    dimension of the 4D grid to process in one function call.
  * @param tile_l               the maximum number of items along the fourth
  *    dimension of the 4D grid to process in one function call.
  * @param flags                a bitwise combination of zero or more optional
  *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_4d_tile_2d_with_uarch(
     pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_with_id_t function,
     void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
     size_t range_i, size_t range_j, size_t range_k, size_t range_l,
     size_t tile_k, size_t tile_l, uint32_t flags);

 /**
  * Process items on a 4D grid with the specified preferred tile size along the
  * last two grid dimensions.
  *
  * The function repeatedly calls
  *
  *   function(context, i, j, k, l, count_k, count_l)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)`,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `l` is in the range `[0, range_l)` and a multiple of the provided @a
  *    tile_l,
  *  - `count_k` and `count_l` are integer multiples of @a tile_k and @a tile_l,
  *    unless `k + count_k == range_k` or `l + count_l == range_l`, respectively.
  *
  * The `count`s are chosen such as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *                    is NULL, all items are processed serially on the calling
  *                    thread.
  * @param function    the function to call for each interval of the given range.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items on the first dimension of the 4D
  *                    grid to process.
  * @param range_j     the number of items on the second dimension of the 4D
  *                    grid to process.
  * @param range_k     the number of items on the third dimension of the 4D
  *                    grid to process.
  * @param range_l     the number of items on the fourth dimension of the 4D
  *                    grid to process.
  * @param tile_k      the preferred multiple number of items on the third
  *                    dimension of the 4D grid to process in each function call.
  * @param tile_l      the preferred multiple number of items on the fourth
  *                    dimension of the 4D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *                    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_4d_tile_2d_dynamic(
     pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_dynamic_t function,
     void* context, size_t range_i, size_t range_j, size_t range_k,
     size_t range_l, size_t tile_k, size_t tile_l, uint32_t flags);

 /**
  * Process items on a 4D grid with specified preferred tile size along the last
  * two grid dimensions using a microarchitecture-aware task function.
  *
  * The function repeatedly calls
  *
  *   function(context, uarch_index, i, j, k, l, count_k, count_l)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)`,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `l` is in the range `[0, range_l)` and a multiple of the provided @a
  *    tile_l,
  *  - `count_k` and `count_l` are integer multiples of @a tile_k and @a tile_l,
  *    unless `k + count_k == range_k` or `l + count_l == range_l`, respectively.
  *
  * The `count`s are chosen such as to minimize the number of calls to @a
  * function while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool           the thread pool to use for parallelisation. If
  *                             threadpool is NULL, all items are processed
  *                             serially on the calling thread.
  * @param function             the function to call for each interval of the
  *                             given range.
  * @param context              the first argument passed to the specified
  *                             function.
  * @param default_uarch_index  the microarchitecture index to use when
  *                             pthreadpool is configured without cpuinfo,
  *                             cpuinfo initialization failed, or index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds
  *                             the max_uarch_index value.
  * @param max_uarch_index      the maximum microarchitecture index expected
  *                             by the specified function. If the index returned
  *                             by cpuinfo_get_current_uarch_index() exceeds this
  *                             value, default_uarch_index will be used instead.
  *                             default_uarch_index can exceed max_uarch_index.
  * @param range_i              the number of items on the first dimension of the
  *                             4D grid to process.
  * @param range_j              the number of items on the second dimension of
  *                             the 4D grid to process.
  * @param range_k              the number of items on the third dimension of the
  *                             4D grid to process.
  * @param range_l              the number of items on the fourth dimension of
  *                             the 4D grid to process.
  * @param tile_k               the preferred multiple number of items on the
  *                             third dimension of the 4D grid to process in each
  *                             function call.
  * @param tile_l               the preferred multiple number of items on the
  *                             fourth dimension of the 4D grid to process in
  *                             each function call.
  * @param flags                a bitwise combination of zero or more optional
  *                             flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                             PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch(
     pthreadpool_t threadpool,
     pthreadpool_task_4d_tile_2d_dynamic_with_id_t function, void* context,
     uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
     size_t range_j, size_t range_k, size_t range_l, size_t tile_k,
     size_t tile_l, uint32_t flags);

 /**
  * Process items on a 5D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m++)
  *             function(context, i, j, k, l, m);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 5D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 5D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 5D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 5D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 5D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_5d(pthreadpool_t threadpool,
                                 pthreadpool_task_5d_t function, void* context,
                                 size_t range_i, size_t range_j, size_t range_k,
                                 size_t range_l, size_t range_m, uint32_t flags);

 /**
  * Process items on a 5D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m += tile_m)
  *             function(context, i, j, k, l, m, min(range_m - m, tile_m));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 5D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 5D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 5D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 5D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 5D grid.
  * @param tile_m      the maximum number of items along the fifth dimension of
  *    the 5D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_5d_tile_1d(pthreadpool_t threadpool,
                                         pthreadpool_task_5d_tile_1d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t range_l, size_t range_m,
                                         size_t tile_m, uint32_t flags);

 /**
  * Process items on a 5D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l += tile_l)
  *           for (size_t m = 0; m < range_m; m += tile_m)
  *             function(context, i, j, k, l, m,
  *               min(range_l - l, tile_l), min(range_m - m, tile_m));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 5D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 5D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 5D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 5D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 5D grid.
  * @param tile_l      the maximum number of items along the fourth dimension of
  *    the 5D grid to process in one function call.
  * @param tile_m      the maximum number of items along the fifth dimension of
  *    the 5D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_5d_tile_2d(pthreadpool_t threadpool,
                                         pthreadpool_task_5d_tile_2d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t range_l, size_t range_m,
                                         size_t tile_l, size_t tile_m,
                                         uint32_t flags);

 /**
  * Process items on a 6D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m++)
  *             for (size_t n = 0; n < range_n; n++)
  *               function(context, i, j, k, l, m, n);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each item.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 6D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 6D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 6D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 6D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 6D grid.
  * @param range_n     the number of items to process along the sixth dimension
  *    of the 6D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_6d(pthreadpool_t threadpool,
                                 pthreadpool_task_6d_t function, void* context,
                                 size_t range_i, size_t range_j, size_t range_k,
                                 size_t range_l, size_t range_m, size_t range_n,
                                 uint32_t flags);

 /**
  * Process items on a 6D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m++)
  *             for (size_t n = 0; n < range_n; n += tile_n)
  *               function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 6D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 6D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 6D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 6D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 6D grid.
  * @param range_n     the number of items to process along the sixth dimension
  *    of the 6D grid.
  * @param tile_n      the maximum number of items along the sixth dimension of
  *    the 6D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_6d_tile_1d(pthreadpool_t threadpool,
                                         pthreadpool_task_6d_tile_1d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t range_l, size_t range_m,
                                         size_t range_n, size_t tile_n,
                                         uint32_t flags);

 /**
  * Process items on a 6D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m += tile_m)
  *             for (size_t n = 0; n < range_n; n += tile_n)
  *               function(context, i, j, k, l, m, n,
  *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param function    the function to call for each tile.
  * @param context     the first argument passed to the specified function.
  * @param range_i     the number of items to process along the first dimension
  *    of the 6D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 6D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 6D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 6D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 6D grid.
  * @param range_n     the number of items to process along the sixth dimension
  *    of the 6D grid.
  * @param tile_m      the maximum number of items along the fifth dimension of
  *    the 6D grid to process in one function call.
  * @param tile_n      the maximum number of items along the sixth dimension of
  *    the 6D grid to process in one function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 void pthreadpool_parallelize_6d_tile_2d(pthreadpool_t threadpool,
                                         pthreadpool_task_6d_tile_2d_t function,
                                         void* context, size_t range_i,
                                         size_t range_j, size_t range_k,
                                         size_t range_l, size_t range_m,
                                         size_t range_n, size_t tile_m,
                                         size_t tile_n, uint32_t flags);

 /**
  * Terminates threads in the thread pool and releases associated resources.
  *
  * @warning  Accessing the thread pool after a call to this function constitutes
  *    undefined behaviour and may cause data corruption.
  *
  * @param[in,out]  threadpool  The thread pool to destroy.
  */
 void pthreadpool_destroy(pthreadpool_t threadpool);

 #ifndef PTHREADPOOL_NO_DEPRECATED_API

 /* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */
 /*
  * PTHREADPOOL_DEPRECATED is appended to each legacy function declaration
  * below. It expands to the `deprecated` attribute when __GNUC__ is defined
  * and to nothing otherwise.
  */
 #if defined(__GNUC__)
 #define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
 #else
 #define PTHREADPOOL_DEPRECATED
 #endif

 typedef void (*pthreadpool_function_1d_t)(void*, size_t);
 typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t,
                                                 size_t);
 typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t,
                                                 size_t, size_t, size_t);
 typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t,
                                                 size_t, size_t, size_t, size_t,
                                                 size_t);

 void pthreadpool_compute_1d(pthreadpool_t threadpool,
                             pthreadpool_function_1d_t function, void* argument,
                             size_t range) PTHREADPOOL_DEPRECATED;

 void pthreadpool_compute_1d_tiled(pthreadpool_t threadpool,
                                   pthreadpool_function_1d_tiled_t function,
                                   void* argument, size_t range,
                                   size_t tile) PTHREADPOOL_DEPRECATED;

 void pthreadpool_compute_2d(pthreadpool_t threadpool,
                             pthreadpool_function_2d_t function, void* argument,
                             size_t range_i,
                             size_t range_j) PTHREADPOOL_DEPRECATED;

 void pthreadpool_compute_2d_tiled(pthreadpool_t threadpool,
                                   pthreadpool_function_2d_tiled_t function,
                                   void* argument, size_t range_i,
                                   size_t range_j, size_t tile_i,
                                   size_t tile_j) PTHREADPOOL_DEPRECATED;

 void pthreadpool_compute_3d_tiled(pthreadpool_t threadpool,
                                   pthreadpool_function_3d_tiled_t function,
                                   void* argument, size_t range_i,
                                   size_t range_j, size_t range_k, size_t tile_i,
                                   size_t tile_j,
                                   size_t tile_k) PTHREADPOOL_DEPRECATED;

 void pthreadpool_compute_4d_tiled(pthreadpool_t threadpool,
                                   pthreadpool_function_4d_tiled_t function,
                                   void* argument, size_t range_i,
                                   size_t range_j, size_t range_k,
                                   size_t range_l, size_t tile_i, size_t tile_j,
                                   size_t tile_k,
                                   size_t tile_l) PTHREADPOOL_DEPRECATED;

 #endif /* PTHREADPOOL_NO_DEPRECATED_API */

 #ifdef __cplusplus
 } /* extern "C" */
 #endif

 #ifdef __cplusplus

 namespace libpthreadpool {
 namespace detail {
 namespace {  // NOLINT: Naming this namespace would expose it.

 /*
  * Trampolines with plain function signatures, one per grid/tile shape.
  *
  * Each one reinterprets its first (opaque) argument as a pointer to a
  * `const T` callable and invokes that callable with the remaining arguments,
  * in the order they were received.
  */

 template <class T>
 void call_wrapper_1d(void* ctx, size_t i) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i);
 }

 template <class T>
 void call_wrapper_1d_tile_1d(void* ctx, size_t i, size_t tile_i) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, tile_i);
 }

 template <class T>
 void call_wrapper_1d_tile_1d_dynamic(void* ctx, size_t i, size_t tile_i) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, tile_i);
 }

 template <class T>
 void call_wrapper_2d(void* ctx, size_t i, size_t j) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j);
 }

 template <class T>
 void call_wrapper_2d_tile_1d(void* ctx, size_t i, size_t j, size_t tile_j) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, tile_j);
 }

 template <class T>
 void call_wrapper_2d_tile_1d_dynamic(void* ctx, size_t i, size_t j,
                                      size_t tile_j) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, tile_j);
 }

 template <class T>
 void call_wrapper_2d_tile_2d(void* ctx, size_t i, size_t j, size_t tile_i,
                              size_t tile_j) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, tile_i, tile_j);
 }

 template <class T>
 void call_wrapper_2d_tile_2d_dynamic(void* ctx, size_t i, size_t j,
                                      size_t tile_i, size_t tile_j) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, tile_i, tile_j);
 }

 template <class T>
 void call_wrapper_3d(void* ctx, size_t i, size_t j, size_t k) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k);
 }

 template <class T>
 void call_wrapper_3d_tile_1d(void* ctx, size_t i, size_t j, size_t k,
                              size_t tile_k) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, tile_k);
 }

 template <class T>
 void call_wrapper_3d_tile_2d(void* ctx, size_t i, size_t j, size_t k,
                              size_t tile_j, size_t tile_k) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, tile_j, tile_k);
 }

 template <class T>
 void call_wrapper_3d_tile_2d_dynamic(void* ctx, size_t i, size_t j, size_t k,
                                      size_t tile_j, size_t tile_k) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, tile_j, tile_k);
 }

 template <class T>
 void call_wrapper_4d(void* ctx, size_t i, size_t j, size_t k, size_t l) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l);
 }

 template <class T>
 void call_wrapper_4d_tile_1d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                              size_t tile_l) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, tile_l);
 }

 template <class T>
 void call_wrapper_4d_tile_2d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                              size_t tile_k, size_t tile_l) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, tile_k, tile_l);
 }

 template <class T>
 void call_wrapper_4d_tile_2d_dynamic(void* ctx, size_t i, size_t j, size_t k,
                                      size_t l, size_t tile_k, size_t tile_l) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, tile_k, tile_l);
 }

 template <class T>
 void call_wrapper_5d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                      size_t m) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, m);
 }

 template <class T>
 void call_wrapper_5d_tile_1d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                              size_t m, size_t tile_m) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, m, tile_m);
 }

 template <class T>
 void call_wrapper_5d_tile_2d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                              size_t m, size_t tile_l, size_t tile_m) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, m, tile_l, tile_m);
 }

 template <class T>
 void call_wrapper_6d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                      size_t m, size_t n) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, m, n);
 }

 template <class T>
 void call_wrapper_6d_tile_1d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                              size_t m, size_t n, size_t tile_n) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, m, n, tile_n);
 }

 template <class T>
 void call_wrapper_6d_tile_2d(void* ctx, size_t i, size_t j, size_t k, size_t l,
                              size_t m, size_t n, size_t tile_m,
                              size_t tile_n) {
   const T& callee = *static_cast<const T*>(ctx);
   callee(i, j, k, l, m, n, tile_m, tile_n);
 }

 } /* namespace */
 } /* namespace detail */
 } /* namespace libpthreadpool */

 /**
  * Drop-in C++ wrapper for the @a pthreadpool_executor struct that uses itself
  * as its own context.
  *
  * The constructor points the `num_threads` and `schedule` callbacks at static
  * trampolines. Those trampolines treat the executor context they receive as a
  * pointer to this class and forward to the virtual @a NumThreads and
  * @a Schedule methods, so the value returned by @a GetContext is what must be
  * supplied as that context.
  */
 class PthreadpoolExecutor : public pthreadpool_executor {
  public:
   using TaskFunction = void (*)(void*);

   PthreadpoolExecutor() {
     num_threads = num_threads_impl;
     schedule = schedule_impl;
   }
   virtual ~PthreadpoolExecutor() = default;

   /**
    * Return the context of this @a PthreadpoolExecutor, e.g. for the @a
    * pthreadpool_create_v2 function.
    */
   void* GetContext() { return this; }

   /**
    * Override these methods for your own threadpool.
    */
   virtual int NumThreads() = 0;
   virtual void Schedule(void* context, TaskFunction task) = 0;

  private:
   // `executor` is the `void*` produced by GetContext(), i.e. a
   // PthreadpoolExecutor* converted to void*; static_cast is the exact inverse
   // of that conversion.
   static int num_threads_impl(void* executor) {
     return static_cast<PthreadpoolExecutor*>(executor)->NumThreads();
   }

   static void schedule_impl(void* executor, void* context, TaskFunction task) {
     static_cast<PthreadpoolExecutor*>(executor)->Schedule(context, task);
   }
 };

 /**
  * Run a functor over every item of a 1D grid.
  *
  * Parallel equivalent of:
  *
  *   for (size_t i = 0; i < range; i++)
  *     functor(i);
  *
  * By the time this call returns every item has been processed and the thread
  * pool can accept a new task.
  *
  * @note Calls issued from multiple threads against the same thread pool are
  *    serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each item.
  * @param range       the number of items on the 1D grid to process. The
  *    specified functor will be called once for each item.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_1d(pthreadpool_t threadpool,
                                        const T& functor, size_t range,
                                        uint32_t flags = 0) {
   // The C entry point takes a mutable `void*` context; the trampoline casts it
   // back to `const T*`, so dropping constness here is only a type formality.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_1d(
       threadpool, &libpthreadpool::detail::call_wrapper_1d<const T>, context,
       range, flags);
 }

 /**
  * Process items on a 1D grid with specified maximum tile size.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range; i += tile)
  *     functor(i, min(range - i, tile));
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range       the number of items on the 1D grid to process.
  * @param tile        the maximum number of items on the 1D grid to process in
  *    one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool,
                                                const T& functor, size_t range,
                                                size_t tile,
                                                uint32_t flags = 0) {
   // The functor travels as an untyped context pointer; constness is cast
   // away only so that its address fits the `void*` argument.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_1d_tile_1d(
       threadpool, &libpthreadpool::detail::call_wrapper_1d_tile_1d<const T>,
       context, range, tile, flags);
 }

 /**
  * Process items on a 1D grid with specified preferred tile size.
  *
  * The function repeatedly calls
  *
  *   functor(i, count)
  *
  * in parallel where `i` is in the range `[0, range)` and a multiple of the
  * provided @a tile and `count` is an integer multiple of @a tile unless
  * `i + count == range`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * functor while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each interval of the given range.
  * @param range       the number of items on the 1D grid to process.
  * @param tile        the preferred multiple number of items on the 1D grid to
  *     process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_1d_tile_1d_dynamic(pthreadpool_t threadpool,
                                                        const T& functor,
                                                        size_t range,
                                                        size_t tile,
                                                        uint32_t flags = 0) {
   // Hand the functor's address over as the opaque context pointer, together
   // with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_1d_tile_1d_dynamic(
       threadpool,
       &libpthreadpool::detail::call_wrapper_1d_tile_1d_dynamic<const T>,
       context, range, tile, flags);
 }

 /**
  * Process items on a 2D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       functor(i, j);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each item.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_2d(pthreadpool_t threadpool,
                                        const T& functor, size_t range_i,
                                        size_t range_j, uint32_t flags = 0) {
   // Type-erase the functor: forward its address as the `void*` context
   // argument, paired with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_2d(
       threadpool, &libpthreadpool::detail::call_wrapper_2d<const T>, context,
       range_i, range_j, flags);
 }

 /**
  * Process items on a 2D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       functor(i, j, min(range_j - j, tile_j));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param tile_j      the maximum number of items along the second dimension of
  *    the 2D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_2d_tile_1d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t tile_j,
                                                uint32_t flags = 0) {
   // The functor travels as an untyped context pointer; constness is cast
   // away only so that its address fits the `void*` argument.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_2d_tile_1d(
       threadpool, &libpthreadpool::detail::call_wrapper_2d_tile_1d<const T>,
       context, range_i, range_j, tile_j, flags);
 }

 /**
  * Process items on a 2D grid with specified preferred tile size along the
  * last grid dimension.
  *
  * The function repeatedly calls
  *
  *   functor(i, j, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range
  * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an
  * integer multiple of @a tile_j unless `j + count_j == range_j`.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * functor while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each interval of the given range.
  * @param range_i       the number of items on the first dimension of the 2D
  *     grid to process.
  * @param range_j       the number of items on the second dimension of the 2D
  *     grid to process.
  * @param tile_j        the preferred multiple number of items on the second
  *     dimension of the 2D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_2d_tile_1d_dynamic(
     pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j,
     size_t tile_j, uint32_t flags = 0) {
   // Hand the functor's address over as the opaque context pointer, together
   // with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_2d_tile_1d_dynamic(
       threadpool,
       &libpthreadpool::detail::call_wrapper_2d_tile_1d_dynamic<const T>,
       context, range_i, range_j, tile_j, flags);
 }

 /**
  * Process items on a 2D grid with the specified maximum tile size along each
  * grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i += tile_i)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       functor(i, j,
  *         min(range_i - i, tile_i), min(range_j - j, tile_j));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 2D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 2D grid.
  * @param tile_i      the maximum number of items along the first dimension of
  *    the 2D grid to process in one functor call.
  * @param tile_j      the maximum number of items along the second dimension of
  *    the 2D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_2d_tile_2d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t tile_i,
                                                size_t tile_j,
                                                uint32_t flags = 0) {
   // Type-erase the functor: forward its address as the `void*` context
   // argument, paired with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_2d_tile_2d(
       threadpool, &libpthreadpool::detail::call_wrapper_2d_tile_2d<const T>,
       context, range_i, range_j, tile_i, tile_j, flags);
 }

 /**
  * Process items on a 2D grid with specified preferred tile size along each
  * grid dimension.
  *
  * The function repeatedly calls
  *
  *   functor(i, j, count_i, count_j)
  *
  * in parallel where `i` is in the range `[0, range_i)` and a multiple of the
  * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the
  * provided @a tile_j, and `count_i` and `count_j` are integer multiples of
  * @a tile_i and @a tile_j, unless `i + count_i == range_i` or
  * `j + count_j == range_j`, respectively.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * functor while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each interval of the given range.
  * @param range_i       the number of items on the first dimension of the 2D
  *     grid to process.
  * @param range_j       the number of items on the second dimension of the 2D
  *     grid to process.
  * @param tile_i        the preferred multiple number of items on the first
  *     dimension of the 2D grid to process in each function call.
  * @param tile_j        the preferred multiple number of items on the second
  *     dimension of the 2D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_2d_tile_2d_dynamic(
     pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j,
     size_t tile_i, size_t tile_j, uint32_t flags = 0) {
   // The functor travels as an untyped context pointer; constness is cast
   // away only so that its address fits the `void*` argument.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_2d_tile_2d_dynamic(
       threadpool,
       &libpthreadpool::detail::call_wrapper_2d_tile_2d_dynamic<const T>,
       context, range_i, range_j, tile_i, tile_j, flags);
 }

 /**
  * Process items on a 3D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         functor(i, j, k);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each item.
  * @param range_i     the number of items to process along the first dimension
  *    of the 3D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 3D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 3D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_3d(pthreadpool_t threadpool,
                                        const T& functor, size_t range_i,
                                        size_t range_j, size_t range_k,
                                        uint32_t flags = 0) {
   // Hand the functor's address over as the opaque context pointer, together
   // with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_3d(
       threadpool, &libpthreadpool::detail::call_wrapper_3d<const T>, context,
       range_i, range_j, range_k, flags);
 }

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         functor(i, j, k, min(range_k - k, tile_k));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 3D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 3D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 3D grid.
  * @param tile_k      the maximum number of items along the third dimension of
  *    the 3D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_3d_tile_1d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t range_k,
                                                size_t tile_k,
                                                uint32_t flags = 0) {
   // Type-erase the functor: forward its address as the `void*` context
   // argument, paired with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_3d_tile_1d(
       threadpool, &libpthreadpool::detail::call_wrapper_3d_tile_1d<const T>,
       context, range_i, range_j, range_k, tile_k, flags);
 }

 /**
  * Process items on a 3D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j += tile_j)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         functor(i, j, k,
  *           min(range_j - j, tile_j), min(range_k - k, tile_k));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 3D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 3D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 3D grid.
  * @param tile_j      the maximum number of items along the second dimension of
  *    the 3D grid to process in one functor call.
  * @param tile_k      the maximum number of items along the third dimension of
  *    the 3D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_3d_tile_2d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t range_k,
                                                size_t tile_j, size_t tile_k,
                                                uint32_t flags = 0) {
   // The functor travels as an untyped context pointer; constness is cast
   // away only so that its address fits the `void*` argument.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_3d_tile_2d(
       threadpool, &libpthreadpool::detail::call_wrapper_3d_tile_2d<const T>,
       context, range_i, range_j, range_k, tile_j, tile_k, flags);
 }

 /**
  * Process items on a 3D grid with specified preferred tile size along the
  * last two grid dimensions.
  *
  * The function repeatedly calls
  *
  *   functor(i, j, k, count_j, count_k)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)` and a multiple of the provided @a
  *    tile_j,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `count_j` and `count_k` are integer multiples of @a tile_j and @a tile_k,
  *    unless `j + count_j == range_j` or `k + count_k == range_k`,
  *    respectively.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * functor while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each interval of the given range.
  * @param range_i       the number of items on the first dimension of the 3D
  *     grid to process.
  * @param range_j       the number of items on the second dimension of the 3D
  *     grid to process.
  * @param range_k       the number of items on the third dimension of the 3D
  *     grid to process.
  * @param tile_j        the preferred multiple number of items on the second
  *     dimension of the 3D grid to process in each function call.
  * @param tile_k        the preferred multiple number of items on the third
  *     dimension of the 3D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_3d_tile_2d_dynamic(
     pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j,
     size_t range_k, size_t tile_j, size_t tile_k, uint32_t flags = 0) {
   // Hand the functor's address over as the opaque context pointer, together
   // with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_3d_tile_2d_dynamic(
       threadpool,
       &libpthreadpool::detail::call_wrapper_3d_tile_2d_dynamic<const T>,
       context, range_i, range_j, range_k, tile_j, tile_k, flags);
 }

 /**
  * Process items on a 4D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           functor(i, j, k, l);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each item.
  * @param range_i     the number of items to process along the first dimension
  *    of the 4D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 4D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 4D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 4D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_4d(pthreadpool_t threadpool,
                                        const T& functor, size_t range_i,
                                        size_t range_j, size_t range_k,
                                        size_t range_l, uint32_t flags = 0) {
   // Type-erase the functor: forward its address as the `void*` context
   // argument, paired with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_4d(
       threadpool, &libpthreadpool::detail::call_wrapper_4d<const T>, context,
       range_i, range_j, range_k, range_l, flags);
 }

 /**
  * Process items on a 4D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l += tile_l)
  *           functor(i, j, k, l, min(range_l - l, tile_l));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 4D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 4D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 4D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 4D grid.
  * @param tile_l      the maximum number of items along the fourth dimension of
  *    the 4D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_4d_tile_1d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t range_k,
                                                size_t range_l, size_t tile_l,
                                                uint32_t flags = 0) {
   // The functor travels as an untyped context pointer; constness is cast
   // away only so that its address fits the `void*` argument.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_4d_tile_1d(
       threadpool, &libpthreadpool::detail::call_wrapper_4d_tile_1d<const T>,
       context, range_i, range_j, range_k, range_l, tile_l, flags);
 }

 /**
  * Process items on a 4D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k += tile_k)
  *         for (size_t l = 0; l < range_l; l += tile_l)
  *           functor(i, j, k, l,
  *             min(range_k - k, tile_k), min(range_l - l, tile_l));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 4D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 4D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 4D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 4D grid.
  * @param tile_k      the maximum number of items along the third dimension of
  *    the 4D grid to process in one functor call.
  * @param tile_l      the maximum number of items along the fourth dimension of
  *    the 4D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_4d_tile_2d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t range_k,
                                                size_t range_l, size_t tile_k,
                                                size_t tile_l,
                                                uint32_t flags = 0) {
   // Hand the functor's address over as the opaque context pointer, together
   // with the call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_4d_tile_2d(
       threadpool, &libpthreadpool::detail::call_wrapper_4d_tile_2d<const T>,
       context, range_i, range_j, range_k, range_l, tile_k, tile_l, flags);
 }

 /**
  * Process items on a 4D grid with specified preferred tile size along the
  * last two grid dimensions.
  *
  * The function repeatedly calls
  *
  *   functor(i, j, k, l, count_k, count_l)
  *
  * in parallel where:
  *  - `i` is in the range `[0, range_i)`,
  *  - `j` is in the range `[0, range_j)`,
  *  - `k` is in the range `[0, range_k)` and a multiple of the provided @a
  *    tile_k,
  *  - `l` is in the range `[0, range_l)` and a multiple of the provided @a
  *    tile_l,
  *  - `count_k` and `count_l` are integer multiples of @a tile_k and @a tile_l,
  *    unless `k + count_k == range_k` or `l + count_l == range_l`,
  *    respectively.
  *
  * The `count`s are chosen so as to minimize the number of calls to @a
  * functor while keeping the computation load balanced across all threads.
  *
  * When the call returns, all items have been processed and the thread pool is
  * ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool,
  *    the calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *                    is NULL, all items are processed serially on the calling
  *                    thread.
  * @param functor     the functor to call for each interval of the given range.
  * @param range_i     the number of items on the first dimension of the 4D
  *                    grid to process.
  * @param range_j     the number of items on the second dimension of the 4D
  *                    grid to process.
  * @param range_k     the number of items on the third dimension of the 4D
  *                    grid to process.
  * @param range_l     the number of items on the fourth dimension of the 4D
  *                    grid to process.
  * @param tile_k      the preferred multiple number of items on the third
  *                    dimension of the 4D grid to process in each function call.
  * @param tile_l      the preferred multiple number of items on the fourth
  *                    dimension of the 4D grid to process in each function call.
  * @param flags       a bitwise combination of zero or more optional flags
  *                    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
  *                    PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_4d_tile_2d_dynamic(
     pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j,
     size_t range_k, size_t range_l, size_t tile_k, size_t tile_l,
     uint32_t flags = 0) {
   // The functor's address is forwarded as the `void*` context argument,
   // paired with the 4D call wrapper instantiated for `const T`.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   // This must target the 4D entry point: the argument list carries four
   // ranges and two tile sizes, which the 3D variant's parameter list (three
   // ranges, two tile sizes) cannot accept.
   pthreadpool_parallelize_4d_tile_2d_dynamic(
       threadpool,
       &libpthreadpool::detail::call_wrapper_4d_tile_2d_dynamic<const T>,
       context, range_i, range_j, range_k, range_l, tile_k, tile_l, flags);
 }

 /**
  * Process items on a 5D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m++)
  *             functor(i, j, k, l, m);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each item.
  * @param range_i     the number of items to process along the first dimension
  *    of the 5D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 5D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 5D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 5D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 5D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_5d(pthreadpool_t threadpool,
                                        const T& functor, size_t range_i,
                                        size_t range_j, size_t range_k,
                                        size_t range_l, size_t range_m,
                                        uint32_t flags = 0) {
   // Erase the functor's type: its address becomes the context argument that
   // accompanies the call wrapper instantiated for const T.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_5d(
       threadpool, &libpthreadpool::detail::call_wrapper_5d<const T>, context,
       range_i, range_j, range_k, range_l, range_m, flags);
 }

 /**
  * Process items on a 5D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m += tile_m)
  *             functor(i, j, k, l, m, min(range_m - m, tile_m));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 5D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 5D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 5D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 5D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 5D grid.
  * @param tile_m      the maximum number of items along the fifth dimension of
  *    the 5D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_5d_tile_1d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t range_k,
                                                size_t range_l, size_t range_m,
                                                size_t tile_m,
                                                uint32_t flags = 0) {
   // Erase the functor's type: its address becomes the context argument that
   // accompanies the call wrapper instantiated for const T.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_5d_tile_1d(
       threadpool, &libpthreadpool::detail::call_wrapper_5d_tile_1d<const T>,
       context, range_i, range_j, range_k, range_l, range_m, tile_m, flags);
 }

 /**
  * Process items on a 5D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l += tile_l)
  *           for (size_t m = 0; m < range_m; m += tile_m)
  *             functor(i, j, k, l, m,
  *               min(range_l - l, tile_l), min(range_m - m, tile_m));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 5D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 5D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 5D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 5D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 5D grid.
  * @param tile_l      the maximum number of items along the fourth dimension of
  *    the 5D grid to process in one functor call.
  * @param tile_m      the maximum number of items along the fifth dimension of
  *    the 5D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_5d_tile_2d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t range_k,
                                                size_t range_l, size_t range_m,
                                                size_t tile_l, size_t tile_m,
                                                uint32_t flags = 0) {
   // Erase the functor's type: its address becomes the context argument that
   // accompanies the call wrapper instantiated for const T.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_5d_tile_2d(
       threadpool, &libpthreadpool::detail::call_wrapper_5d_tile_2d<const T>,
       context, range_i, range_j, range_k, range_l, range_m, tile_l, tile_m,
       flags);
 }

 /**
  * Process items on a 6D grid.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m++)
  *             for (size_t n = 0; n < range_n; n++)
  *               functor(i, j, k, l, m, n);
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each item.
  * @param range_i     the number of items to process along the first dimension
  *    of the 6D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 6D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 6D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 6D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 6D grid.
  * @param range_n     the number of items to process along the sixth dimension
  *    of the 6D grid.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_6d(pthreadpool_t threadpool,
                                        const T& functor, size_t range_i,
                                        size_t range_j, size_t range_k,
                                        size_t range_l, size_t range_m,
                                        size_t range_n, uint32_t flags = 0) {
   // Erase the functor's type: its address becomes the context argument that
   // accompanies the call wrapper instantiated for const T.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_6d(
       threadpool, &libpthreadpool::detail::call_wrapper_6d<const T>, context,
       range_i, range_j, range_k, range_l, range_m, range_n, flags);
 }

 /**
  * Process items on a 6D grid with the specified maximum tile size along the
  * last grid dimension.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m++)
  *             for (size_t n = 0; n < range_n; n += tile_n)
  *               functor(i, j, k, l, m, n, min(range_n - n, tile_n));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 6D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 6D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 6D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 6D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 6D grid.
  * @param range_n     the number of items to process along the sixth dimension
  *    of the 6D grid.
  * @param tile_n      the maximum number of items along the sixth dimension of
  *    the 6D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_6d_tile_1d(pthreadpool_t threadpool,
                                                const T& functor, size_t range_i,
                                                size_t range_j, size_t range_k,
                                                size_t range_l, size_t range_m,
                                                size_t range_n, size_t tile_n,
                                                uint32_t flags = 0) {
   // Erase the functor's type: its address becomes the context argument that
   // accompanies the call wrapper instantiated for const T.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_6d_tile_1d(
       threadpool, &libpthreadpool::detail::call_wrapper_6d_tile_1d<const T>,
       context, range_i, range_j, range_k, range_l, range_m, range_n, tile_n,
       flags);
 }

 /**
  * Process items on a 6D grid with the specified maximum tile size along the
  * last two grid dimensions.
  *
  * The function implements a parallel version of the following snippet:
  *
  *   for (size_t i = 0; i < range_i; i++)
  *     for (size_t j = 0; j < range_j; j++)
  *       for (size_t k = 0; k < range_k; k++)
  *         for (size_t l = 0; l < range_l; l++)
  *           for (size_t m = 0; m < range_m; m += tile_m)
  *             for (size_t n = 0; n < range_n; n += tile_n)
  *               functor(i, j, k, l, m, n,
  *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
  *
  * When the function returns, all items have been processed and the thread pool
  * is ready for a new task.
  *
  * @note If multiple threads call this function with the same thread pool, the
  *    calls are serialized.
  *
  * @param threadpool  the thread pool to use for parallelisation. If threadpool
  *    is NULL, all items are processed serially on the calling thread.
  * @param functor     the functor to call for each tile.
  * @param range_i     the number of items to process along the first dimension
  *    of the 6D grid.
  * @param range_j     the number of items to process along the second dimension
  *    of the 6D grid.
  * @param range_k     the number of items to process along the third dimension
  *    of the 6D grid.
  * @param range_l     the number of items to process along the fourth dimension
  *    of the 6D grid.
  * @param range_m     the number of items to process along the fifth dimension
  *    of the 6D grid.
  * @param range_n     the number of items to process along the sixth dimension
  *    of the 6D grid.
  * @param tile_m      the maximum number of items along the fifth dimension of
  *    the 6D grid to process in one functor call.
  * @param tile_n      the maximum number of items along the sixth dimension of
  *    the 6D grid to process in one functor call.
  * @param flags       a bitwise combination of zero or more optional flags
  *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_DONT_SPIN_WORKERS)
  */
 template <class T>
 inline void pthreadpool_parallelize_6d_tile_2d(
     pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j,
     size_t range_k, size_t range_l, size_t range_m, size_t range_n,
     size_t tile_m, size_t tile_n, uint32_t flags = 0) {
   // Erase the functor's type: its address becomes the context argument that
   // accompanies the call wrapper instantiated for const T.
   void* const context = const_cast<void*>(static_cast<const void*>(&functor));
   pthreadpool_parallelize_6d_tile_2d(
       threadpool, &libpthreadpool::detail::call_wrapper_6d_tile_2d<const T>,
       context, range_i, range_j, range_k, range_l, range_m, range_n, tile_m,
       tile_n, flags);
 }

 #endif /* __cplusplus */

 #endif /* __PTHREADPOOL_INCLUDE_PTHREADPOOL_H_ */