/*
* kmp_runtime.cpp -- KPTS runtime support library
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0
#if KMP_OS_WINDOWS
#include <process.h>
#endif
#include "tsan_annotations.h"
#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */
char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
#if OMP_50_ENABLED
"5.0 (201611)";
#elif OMP_45_ENABLED
"4.5 (201511)";
#elif OMP_40_ENABLED
"4.0 (201307)";
#else
"3.1 (201107)";
#endif
#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */
#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
/* ------------------------------------------------------------------------ */
#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif
/* Forward declarations */
void __kmp_cleanup(void);
static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
kmp_internal_control_t *new_icvs,
ident_t *loc);
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
kmp_internal_control_t *new_icvs, ident_t *loc);
#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif
static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_unregister_library(void); // called by __kmp_internal_end()
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
/* Calculate the identifier of the current thread */
/* A fast (and somewhat portable) way to get a unique identifier for the
executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
int i;
kmp_info_t **other_threads;
size_t stack_data;
char *stack_addr;
size_t stack_size;
char *stack_base;
KA_TRACE(
1000,
("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
__kmp_nth, __kmp_all_nth));
/* JPH - To handle the case where __kmpc_end(0) is called immediately before a
parallel region, this returns KMP_GTID_DNE to force serial_initialize by the
caller. KMP_GTID_DNE then had to be handled at all call sites, or else
__kmp_init_gtid had to be guaranteed, for this to work. */
if (!TCR_4(__kmp_init_gtid))
return KMP_GTID_DNE;
#ifdef KMP_TDATA_GTID
if (TCR_4(__kmp_gtid_mode) >= 3) {
KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
return __kmp_gtid;
}
#endif
if (TCR_4(__kmp_gtid_mode) >= 2) {
KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
return __kmp_gtid_get_specific();
}
KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
stack_addr = (char *)&stack_data;
other_threads = __kmp_threads;
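/* Scan the registered threads and check whether the address of a local
variable (stack_data) falls inside a thread's recorded stack window; if it
does, that slot's index is our gtid. */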
/* ATT: The code below is a source of potential bugs due to unsynchronized
access to __kmp_threads array. For example:
1. Current thread loads other_threads[i] to thr and checks it, it is
non-NULL.
2. Current thread is suspended by OS.
3. Another thread unregisters and finishes (debug versions of free()
may fill memory with something like 0xEF).
4. Current thread is resumed.
5. Current thread reads junk from *thr.
TODO: Fix it. --ln */
for (i = 0; i < __kmp_threads_capacity; i++) {
kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
if (!thr)
continue;
stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
/* stack grows down -- search through all of the active threads */
if (stack_addr <= stack_base) {
size_t stack_diff = stack_base - stack_addr;
if (stack_diff <= stack_size) {
/* The only way we can be closer than the allocated */
/* stack size is if we are running on this thread. */
KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
return i;
}
}
}
/* fall back to keyed TLS (get_specific) to try to determine our gtid */
KA_TRACE(1000,
("*** __kmp_get_global_thread_id: internal alg. failed to find "
"thread, using TLS\n"));
i = __kmp_gtid_get_specific();
/*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
/* if we haven't been assigned a gtid, then return the error code */
if (i < 0)
return i;
/* dynamically updated stack window for uber threads to avoid get_specific
call */
if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
KMP_FATAL(StackOverflow, i);
}
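/* Refine the recorded stack window: if our current stack address lies above
the recorded base, move the base up to it and grow the size by the same
amount; otherwise extend the recorded size down to the current address. */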
stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
if (stack_addr > stack_base) {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
stack_base);
} else {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
stack_base - stack_addr);
}
/* Reprint stack bounds for ubermaster since they have been refined */
if (__kmp_storage_map) {
char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
__kmp_print_storage_map_gtid(i, stack_beg, stack_end,
other_threads[i]->th.th_info.ds.ds_stacksize,
"th_%d stack (refinement)", i);
}
return i;
}
int __kmp_get_global_thread_id_reg() {
int gtid;
if (!__kmp_init_serial) {
gtid = KMP_GTID_DNE;
} else
#ifdef KMP_TDATA_GTID
if (TCR_4(__kmp_gtid_mode) >= 3) {
KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
gtid = __kmp_gtid;
} else
#endif
if (TCR_4(__kmp_gtid_mode) >= 2) {
KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
gtid = __kmp_gtid_get_specific();
} else {
KA_TRACE(1000,
("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
gtid = __kmp_get_global_thread_id();
}
/* we must be a new uber master sibling thread */
if (gtid == KMP_GTID_DNE) {
KA_TRACE(10,
("__kmp_get_global_thread_id_reg: Encountered new root thread. "
"Registering a new gtid.\n"));
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
if (!__kmp_init_serial) {
__kmp_do_serial_initialize();
gtid = __kmp_gtid_get_specific();
} else {
gtid = __kmp_register_root(FALSE);
}
__kmp_release_bootstrap_lock(&__kmp_initz_lock);
/*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
}
KMP_DEBUG_ASSERT(gtid >= 0);
return gtid;
}
/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
int f;
char *stack_beg = NULL;
char *stack_end = NULL;
int gtid;
KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
if (__kmp_storage_map) {
stack_end = (char *)th->th.th_info.ds.ds_stackbase;
stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
gtid = __kmp_gtid_from_thread(th);
if (gtid == KMP_GTID_MONITOR) {
__kmp_print_storage_map_gtid(
gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
"th_%s stack (%s)", "mon",
(th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
} else {
__kmp_print_storage_map_gtid(
gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
"th_%d stack (%s)", gtid,
(th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
}
}
/* No point in checking ubermaster threads since they use refinement and
* cannot overlap */
gtid = __kmp_gtid_from_thread(th);
if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
KA_TRACE(10,
("__kmp_check_stack_overlap: performing extensive checking\n"));
if (stack_beg == NULL) {
stack_end = (char *)th->th.th_info.ds.ds_stackbase;
stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
}
for (f = 0; f < __kmp_threads_capacity; f++) {
kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
if (f_th && f_th != th) {
char *other_stack_end =
(char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
char *other_stack_beg =
other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
(stack_end > other_stack_beg && stack_end < other_stack_end)) {
/* Print the other stack values before the abort */
if (__kmp_storage_map)
__kmp_print_storage_map_gtid(
-1, other_stack_beg, other_stack_end,
(size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
"th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
__kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
__kmp_msg_null);
}
}
}
}
KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}
/* ------------------------------------------------------------------------ */
void __kmp_infinite_loop(void) {
static int done = FALSE;
while (!done) {
KMP_YIELD(TRUE);
}
}
#define MAX_MESSAGE 512
void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
char const *format, ...) {
char buffer[MAX_MESSAGE];
va_list ap;
va_start(ap, format);
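// The caller-supplied format is embedded into the message prefix here; the
// caller's varargs are then applied to the combined string by __kmp_vprintf.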
KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
p2, (unsigned long)size, format);
__kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
__kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
int node;
if (gtid >= 0) {
if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
if (__kmp_storage_map_verbose) {
node = __kmp_get_host_node(p1);
if (node < 0) /* doesn't work, so don't try this next time */
__kmp_storage_map_verbose = FALSE;
else {
char *last;
int lastNode;
int localProc = __kmp_get_cpu_from_gtid(gtid);
const int page_size = KMP_GET_PAGE_SIZE();
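// Round p1 down to the start of its page, and p2 down to the start of the
// last page the range touches, so the node lookups below are per page.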
p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
if (localProc >= 0)
__kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
localProc >> 1);
else
__kmp_printf_no_lock(" GTID %d\n", gtid);
#if KMP_USE_PRCTL
/* The more elaborate format is disabled for now because of the prctl
* hanging bug. */
do {
last = p1;
lastNode = node;
/* This loop collates adjacent pages with the same host node. */
do {
p1 = (void *)((char *)p1 + page_size);
} while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
__kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
lastNode);
} while (p1 <= p2);
#else
__kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
(char *)p1 + (page_size - 1),
__kmp_get_host_node(p1));
if (p1 < p2) {
__kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
(char *)p2 + (page_size - 1),
__kmp_get_host_node(p2));
}
#endif
}
}
} else
__kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
}
#endif /* KMP_PRINT_DATA_PLACEMENT */
__kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}
void __kmp_warn(char const *format, ...) {
char buffer[MAX_MESSAGE];
va_list ap;
if (__kmp_generate_warnings == kmp_warnings_off) {
return;
}
va_start(ap, format);
KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
__kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
__kmp_vprintf(kmp_err, buffer, ap);
__kmp_release_bootstrap_lock(&__kmp_stdio_lock);
va_end(ap);
}
void __kmp_abort_process() {
// Later threads may stall here, but that's ok because abort() will kill them.
__kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
if (__kmp_debug_buf) {
__kmp_dump_debug_buffer();
}
if (KMP_OS_WINDOWS) {
// Let other threads know of abnormal termination and prevent deadlock
// if abort happened during library initialization or shutdown
__kmp_global.g.g_abort = SIGABRT;
/* On Windows* OS by default abort() causes pop-up error box, which stalls
nightly testing. Unfortunately, we cannot reliably suppress pop-up error
boxes. _set_abort_behavior() works well, but this function is not
available in VS7 (this is not a problem for the DLL, but it is a problem for
the static OpenMP RTL). SetErrorMode (and so the timelimit utility) does not
help, at least in some versions of the MS C RTL.
It seems the following sequence is the only way to simulate abort() and
avoid pop-up error box. */
raise(SIGABRT);
_exit(3); // Just in case, if signal ignored, exit anyway.
} else {
abort();
}
__kmp_infinite_loop();
__kmp_release_bootstrap_lock(&__kmp_exit_lock);
} // __kmp_abort_process
void __kmp_abort_thread(void) {
// TODO: Eliminate g_abort global variable and this function.
// In case of abort just call abort(), it will kill all the threads.
__kmp_infinite_loop();
} // __kmp_abort_thread
/* Print out the storage map for the major kmp_info_t thread data structures
that are allocated together. */
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
__kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
gtid);
__kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
sizeof(kmp_desc_t), "th_%d.th_info", gtid);
__kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
sizeof(kmp_local_t), "th_%d.th_local", gtid);
__kmp_print_storage_map_gtid(
gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
__kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
&thr->th.th_bar[bs_plain_barrier + 1],
sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
gtid);
__kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
&thr->th.th_bar[bs_forkjoin_barrier + 1],
sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
gtid);
#if KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
&thr->th.th_bar[bs_reduction_barrier + 1],
sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}
/* Print out the storage map for the major kmp_team_t team data structures
that are allocated together. */
static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
int team_id, int num_thr) {
int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
__kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
header, team_id);
__kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
&team->t.t_bar[bs_last_barrier],
sizeof(kmp_balign_team_t) * bs_last_barrier,
"%s_%d.t_bar", header, team_id);
__kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
&team->t.t_bar[bs_plain_barrier + 1],
sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
header, team_id);
__kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
&team->t.t_bar[bs_forkjoin_barrier + 1],
sizeof(kmp_balign_team_t),
"%s_%d.t_bar[forkjoin]", header, team_id);
#if KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
&team->t.t_bar[bs_reduction_barrier + 1],
sizeof(kmp_balign_team_t),
"%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid(
-1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
__kmp_print_storage_map_gtid(
-1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
__kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
&team->t.t_disp_buffer[num_disp_buff],
sizeof(dispatch_shared_info_t) * num_disp_buff,
"%s_%d.t_disp_buffer", header, team_id);
}
static void __kmp_init_allocator() {
#if OMP_50_ENABLED
__kmp_init_memkind();
#endif
}
static void __kmp_fini_allocator() {
#if OMP_50_ENABLED
__kmp_fini_memkind();
#endif
}
/* ------------------------------------------------------------------------ */
#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS
static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
// TODO: Change to __kmp_break_bootstrap_lock().
__kmp_init_bootstrap_lock(lck); // make the lock released
}
static void __kmp_reset_locks_on_process_detach(int gtid_req) {
int i;
int thread_count;
// PROCESS_DETACH is expected to be called by a thread that executes
// ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
// the one calling ProcessExit or FreeLibrary), so it might seem safe to access
// __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
// threads may still be alive here, although they are about to be terminated.
// The threads in the array with ds_thread==0 are the most suspicious, so
// accessing __kmp_threads[] may not be safe after all.
// TODO: does it make sense to check __kmp_roots[] ?
// Let's check that there are no other live threads registered with the OMP
// library.
while (1) {
thread_count = 0;
for (i = 0; i < __kmp_threads_capacity; ++i) {
if (!__kmp_threads)
continue;
kmp_info_t *th = __kmp_threads[i];
if (th == NULL)
continue;
int gtid = th->th.th_info.ds.ds_gtid;
if (gtid == gtid_req)
continue;
if (gtid < 0)
continue;
DWORD exit_val;
int alive = __kmp_is_thread_alive(th, &exit_val);
if (alive) {
++thread_count;
}
}
if (thread_count == 0)
break; // success
}
// Assume that I'm alone. Now it might be safe to check and reset locks.
// __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
__kmp_reset_lock(&__kmp_forkjoin_lock);
#ifdef KMP_DEBUG
__kmp_reset_lock(&__kmp_stdio_lock);
#endif // KMP_DEBUG
}
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
//__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
switch (fdwReason) {
case DLL_PROCESS_ATTACH:
KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
return TRUE;
case DLL_PROCESS_DETACH:
KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
if (lpReserved != NULL) {
// lpReserved is used for telling the difference:
// lpReserved == NULL when FreeLibrary() was called,
// lpReserved != NULL when the process terminates.
// When FreeLibrary() is called, worker threads remain alive. So they will
// release the forkjoin lock by themselves. When the process terminates,
// worker threads disappear triggering the problem of unreleased forkjoin
// lock as described below.
// A worker thread can take the forkjoin lock. The problem comes up if
// that worker thread becomes dead before it releases the forkjoin lock.
// The forkjoin lock remains taken, while the thread executing
// DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
// to take the forkjoin lock and will always fail, so that the application
// will never finish [normally]. This scenario is possible if
// __kmpc_end() has not been executed. This is not a corner case; it happens
// in common situations:
// - the main function was compiled by an alternative compiler;
// - the main function was compiled by icl but without /Qopenmp
// (application with plugins);
// - application terminates by calling C exit(), Fortran CALL EXIT() or
// Fortran STOP.
// - alive foreign thread prevented __kmpc_end from doing cleanup.
//
// This is a hack to work around the problem.
// TODO: !!! figure out something better.
__kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
}
__kmp_internal_end_library(__kmp_gtid_get_specific());
return TRUE;
case DLL_THREAD_ATTACH:
KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
/* if we want to register new siblings all the time here call
* __kmp_get_gtid(); */
return TRUE;
case DLL_THREAD_DETACH:
KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
__kmp_internal_end_thread(__kmp_gtid_get_specific());
return TRUE;
}
return TRUE;
}
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */
/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */
if (__kmp_env_consistency_check) {
if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
__kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
__kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
}
#ifdef BUILD_PARALLEL_ORDERED
if (!team->t.t_serialized) {
KMP_MB();
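/* spin until the team's ordered counter equals our tid, i.e. it is our turn */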
KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
NULL);
KMP_MB();
}
#endif /* BUILD_PARALLEL_ORDERED */
}
/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
int tid = __kmp_tid_from_gtid(gtid);
kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */
if (__kmp_env_consistency_check) {
if (__kmp_threads[gtid]->th.th_root->r.r_active)
__kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
}
#ifdef BUILD_PARALLEL_ORDERED
if (!team->t.t_serialized) {
KMP_MB(); /* Flush all pending memory write invalidates. */
/* use the tid of the next thread in this team */
/* TODO replace with general release procedure */
team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
KMP_MB(); /* Flush all pending memory write invalidates. */
}
#endif /* BUILD_PARALLEL_ORDERED */
}
/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
int status;
kmp_info_t *th;
kmp_team_t *team;
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
#if OMP_50_ENABLED
__kmp_resume_if_soft_paused();
#endif
th = __kmp_threads[gtid];
team = th->th.th_team;
status = 0;
th->th.th_ident = id_ref;
if (team->t.t_serialized) {
status = 1;
} else {
kmp_int32 old_this = th->th.th_local.this_construct;
++th->th.th_local.this_construct;
/* try to set team count to thread count--success means thread got the
single block */
/* TODO: Should this be acquire or release? */
if (team->t.t_construct == old_this) {
status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
th->th.th_local.this_construct);
}
#if USE_ITT_BUILD
if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
th->th.th_teams_microtask == NULL &&
#endif
team->t.t_active_level ==
1) { // Only report metadata by master of active team at level 1
__kmp_itt_metadata_single(id_ref);
}
#endif /* USE_ITT_BUILD */
}
if (__kmp_env_consistency_check) {
if (status && push_ws) {
__kmp_push_workshare(gtid, ct_psingle, id_ref);
} else {
__kmp_check_workshare(gtid, ct_psingle, id_ref);
}
}
#if USE_ITT_BUILD
if (status) {
__kmp_itt_single_start(gtid);
}
#endif /* USE_ITT_BUILD */
return status;
}
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
__kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
if (__kmp_env_consistency_check)
__kmp_pop_workshare(gtid, ct_psingle, NULL);
}
/* Determine whether we can go parallel or must use a serialized parallel
* region, and how many threads we can use.
* set_nthreads is the number of threads requested for the team.
* Returns 1 if we should serialize or only use one thread,
* otherwise the number of threads to use.
* The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
int master_tid, int set_nthreads
#if OMP_40_ENABLED
,
int enter_teams
#endif /* OMP_40_ENABLED */
) {
int capacity;
int new_nthreads;
KMP_DEBUG_ASSERT(__kmp_init_serial);
KMP_DEBUG_ASSERT(root && parent_team);
kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
// If dyn-var is set, dynamically adjust the number of desired threads,
// according to the method specified by dynamic_mode.
new_nthreads = set_nthreads;
if (!get__dynamic_2(parent_team, master_tid)) {
;
}
#ifdef USE_LOAD_BALANCE
else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
if (new_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
"reservation to 1 thread\n",
master_tid));
return 1;
}
if (new_nthreads < set_nthreads) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
"reservation to %d threads\n",
master_tid, new_nthreads));
}
}
#endif /* USE_LOAD_BALANCE */
else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
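// Limit ourselves to the processors not already occupied by other running
// threads, plus the threads this team will re-use (just the master if the
// root is active, otherwise the whole hot team).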
new_nthreads = __kmp_avail_proc - __kmp_nth +
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
if (new_nthreads <= 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
"reservation to 1 thread\n",
master_tid));
return 1;
}
if (new_nthreads < set_nthreads) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
"reservation to %d threads\n",
master_tid, new_nthreads));
} else {
new_nthreads = set_nthreads;
}
} else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
if (set_nthreads > 2) {
new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
new_nthreads = (new_nthreads % set_nthreads) + 1;
if (new_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
"reservation to 1 thread\n",
master_tid));
return 1;
}
if (new_nthreads < set_nthreads) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
"reservation to %d threads\n",
master_tid, new_nthreads));
}
}
} else {
KMP_ASSERT(0);
}
// Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
if (__kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
__kmp_max_nth) {
int tl_nthreads = __kmp_max_nth - __kmp_nth +
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
if (tl_nthreads <= 0) {
tl_nthreads = 1;
}
// If dyn-var is false, emit a 1-time warning.
if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
__kmp_reserve_warn = 1;
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
}
if (tl_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
"reduced reservation to 1 thread\n",
master_tid));
return 1;
}
KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
"reservation to %d threads\n",
master_tid, tl_nthreads));
new_nthreads = tl_nthreads;
}
// Respect OMP_THREAD_LIMIT
int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
if (cg_nthreads + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
max_cg_threads) {
int tl_nthreads = max_cg_threads - cg_nthreads +
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
if (tl_nthreads <= 0) {
tl_nthreads = 1;
}
// If dyn-var is false, emit a 1-time warning.
if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
__kmp_reserve_warn = 1;
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
}
if (tl_nthreads == 1) {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
"reduced reservation to 1 thread\n",
master_tid));
return 1;
}
KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
"reservation to %d threads\n",
master_tid, tl_nthreads));
new_nthreads = tl_nthreads;
}
// Check if the threads array is large enough, or needs expanding.
// See comment in __kmp_register_root() about the adjustment if
// __kmp_threads[0] == NULL.
capacity = __kmp_threads_capacity;
if (TCR_PTR(__kmp_threads[0]) == NULL) {
--capacity;
}
if (__kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
capacity) {
// Expand the threads array.
int slotsRequired = __kmp_nth + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
capacity;
int slotsAdded = __kmp_expand_threads(slotsRequired);
if (slotsAdded < slotsRequired) {
// The threads array was not expanded enough.
new_nthreads -= (slotsRequired - slotsAdded);
KMP_ASSERT(new_nthreads >= 1);
// If dyn-var is false, emit a 1-time warning.
if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
__kmp_reserve_warn = 1;
if (__kmp_tp_cached) {
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
} else {
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
}
}
}
}
#ifdef KMP_DEBUG
if (new_nthreads == 1) {
KC_TRACE(10,
("__kmp_reserve_threads: T#%d serializing team after reclaiming "
"dead roots and rechecking; requested %d threads\n",
__kmp_get_gtid(), set_nthreads));
} else {
KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
" %d threads\n",
__kmp_get_gtid(), new_nthreads, set_nthreads));
}
#endif // KMP_DEBUG
return new_nthreads;
}
/* Allocate threads from the thread pool and assign them to the new team. We are
assured that there are enough threads available, because we checked earlier
inside the forkjoin critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
kmp_info_t *master_th, int master_gtid) {
int i;
int use_hot_team;
KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
KMP_MB();
/* first, let's setup the master thread */
master_th->th.th_info.ds.ds_tid = 0;
master_th->th.th_team = team;
master_th->th.th_team_nproc = team->t.t_nproc;
master_th->th.th_team_master = master_th;
master_th->th.th_team_serialized = FALSE;
master_th->th.th_dispatch = &team->t.t_dispatch[0];
/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
use_hot_team = 0;
kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
if (hot_teams) { // hot teams array is not allocated if
// KMP_HOT_TEAMS_MAX_LEVEL=0
int level = team->t.t_active_level - 1; // index in array of hot teams
if (master_th->th.th_teams_microtask) { // are we inside the teams?
if (master_th->th.th_teams_size.nteams > 1) {
++level; // level was not increased in teams construct for
// team_of_masters
}
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
master_th->th.th_teams_level == team->t.t_level) {
++level; // level was not increased in teams construct for
// team_of_workers before the parallel
} // team->t.t_level will be increased inside parallel
}
if (level < __kmp_hot_teams_max_level) {
if (hot_teams[level].hot_team) {
// hot team has already been allocated for given level
KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
use_hot_team = 1; // the team is ready to use
} else {
use_hot_team = 0; // AC: threads are not allocated yet
hot_teams[level].hot_team = team; // remember new hot team
hot_teams[level].hot_team_nth = team->t.t_nproc;
}
} else {
use_hot_team = 0;
}
}
#else
use_hot_team = team == root->r.r_hot_team;
#endif
if (!use_hot_team) {
/* install the master thread */
team->t.t_threads[0] = master_th;
__kmp_initialize_info(master_th, team, 0, master_gtid);
/* now, install the worker threads */
for (i = 1; i < team->t.t_nproc; i++) {
/* fork or reallocate a new thread and install it in team */
kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
team->t.t_threads[i] = thr;
KMP_DEBUG_ASSERT(thr);
KMP_DEBUG_ASSERT(thr->th.th_team == team);
/* align team and thread arrived states */
KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
"T#%d(%d:%d) join =%llu, plain=%llu\n",
__kmp_gtid_from_tid(0, team), team->t.t_id, 0,
__kmp_gtid_from_tid(i, team), team->t.t_id, i,
team->t.t_bar[bs_forkjoin_barrier].b_arrived,
team->t.t_bar[bs_plain_barrier].b_arrived));
#if OMP_40_ENABLED
thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
thr->th.th_teams_level = master_th->th.th_teams_level;
thr->th.th_teams_size = master_th->th.th_teams_size;
#endif
{ // Initialize threads' barrier data.
int b;
kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
for (b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
}
}
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
__kmp_partition_places(team);
#endif
}
#if OMP_50_ENABLED
if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
for (i = 0; i < team->t.t_nproc; i++) {
kmp_info_t *thr = team->t.t_threads[i];
if (thr->th.th_prev_num_threads != team->t.t_nproc ||
thr->th.th_prev_level != team->t.t_level) {
team->t.t_display_affinity = 1;
break;
}
}
}
#endif
KMP_MB();
}
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
if (__kmp_inherit_fp_control) {
kmp_int16 x87_fpu_control_word;
kmp_uint32 mxcsr;
// Get master values of FPU control flags (both X87 and vector)
__kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
__kmp_store_mxcsr(&mxcsr);
mxcsr &= KMP_X86_MXCSR_MASK;
// There is no point looking at t_fp_control_saved here.
// If it is TRUE, we still have to update the values if they are different
// from those we now have. If it is FALSE we didn't save anything yet, but
// our objective is the same. We have to ensure that the values in the team
// are the same as those we have.
// So, this code achieves what we need whether or not t_fp_control_saved is
// true. By checking whether the value needs updating we avoid unnecessary
// writes that would put the cache-line into a written state, causing all
// threads in the team to have to read it again.
KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
// Although we don't use this value, other code in the runtime wants to know
// whether it should restore them. So we must ensure it is correct.
KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
} else {
// Similarly here. Don't write to this cache-line in the team structure
// unless we have to.
KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
}
}
// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
// Only reset the fp control regs if they have been changed in the team during
// the parallel region that we are exiting.
kmp_int16 x87_fpu_control_word;
kmp_uint32 mxcsr;
__kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
__kmp_store_mxcsr(&mxcsr);
mxcsr &= KMP_X86_MXCSR_MASK;
if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
__kmp_clear_x87_fpu_status_word();
__kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
}
if (team->t.t_mxcsr != mxcsr) {
__kmp_load_mxcsr(&team->t.t_mxcsr);
}
}
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
int realloc); // forward declaration
/* Run a parallel region that has been serialized, so it runs in a team
consisting only of the single master thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
kmp_info_t *this_thr;
kmp_team_t *serial_team;
KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
/* Skip all this code for autopar serialized loops since it results in
unacceptable overhead */
if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
return;
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
#if OMP_50_ENABLED
__kmp_resume_if_soft_paused();
#endif
this_thr = __kmp_threads[global_tid];
serial_team = this_thr->th.th_serial_team;
/* utilize the serialized team held by this thread */
KMP_DEBUG_ASSERT(serial_team);
KMP_MB();
if (__kmp_tasking_mode != tskm_immediate_exec) {
KMP_DEBUG_ASSERT(
this_thr->th.th_task_team ==
this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
NULL);
KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
"team %p, new task_team = NULL\n",
global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
this_thr->th.th_task_team = NULL;
}
#if OMP_40_ENABLED
kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else if (proc_bind == proc_bind_default) {
// No proc_bind clause was specified, so use the current value
// of proc-bind-var for this parallel region.
proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
}
// Reset for next parallel region
this_thr->th.th_set_proc_bind = proc_bind_default;
#endif /* OMP_40_ENABLED */
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
ompt_data_t *implicit_task_data;
void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
if (ompt_enabled.enabled &&
this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
ompt_task_info_t *parent_task_info;
parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
if (ompt_enabled.ompt_callback_parallel_begin) {
int team_size = 1;
ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
&(parent_task_info->task_data), &(parent_task_info->frame),
&ompt_parallel_data, team_size, ompt_parallel_invoker_program,
codeptr);
}
}
#endif // OMPT_SUPPORT
if (this_thr->th.th_team != serial_team) {
// Nested level will be an index in the nested nthreads array
int level = this_thr->th.th_team->t.t_level;
if (serial_team->t.t_serialized) {
/* this serial team was already used
TODO: increase performance by making these locks more specific */
kmp_team_t *new_team;
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
#if OMP_40_ENABLED
proc_bind,
#endif
&this_thr->th.th_current_task->td_icvs,
0 USE_NESTED_HOT_ARG(NULL));
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
KMP_ASSERT(new_team);
/* setup new serialized team and install it */
new_team->t.t_threads[0] = this_thr;
new_team->t.t_parent = this_thr->th.th_team;
serial_team = new_team;
this_thr->th.th_serial_team = serial_team;
KF_TRACE(
10,
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
global_tid, serial_team));
/* TODO the above breaks the requirement that if we run out of resources,
then we can still guarantee that serialized teams are ok, since we may
need to allocate a new one */
} else {
KF_TRACE(
10,
("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
global_tid, serial_team));
}
/* we have to initialize this serial team */
KMP_DEBUG_ASSERT(serial_team->t.t_threads);
KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
serial_team->t.t_ident = loc;
serial_team->t.t_serialized = 1;
serial_team->t.t_nproc = 1;
serial_team->t.t_parent = this_thr->th.th_team;
serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
this_thr->th.th_current_task));
KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
this_thr->th.th_current_task->td_flags.executing = 0;
__kmp_push_current_task_to_thread(this_thr, serial_team, 0);
/* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
implicit task for each serialized task represented by
team->t.t_serialized? */
copy_icvs(&this_thr->th.th_current_task->td_icvs,
&this_thr->th.th_current_task->td_parent->td_icvs);
// Thread value exists in the nested nthreads array for the next nested
// level
if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
this_thr->th.th_current_task->td_icvs.nproc =
__kmp_nested_nth.nth[level + 1];
}
#if OMP_40_ENABLED
if (__kmp_nested_proc_bind.used &&
(level + 1 < __kmp_nested_proc_bind.used)) {
this_thr->th.th_current_task->td_icvs.proc_bind =
__kmp_nested_proc_bind.bind_types[level + 1];
}
#endif /* OMP_40_ENABLED */
#if USE_DEBUGGER
serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
this_thr->th.th_info.ds.ds_tid = 0;
/* set thread cache values */
this_thr->th.th_team_nproc = 1;
this_thr->th.th_team_master = this_thr;
this_thr->th.th_team_serialized = 1;
serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
#if OMP_50_ENABLED
serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
#endif
propagateFPControl(serial_team);
/* check if we need to allocate dispatch buffers stack */
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
if (!serial_team->t.t_dispatch->th_disp_buffer) {
serial_team->t.t_dispatch->th_disp_buffer =
(dispatch_private_info_t *)__kmp_allocate(
sizeof(dispatch_private_info_t));
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
KMP_MB();
} else {
/* this serialized team is already being used,
* that's fine, just add another nested level */
KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
KMP_DEBUG_ASSERT(serial_team->t.t_threads);
KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
++serial_team->t.t_serialized;
this_thr->th.th_team_serialized = serial_team->t.t_serialized;
// Nested level will be an index in the nested nthreads array
int level = this_thr->th.th_team->t.t_level;
// Thread value exists in the nested nthreads array for the next nested
// level
if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
this_thr->th.th_current_task->td_icvs.nproc =
__kmp_nested_nth.nth[level + 1];
}
serial_team->t.t_level++;
KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
"of serial team %p to %d\n",
global_tid, serial_team, serial_team->t.t_level));
/* allocate/push dispatch buffers stack */
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
{
dispatch_private_info_t *disp_buffer =
(dispatch_private_info_t *)__kmp_allocate(
sizeof(dispatch_private_info_t));
disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
KMP_MB();
}
#if OMP_40_ENABLED
KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
#endif
#if OMP_50_ENABLED
// Perform the display affinity functionality for
// serialized parallel regions
if (__kmp_display_affinity) {
if (this_thr->th.th_prev_level != serial_team->t.t_level ||
this_thr->th.th_prev_num_threads != 1) {
// NULL means use the affinity-format-var ICV
__kmp_aux_display_affinity(global_tid, NULL);
this_thr->th.th_prev_level = serial_team->t.t_level;
this_thr->th.th_prev_num_threads = 1;
}
}
#endif
if (__kmp_env_consistency_check)
__kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
serial_team->t.ompt_team_info.master_return_address = codeptr;
if (ompt_enabled.enabled &&
this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
ompt_lw_taskteam_t lw_taskteam;
__ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
&ompt_parallel_data, codeptr);
__ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
// don't use lw_taskteam after linking. Content was swapped.
/* OMPT implicit task begin */
implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(this_thr)
->thread_num = __kmp_tid_from_gtid(global_tid);
}
/* OMPT state */
this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
}
#endif
}
/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
enum fork_context_e call_context, // Intel, GNU, ...
kmp_int32 argc, microtask_t microtask, launch_t invoker,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
va_list *ap
#else
va_list ap
#endif
) {
void **argv;
int i;
int master_tid;
int master_this_cons;
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int nthreads;
int master_active;
int master_set_numthreads;
int level;
#if OMP_40_ENABLED
int active_level;
int teams_level;
#endif
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t **p_hot_teams;
#endif
{ // KMP_TIME_BLOCK
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
/* Some systems prefer the stack for the root thread(s) to start with */
/* some gap from the parent stack to prevent false sharing. */
void *dummy = KMP_ALLOCA(__kmp_stkpadding);
/* These 2 lines below are so this does not get optimized out */
if (__kmp_stkpadding > KMP_MAX_STKPADDING)
__kmp_stkpadding += (short)((kmp_int64)dummy);
}
/* initialize if needed */
KMP_DEBUG_ASSERT(
__kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
#if OMP_50_ENABLED
__kmp_resume_if_soft_paused();
#endif
/* setup current data */
master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
// shutdown
parent_team = master_th->th.th_team;
master_tid = master_th->th.th_info.ds.ds_tid;
master_this_cons = master_th->th.th_local.this_construct;
root = master_th->th.th_root;
master_active = root->r.r_active;
master_set_numthreads = master_th->th.th_set_nproc;
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
ompt_data_t *parent_task_data;
ompt_frame_t *ompt_frame;
ompt_data_t *implicit_task_data;
void *return_address = NULL;
if (ompt_enabled.enabled) {
__ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
NULL, NULL);
return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
}
#endif
// Nested level will be an index in the nested nthreads array
level = parent_team->t.t_level;
// used to launch non-serial teams even if nested is not allowed
active_level = parent_team->t.t_active_level;
#if OMP_40_ENABLED
// needed to check nesting inside the teams
teams_level = master_th->th.th_teams_level;
#endif
#if KMP_NESTED_HOT_TEAMS
p_hot_teams = &master_th->th.th_hot_teams;
if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
*p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
(*p_hot_teams)[0].hot_team = root->r.r_hot_team;
// it is either actual or not needed (when active_level > 0)
(*p_hot_teams)[0].hot_team_nth = 1;
}
#endif
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
if (ompt_enabled.ompt_callback_parallel_begin) {
int team_size = master_set_numthreads
? master_set_numthreads
: get__nproc_2(parent_team, master_tid);
ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
OMPT_INVOKER(call_context), return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
master_th->th.th_ident = loc;
#if OMP_40_ENABLED
if (master_th->th.th_teams_microtask && ap &&
microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
// AC: This is start of parallel that is nested inside teams construct.
// The team is actual (hot), all workers are ready at the fork barrier.
// No lock needed to initialize the team a bit, then free workers.
parent_team->t.t_ident = loc;
__kmp_alloc_argv_entries(argc, parent_team, TRUE);
parent_team->t.t_argc = argc;
argv = (void **)parent_team->t.t_argv;
for (i = argc - 1; i >= 0; --i)
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
*argv++ = va_arg(*ap, void *);
#else
*argv++ = va_arg(ap, void *);
#endif
// Increment our nested depth levels, but do not increase the serialization
if (parent_team == master_th->th.th_serial_team) {
// AC: we are in serialized parallel
__kmpc_serialized_parallel(loc, gtid);
KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
// AC: need this so that enquiry functions work
// correctly; will restore at join time
parent_team->t.t_serialized--;
#if OMPT_SUPPORT
void *dummy;
void **exit_runtime_p;
ompt_lw_taskteam_t lw_taskteam;
if (ompt_enabled.enabled) {
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address);
exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
// don't use lw_taskteam after linking. Content was swapped.
/* OMPT implicit task begin */
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(master_th)
->thread_num = __kmp_tid_from_gtid(gtid);
}
/* OMPT state */
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
} else {
exit_runtime_p = &dummy;
}
#endif
{
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
,
exit_runtime_p
#endif
);
}
#if OMPT_SUPPORT
*exit_runtime_p = NULL;
if (ompt_enabled.enabled) {
OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, implicit_task_data, 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
__ompt_lw_taskteam_unlink(master_th);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
OMPT_INVOKER(call_context), return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
return TRUE;
}
parent_team->t.t_pkfn = microtask;
parent_team->t.t_invoke = invoker;
KMP_ATOMIC_INC(&root->r.r_in_parallel);
parent_team->t.t_active_level++;
parent_team->t.t_level++;
#if OMP_50_ENABLED
parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
#endif
/* Change number of threads in the team if requested */
if (master_set_numthreads) { // The parallel region has a num_threads clause
if (master_set_numthreads < master_th->th.th_teams_size.nth) {
// AC: we can only reduce the number of threads dynamically, not increase it
kmp_info_t **other_threads = parent_team->t.t_threads;
parent_team->t.t_nproc = master_set_numthreads;
for (i = 0; i < master_set_numthreads; ++i) {
other_threads[i]->th.th_team_nproc = master_set_numthreads;
}
// Keep extra threads hot in the team for possible next parallels
}
master_th->th.th_set_nproc = 0;
}
#if USE_DEBUGGER
if (__kmp_debugging) { // Let debugger override number of threads.
int nth = __kmp_omp_num_threads(loc);
if (nth > 0) { // 0 means debugger doesn't want to change num threads
master_set_numthreads = nth;
}
}
#endif
KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
root, parent_team, master_th, gtid));
__kmp_internal_fork(loc, gtid, parent_team);
KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
root, parent_team, master_th, gtid));
/* Invoke microtask for MASTER thread */
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
if (!parent_team->t.t_invoke(gtid)) {
KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
}
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
parent_team->t.t_id, parent_team->t.t_pkfn));
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
return TRUE;
} // Parallel closely nested in teams construct
#endif /* OMP_40_ENABLED */
#if KMP_DEBUG
if (__kmp_tasking_mode != tskm_immediate_exec) {
KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
parent_team->t.t_task_team[master_th->th.th_task_state]);
}
#endif
if (parent_team->t.t_active_level >=
master_th->th.th_current_task->td_icvs.max_active_levels) {
nthreads = 1;
} else {
#if OMP_40_ENABLED
int enter_teams = ((ap == NULL && active_level == 0) ||
(ap && teams_level > 0 && teams_level == level));
#endif
nthreads =
master_set_numthreads
? master_set_numthreads
: get__nproc_2(
parent_team,
master_tid); // TODO: get nproc directly from current task
// Check whether we need to take the forkjoin lock (no need for a serialized
// parallel outside of a teams construct). This code was moved here from
// __kmp_reserve_threads() to speed up nested serialized parallels.
if (nthreads > 1) {
if ((get__max_active_levels(master_th) == 1 && (root->r.r_in_parallel
#if OMP_40_ENABLED
&& !enter_teams
#endif /* OMP_40_ENABLED */
)) ||
(__kmp_library == library_serial)) {
KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
" threads\n",
gtid, nthreads));
nthreads = 1;
}
}
if (nthreads > 1) {
/* determine how many new threads we can use */
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
nthreads = __kmp_reserve_threads(
root, parent_team, master_tid, nthreads
#if OMP_40_ENABLED
/* AC: If we execute teams from a parallel region (on the host), then the
teams should be created, but each can have only 1 thread if nesting is
disabled. If teams are called from a serial region, then the teams and
their threads should be created regardless of the nesting setting. */
,
enter_teams
#endif /* OMP_40_ENABLED */
);
if (nthreads == 1) {
// Free lock for single thread execution here; for multi-thread
// execution it will be freed later after team of threads created
// and initialized
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}
}
}
KMP_DEBUG_ASSERT(nthreads > 0);
// If we temporarily changed the set number of threads then restore it now
master_th->th.th_set_nproc = 0;
/* create a serialized parallel region? */
if (nthreads == 1) {
/* josh todo: hypothetical question: what do we do for OS X*? */
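/* Stack-allocate the argument array: as a VLA where the platform/arch
combination below allows it, otherwise via KMP_ALLOCA. */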
#if KMP_OS_LINUX && \
(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
void *args[argc];
#else
void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
KMP_ARCH_AARCH64) */
KA_TRACE(20,
("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
__kmpc_serialized_parallel(loc, gtid);
if (call_context == fork_context_intel) {
/* TODO this sucks, use the compiler itself to pass args! :) */
master_th->th.th_serial_team->t.t_ident = loc;
#if OMP_40_ENABLED
if (!ap) {
// revert change made in __kmpc_serialized_parallel()
master_th->th.th_serial_team->t.t_level--;
// Get args from parent team for teams construct
#if OMPT_SUPPORT
void *dummy;
void **exit_runtime_p;
ompt_task_info_t *task_info;
ompt_lw_taskteam_t lw_taskteam;
if (ompt_enabled.enabled) {
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address);
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
// don't use lw_taskteam after linking. Content was swapped.
task_info = OMPT_CUR_TASK_INFO(master_th);
exit_runtime_p = &(task_info->frame.exit_frame.ptr);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
&(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(master_th)
->thread_num = __kmp_tid_from_gtid(gtid);
}
/* OMPT state */
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
} else {
exit_runtime_p = &dummy;
}
#endif
{
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask(microtask, gtid, 0, argc,
parent_team->t.t_argv
#if OMPT_SUPPORT
,
exit_runtime_p
#endif
);
}
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
exit_runtime_p = NULL;
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num,
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
__ompt_lw_taskteam_unlink(master_th);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
OMPT_INVOKER(call_context), return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
} else if (microtask == (microtask_t)__kmp_teams_master) {
KMP_DEBUG_ASSERT(master_th->th.th_team ==
master_th->th.th_serial_team);
team = master_th->th.th_team;
// team->t.t_pkfn = microtask;
team->t.t_invoke = invoker;
__kmp_alloc_argv_entries(argc, team, TRUE);
team->t.t_argc = argc;
argv = (void **)team->t.t_argv;
if (ap) {
for (i = argc - 1; i >= 0; --i)
// TODO: revert workaround for Intel(R) 64 tracker #96
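// (On the x86_64/ARM Linux targets below the argument-list handle is a
// pointer to the va_list -- presumably the workaround that the tracker #96
// note refers to -- so it is dereferenced before va_arg; elsewhere the
// va_list is used directly.)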
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
*argv++ = va_arg(*ap, void *);
#else
*argv++ = va_arg(ap, void *);
#endif
} else {
for (i = 0; i < argc; ++i)
// Get args from parent team for teams construct
argv[i] = parent_team->t.t_argv[i];
}
// AC: revert change made in __kmpc_serialized_parallel()
// because initial code in teams should have level=0
team->t.t_level--;
// AC: call special invoker for outer "parallel" of teams construct
invoker(gtid);
} else {
#endif /* OMP_40_ENABLED */
argv = args;
for (i = argc - 1; i >= 0; --i)
// TODO: revert workaround for Intel(R) 64 tracker #96
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
*argv++ = va_arg(*ap, void *);
#else
*argv++ = va_arg(ap, void *);
#endif
KMP_MB();
#if OMPT_SUPPORT
void *dummy;
void **exit_runtime_p;
ompt_task_info_t *task_info;
ompt_lw_taskteam_t lw_taskteam;
if (ompt_enabled.enabled) {
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address);
__ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
// don't use lw_taskteam after linking. Its contents were swapped.
task_info = OMPT_CUR_TASK_INFO(master_th);
exit_runtime_p = &(task_info->frame.exit_frame.ptr);
/* OMPT implicit task begin */
implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(master_th)
->thread_num = __kmp_tid_from_gtid(gtid);
}
/* OMPT state */
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
} else {
exit_runtime_p = &dummy;
}
#endif
{
KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
__kmp_invoke_microtask(microtask, gtid, 0, argc, args
#if OMPT_SUPPORT
,
exit_runtime_p
#endif
);
}
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
*exit_runtime_p = NULL;
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), 1,
OMPT_CUR_TASK_INFO(master_th)->thread_num,
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
__ompt_lw_taskteam_unlink(master_th);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
&ompt_parallel_data, parent_task_data,
OMPT_INVOKER(call_context), return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
#if OMP_40_ENABLED
}
#endif /* OMP_40_ENABLED */
} else if (call_context == fork_context_gnu) {
#if OMPT_SUPPORT
ompt_lw_taskteam_t lwt;
__ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
return_address);
lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
__ompt_lw_taskteam_link(&lwt, master_th, 1);
// don't use lw_taskteam after linking. Its contents were swapped.
#endif
// we were called from GNU native code
KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
return FALSE;
} else {
KMP_ASSERT2(call_context < fork_context_last,
"__kmp_fork_call: unknown fork_context parameter");
}
KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
KMP_MB();
return FALSE;
} // if (nthreads == 1)
// GEH: only modify the executing flag in the case when not serialized;
// the serialized case is handled in __kmpc_serialized_parallel
KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
"curtask=%p, curtask_max_aclevel=%d\n",
parent_team->t.t_active_level, master_th,
master_th->th.th_current_task,
master_th->th.th_current_task->td_icvs.max_active_levels));
// TODO: GEH - cannot do this assertion because root thread not set up as
// executing
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
master_th->th.th_current_task->td_flags.executing = 0;
#if OMP_40_ENABLED
if (!master_th->th.th_teams_microtask || level > teams_level)
#endif /* OMP_40_ENABLED */
{
/* Increment our nested depth level */
KMP_ATOMIC_INC(&root->r.r_in_parallel);
}
// See if we need to make a copy of the ICVs.
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
if ((level + 1 < __kmp_nested_nth.used) &&
(__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
nthreads_icv = __kmp_nested_nth.nth[level + 1];
} else {
nthreads_icv = 0; // don't update
}
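// If OMP_NUM_THREADS supplied a value for the next nesting level and it
// differs from the current nproc ICV, remember it here so the new team's
// ICVs pick it up below; nthreads_icv == 0 means "keep the inherited value".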
#if OMP_40_ENABLED
// Figure out the proc_bind_policy for the new team.
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
kmp_proc_bind_t proc_bind_icv =
proc_bind_default; // proc_bind_default means don't update
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else {
if (proc_bind == proc_bind_default) {
// No proc_bind clause specified; use current proc-bind-var for this
// parallel region
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
}
/* else: The proc_bind policy was specified explicitly on parallel clause.
This overrides proc-bind-var for this parallel region, but does not
change proc-bind-var. */
// Figure the value of proc-bind-var for the child threads.
if ((level + 1 < __kmp_nested_proc_bind.used) &&
(__kmp_nested_proc_bind.bind_types[level + 1] !=
master_th->th.th_current_task->td_icvs.proc_bind)) {
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
}
}
// Reset for next parallel region
master_th->th.th_set_proc_bind = proc_bind_default;
#endif /* OMP_40_ENABLED */
if ((nthreads_icv > 0)
#if OMP_40_ENABLED
|| (proc_bind_icv != proc_bind_default)
#endif /* OMP_40_ENABLED */
) {
kmp_internal_control_t new_icvs;
copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
new_icvs.next = NULL;
if (nthreads_icv > 0) {
new_icvs.nproc = nthreads_icv;
}
#if OMP_40_ENABLED
if (proc_bind_icv != proc_bind_default) {
new_icvs.proc_bind = proc_bind_icv;
}
#endif /* OMP_40_ENABLED */
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
#if OMP_40_ENABLED
proc_bind,
#endif
&new_icvs, argc USE_NESTED_HOT_ARG(master_th));
} else {
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
#if OMP_40_ENABLED
proc_bind,
#endif
&master_th->th.th_current_task->td_icvs,
argc USE_NESTED_HOT_ARG(master_th));
}
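// At this point `team` is either a freshly allocated team or, via
// __kmp_allocate_team, possibly a reused (hot) team resized to nthreads.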
KF_TRACE(
10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
/* setup the new team */
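// KMP_CHECK_UPDATE / KMP_CHECK_UPDATE_SYNC only perform the store when the
// new value differs from the cached one, which avoids dirtying shared cache
// lines when a hot team is reused with unchanged settings (see also the
// r_active comment further below).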
KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
KMP_CHECK_UPDATE(team->t.t_ident, loc);
KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
#if OMPT_SUPPORT
KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
return_address);
#endif
KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
// TODO: parent_team->t.t_level == INT_MAX ???
#if OMP_40_ENABLED
if (!master_th->th.th_teams_microtask || level > teams_level) {
#endif /* OMP_40_ENABLED */
int new_level = parent_team->t.t_level + 1;
KMP_CHECK_UPDATE(team->t.t_level, new_level);
new_level = parent_team->t.t_active_level + 1;
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
#if OMP_40_ENABLED
} else {
// AC: Do not increase parallel level at start of the teams construct
int new_level = parent_team->t.t_level;
KMP_CHECK_UPDATE(team->t.t_level, new_level);
new_level = parent_team->t.t_active_level;
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
}
#endif /* OMP_40_ENABLED */
kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
// set master's schedule as new run-time schedule
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
#if OMP_40_ENABLED
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
#endif
#if OMP_50_ENABLED
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
#endif
// Update the floating point rounding in the team if required.
propagateFPControl(team);
if (__kmp_tasking_mode != tskm_immediate_exec) {
// Set the master's task team to the team's task team. Unless this is a hot
// team, it should be NULL.
KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
parent_team->t.t_task_team[master_th->th.th_task_state]);
KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
"%p, new task_team %p / team %p\n",
__kmp_gtid_from_thread(master_th),
master_th->th.th_task_team, parent_team,
team->t.t_task_team[master_th->th.th_task_state], team));
if (active_level || master_th->th.th_task_team) {
// Take a memo of master's task_state
KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
if (master_th->th.th_task_state_top >=
master_th->th.th_task_state_stack_sz) { // increase size
kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
kmp_uint8 *old_stack, *new_stack;
kmp_uint32 i;
new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
new_stack[i] = master_th->th.th_task_state_memo_stack[i];
}
for (i = master_th->th.th_task_state_stack_sz; i < new_size;
++i) { // zero-init rest of stack
new_stack[i] = 0;
}
old_stack = master_th->th.th_task_state_memo_stack;
master_th->th.th_task_state_memo_stack = new_stack;
master_th->th.th_task_state_stack_sz = new_size;
__kmp_free(old_stack);
}
// Store master's task_state on stack
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top] =
master_th->th.th_task_state;
master_th->th.th_task_state_top++;
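// The task_state pushed here is popped again in __kmp_join_call once the
// nested region ends, restoring the master's task_state for the parent level.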
#if KMP_NESTED_HOT_TEAMS
if (master_th->th.th_hot_teams &&
active_level < __kmp_hot_teams_max_level &&
team == master_th->th.th_hot_teams[active_level].hot_team) {
// Restore master's nested state if nested hot team
master_th->th.th_task_state =
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top];
} else {
#endif
master_th->th.th_task_state = 0;
#if KMP_NESTED_HOT_TEAMS
}
#endif
}
#if !KMP_NESTED_HOT_TEAMS
KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
(team == root->r.r_hot_team));
#endif
}
KA_TRACE(
20,
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
team->t.t_nproc));
KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
(team->t.t_master_tid == 0 &&
(team->t.t_parent == root->r.r_root_team ||
team->t.t_parent->t.t_serialized)));
KMP_MB();
/* now, setup the arguments */
argv = (void **)team->t.t_argv;
#if OMP_40_ENABLED
if (ap) {
#endif /* OMP_40_ENABLED */
for (i = argc - 1; i >= 0; --i) {
// TODO: revert workaround for Intel(R) 64 tracker #96
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
void *new_argv = va_arg(*ap, void *);
#else
void *new_argv = va_arg(ap, void *);
#endif
KMP_CHECK_UPDATE(*argv, new_argv);
argv++;
}
#if OMP_40_ENABLED
} else {
for (i = 0; i < argc; ++i) {
// Get args from parent team for teams construct
KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
}
}
#endif /* OMP_40_ENABLED */
/* now actually fork the threads */
KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
root->r.r_active = TRUE;
__kmp_fork_team_threads(root, team, master_th, gtid);
__kmp_setup_icv_copy(team, nthreads,
&master_th->th.th_current_task->td_icvs, loc);
#if OMPT_SUPPORT
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
#endif
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if USE_ITT_BUILD
if (team->t.t_active_level == 1 // only report frames at level 1
#if OMP_40_ENABLED
&& !master_th->th.th_teams_microtask // not in teams construct
#endif /* OMP_40_ENABLED */
) {
#if USE_ITT_NOTIFY
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
(__kmp_forkjoin_frames_mode == 3 ||
__kmp_forkjoin_frames_mode == 1)) {
kmp_uint64 tmp_time = 0;
if (__itt_get_timestamp_ptr)
tmp_time = __itt_get_timestamp();
// Internal fork - report frame begin
master_th->th.th_frame_time = tmp_time;
if (__kmp_forkjoin_frames_mode == 3)
team->t.t_region_time = tmp_time;
} else
// only one notification scheme (either "submit" or "forking/joined", not both)
#endif /* USE_ITT_NOTIFY */
if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
__kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
// Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
__kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
}
}
#endif /* USE_ITT_BUILD */
/* now go on and do the work */
KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
KMP_MB();
KF_TRACE(10,
("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
team->t.t_stack_id =
__kmp_itt_stack_caller_create(); // create new stack stitching id
// before entering fork barrier
}
#endif /* USE_ITT_BUILD */
#if OMP_40_ENABLED
// AC: skip __kmp_internal_fork at the teams construct; let only the
// master threads execute
if (ap)
#endif /* OMP_40_ENABLED */
{
__kmp_internal_fork(loc, gtid, team);
KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
}
if (call_context == fork_context_gnu) {
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
return TRUE;
}
/* Invoke microtask for MASTER thread */
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
} // END of timer KMP_fork_call block
#if KMP_STATS_ENABLED && OMP_40_ENABLED
// If beginning a teams construct, then change thread state
stats_state_e previous_state = KMP_GET_THREAD_STATE();
if (!ap) {
KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
}
#endif
if (!team->t.t_invoke(gtid)) {
KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
}
#if KMP_STATS_ENABLED && OMP_40_ENABLED
// If this was the beginning of a teams construct, then reset the thread state
if (!ap) {
KMP_SET_THREAD_STATE(previous_state);
}
#endif
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
return TRUE;
}
#if OMPT_SUPPORT
static inline void __kmp_join_restore_state(kmp_info_t *thread,
kmp_team_t *team) {
// restore state outside the region
thread->th.ompt_thread_info.state =
((team->t.t_serialized) ? ompt_state_work_serial
: ompt_state_work_parallel);
}
static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
kmp_team_t *team, ompt_data_t *parallel_data,
fork_context_e fork_context, void *codeptr) {
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
codeptr);
}
task_info->frame.enter_frame = ompt_data_none;
__kmp_join_restore_state(thread, team);
}
#endif
void __kmp_join_call(ident_t *loc, int gtid
#if OMPT_SUPPORT
,
enum fork_context_e fork_context
#endif
#if OMP_40_ENABLED
,
int exit_teams
#endif /* OMP_40_ENABLED */
) {
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int master_active;
KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
/* setup current data */
master_th = __kmp_threads[gtid];
root = master_th->th.th_root;
team = master_th->th.th_team;
parent_team = team->t.t_parent;
master_th->th.th_ident = loc;
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
#if KMP_DEBUG
if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
"th_task_team = %p\n",
__kmp_gtid_from_thread(master_th), team,
team->t.t_task_team[master_th->th.th_task_state],
master_th->th.th_task_team));
KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
team->t.t_task_team[master_th->th.th_task_state]);
}
#endif
if (team->t.t_serialized) {
#if OMP_40_ENABLED
if (master_th->th.th_teams_microtask) {
// We are in teams construct
int level = team->t.t_level;
int tlevel = master_th->th.th_teams_level;
if (level == tlevel) {
// AC: we did not increment it earlier at the start of the teams construct,
// so do it here, at the end of the teams construct
team->t.t_level++;
} else if (level == tlevel + 1) {
// AC: we are exiting parallel inside teams, need to increment
// serialization in order to restore it in the next call to
// __kmpc_end_serialized_parallel
team->t.t_serialized++;
}
}
#endif /* OMP_40_ENABLED */
__kmpc_end_serialized_parallel(loc, gtid);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
__kmp_join_restore_state(master_th, parent_team);
}
#endif
return;
}
master_active = team->t.t_master_active;
#if OMP_40_ENABLED
if (!exit_teams)
#endif /* OMP_40_ENABLED */
{
// AC: No barrier for internal teams at exit from the teams construct,
// but there is a barrier for the external team (the league).
__kmp_internal_join(loc, gtid, team);
}
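// __kmp_internal_join runs the join barrier for the team, gathering the
// workers before the team is released or recycled below.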
#if OMP_40_ENABLED
else {
master_th->th.th_task_state =
0; // AC: no tasking in teams (out of any parallel)
}
#endif /* OMP_40_ENABLED */
KMP_MB();
#if OMPT_SUPPORT
ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
void *codeptr = team->t.ompt_team_info.master_return_address;
#endif
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
__kmp_itt_stack_caller_destroy(
(__itt_caller)team->t
.t_stack_id); // destroy the stack stitching id after join barrier
}
// Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
if (team->t.t_active_level == 1
#if OMP_40_ENABLED
&& !master_th->th.th_teams_microtask /* not in teams construct */
#endif /* OMP_40_ENABLED */
) {
master_th->th.th_ident = loc;
// only one notification scheme (either "submit" or "forking/joined", not
// both)
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
__kmp_forkjoin_frames_mode == 3)
__kmp_itt_frame_submit(gtid, team->t.t_region_time,
master_th->th.th_frame_time, 0, loc,
master_th->th.th_team_nproc, 1);
else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
!__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
__kmp_itt_region_joined(gtid);
} // active_level == 1
#endif /* USE_ITT_BUILD */
#if OMP_40_ENABLED
if (master_th->th.th_teams_microtask && !exit_teams &&
team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
team->t.t_level == master_th->th.th_teams_level + 1) {
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the same (hot) team is reused
// by the next parallel region; only adjust the nesting levels.
/* Decrement our nested depth level */
team->t.t_level--;
team->t.t_active_level--;
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
// Restore number of threads in the team if needed. This code relies on
// the proper adjustment of th_teams_size.nth after the fork in
// __kmp_teams_master on each teams master in the case that
// __kmp_reserve_threads reduced it.
if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
int old_num = master_th->th.th_team_nproc;
int new_num = master_th->th.th_teams_size.nth;
kmp_info_t **other_threads = team->t.t_threads;
team->t.t_nproc = new_num;
for (int i = 0; i < old_num; ++i) {
other_threads[i]->th.th_team_nproc = new_num;
}
// Adjust the states of the team's previously unused threads
for (int i = old_num; i < new_num; ++i) {
// Re-initialize thread's barrier data.
KMP_DEBUG_ASSERT(other_threads[i]);
kmp_balign_t *balign = other_threads[i]->th.th_bar;
for (int b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
// Synchronize thread's task state
other_threads[i]->th.th_task_state = master_th->th.th_task_state;
}
}
}
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
__kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
codeptr);
}
#endif
return;
}
#endif /* OMP_40_ENABLED */
/* do cleanup and restore the parent team */
master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
master_th->th.th_local.this_construct = team->t.t_master_this_cons;
master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
/* jc: The following lock has instructions with REL and ACQ semantics,
separating the parallel user code called in this parallel region
from the serial user code called after this function returns. */
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
#if OMP_40_ENABLED
if (!master_th->th.th_teams_microtask ||
team->t.t_level > master_th->th.th_teams_level)
#endif /* OMP_40_ENABLED */
{
/* Decrement our nested depth level */
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
}
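// This mirrors the KMP_ATOMIC_INC in __kmp_fork_call; r.r_in_parallel tracks
// the active parallelism nested under this root (hence the assertion below).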
KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
if (ompt_enabled.ompt_callback_implicit_task) {
int ompt_team_size = team->t.t_nproc;
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
OMPT_CUR_TASK_INFO(master_th)->thread_num,
ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
task_info->frame.exit_frame = ompt_data_none;
task_info->task_data = ompt_data_none;
}
#endif
KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
master_th, team));
__kmp_pop_current_task_from_thread(master_th);
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
// Restore master thread's partition.
master_th->th.th_first_place = team->t.t_first_place;
master_th->th.th_last_place = team->t.t_last_place;
#endif /* OMP_40_ENABLED */
#if OMP_50_ENABLED
master_th->th.th_def_allocator = team->t.t_def_allocator;
#endif
updateHWFPControl(team);
if (root->r.r_active != master_active)
root->r.r_active = master_active;
__kmp_free_team(root, team USE_NESTED_HOT_ARG(
master_th)); // this will free worker threads
/* This race was fun to find. Make sure the following stays inside the
   critical region; otherwise assertions may fail occasionally, since the old
   team may be reallocated and the hierarchy then appears inconsistent. It is
   actually safe to run outside and won't cause any bugs, but it will trigger
   those assertion failures. It's only one deref & assign, so we might as well
   keep it in the critical region. */
master_th->th.th_team = parent_team;
master_th->th.th_team_nproc = parent_team->t.t_nproc;
master_th->th.th_team_master = parent_team->t.t_threads[0];
master_th->th.th_team_serialized = parent_team->t.t_serialized;
/* restore serialized team, if need be */
if (parent_team->t.t_serialized &&
parent_team != master_th->th.th_serial_team &&
parent_team != root->r.r_root_team) {
__kmp_free_team(root,
master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
master_th->th.th_serial_team = parent_team;
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
if (master_th->th.th_task_state_top >
0) { // Restore task state from memo stack
KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
// Remember master's state if we re-use this nested hot team
master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
master_th->th.th_task_state;
--master_th->th.th_task_state_top; // pop
// Now restore state at this level
master_th->th.th_task_state =
master_th->th
.th_task_state_memo_stack[master_th->th.th_task_state_top];
}
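// That pop matches the push done in __kmp_fork_call when the nested region
// was created.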
// Copy the task team from the parent team to the master thread
master_th->th.th_task_team =
parent_team->t.t_task_team[master_th->th.th_task_state];
KA_TRACE(20,
("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
__kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
parent_team));
}
// TODO: GEH - cannot do this assertion because root thread not set up as
// executing
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
master_th->th.th_current_task->td_flags.executing = 1;
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
__kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
codeptr);
}
#endif
KMP_MB();
KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
}
/* Check whether we should push an internal control record onto the
serial team stack. If so, do it. */
void __kmp_save_internal_controls(kmp_info_t *thread) {
if (thread->th.th_team != thread->th.th_serial_team) {
return;
}
if (thread->th.th_team->t.t_serialized > 1) {
int push = 0;
if <