2 * kmp_runtime.cpp -- KPTS runtime support library
5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
32 #include "ompt-specific.h"
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
42 #include "tsan_annotations.h"
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46 KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
49 char const __kmp_version_omp_api[] =
50 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
53 char const __kmp_version_lock[] =
54 KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
59 /* ------------------------------------------------------------------------ */
62 kmp_info_t __kmp_monitor;
65 /* Forward declarations */
67 void __kmp_cleanup(void);
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72 kmp_internal_control_t *new_icvs,
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76 int update_master_only = 0);
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82 kmp_internal_control_t *new_icvs, ident_t *loc);
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
88 static int __kmp_expand_threads(int nNeed);
90 static int __kmp_unregister_root_other_thread(int gtid);
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
96 /* Calculate the identifier of the current thread */
97 /* fast (and somewhat portable) way to get unique identifier of executing
98 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
/* Determine the calling thread's global thread id (gtid).
   Dispatch on __kmp_gtid_mode: >= 3 uses thread-local data (TDATA),
   >= 2 uses keyed TLS, otherwise fall back to an internal algorithm that
   matches the current stack address against each registered thread's
   recorded stack window in __kmp_threads[].  Returns KMP_GTID_DNE when
   gtid support is not yet initialized.
   NOTE(review): this extract appears gap-sampled; several original lines
   (returns, closing braces) are missing below — confirm against the full
   file before relying on the exact control flow. */
99 int __kmp_get_global_thread_id() {
101 kmp_info_t **other_threads;
109 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
110 __kmp_nth, __kmp_all_nth));
112 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
113 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
114 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
115 __kmp_init_gtid for this to work. */
117 if (!TCR_4(__kmp_init_gtid))
120 #ifdef KMP_TDATA_GTID
121 if (TCR_4(__kmp_gtid_mode) >= 3) {
122 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126 if (TCR_4(__kmp_gtid_mode) >= 2) {
127 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128 return __kmp_gtid_get_specific();
130 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
/* Internal algorithm: take the address of a local as a proxy for the
   current stack position, then scan the global thread table. */
132 stack_addr = (char *)&stack_data;
133 other_threads = __kmp_threads;
135 /* ATT: The code below is a source of potential bugs due to unsynchronized
136 access to __kmp_threads array. For example:
137 1. Current thread loads other_threads[i] to thr and checks it, it is
139 2. Current thread is suspended by OS.
140 3. Another thread unregisters and finishes (debug versions of free()
141 may fill memory with something like 0xEF).
142 4. Current thread is resumed.
143 5. Current thread reads junk from *thr.
144 TODO: Fix it. --ln */
146 for (i = 0; i < __kmp_threads_capacity; i++) {
148 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
155 /* stack grows down -- search through all of the active threads */
157 if (stack_addr <= stack_base) {
158 size_t stack_diff = stack_base - stack_addr;
160 if (stack_diff <= stack_size) {
161 /* The only way we can be closer than the allocated */
162 /* stack size is if we are running on this thread. */
163 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
169 /* get specific to try and determine our gtid */
/* Stack search failed (e.g. an uber thread whose recorded window has not
   grown yet) — fall back to keyed TLS. */
171 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172 "thread, using TLS\n"));
173 i = __kmp_gtid_get_specific();
175 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
177 /* if we haven't been assigned a gtid, then return code */
181 /* dynamically updated stack window for uber threads to avoid get_specific
183 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184 KMP_FATAL(StackOverflow, i);
/* Grow the recorded stack window so future lookups via the internal
   algorithm succeed for this uber thread. */
187 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188 if (stack_addr > stack_base) {
189 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
194 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195 stack_base - stack_addr);
198 /* Reprint stack bounds for ubermaster since they have been refined */
199 if (__kmp_storage_map) {
200 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203 other_threads[i]->th.th_info.ds.ds_stacksize,
204 "th_%d stack (refinement)", i);
/* Like __kmp_get_global_thread_id, but registers the calling thread as a
   new root if it has no gtid yet (gtid == KMP_GTID_DNE).  Performs serial
   initialization under __kmp_initz_lock when needed.  Never returns
   KMP_GTID_DNE (asserted non-negative at the end).
   NOTE(review): gap-sampled extract — some lines are missing below. */
209 int __kmp_get_global_thread_id_reg() {
212 if (!__kmp_init_serial) {
215 #ifdef KMP_TDATA_GTID
216 if (TCR_4(__kmp_gtid_mode) >= 3) {
217 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221 if (TCR_4(__kmp_gtid_mode) >= 2) {
222 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223 gtid = __kmp_gtid_get_specific();
226 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227 gtid = __kmp_get_global_thread_id();
230 /* we must be a new uber master sibling thread */
231 if (gtid == KMP_GTID_DNE) {
233 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234 "Registering a new gtid.\n"));
/* Serialize registration of new roots; re-check __kmp_init_serial under
   the lock (double-checked initialization). */
235 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236 if (!__kmp_init_serial) {
237 __kmp_do_serial_initialize();
238 gtid = __kmp_gtid_get_specific();
240 gtid = __kmp_register_root(FALSE);
242 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
246 KMP_DEBUG_ASSERT(gtid >= 0);
251 /* caller must hold forkjoin_lock */
/* Diagnostic: verify that thread th's stack does not overlap any other
   registered thread's stack.  Optionally prints storage-map info when
   __kmp_storage_map is set; performs the pairwise overlap check only when
   __kmp_env_checks is enabled and th is not an uber thread.  Aborts via
   __kmp_fatal on detected overlap.  Caller must hold the forkjoin lock.
   NOTE(review): gap-sampled extract — some lines are missing below. */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
254 char *stack_beg = NULL;
255 char *stack_end = NULL;
258 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259 if (__kmp_storage_map) {
260 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
263 gtid = __kmp_gtid_from_thread(th);
265 if (gtid == KMP_GTID_MONITOR) {
266 __kmp_print_storage_map_gtid(
267 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268 "th_%s stack (%s)", "mon",
269 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual",
316 /* ------------------------------------------------------------------------ */
/* Spin forever (until the static 'done' flag is flipped, e.g. by a
   debugger).  Used to park threads during abnormal termination.
   NOTE(review): loop body is outside this extract. */
318 void __kmp_infinite_loop(void) {
319 static int done = FALSE;
326 #define MAX_MESSAGE 512
/* Print one "OMP storage map" line for the memory range [p1, p2) of the
   given size, tagged with gtid, under __kmp_stdio_lock.  The caller's
   format string is spliced into a fixed prefix via KMP_SNPRINTF and the
   variadic arguments are rendered by __kmp_vprintf.  When
   KMP_PRINT_DATA_PLACEMENT is enabled and verbose mode is on, additionally
   reports the NUMA host node per page of the (page-aligned) range.
   NOTE(review): gap-sampled extract — va_end and several branch lines are
   missing below. */
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329 char const *format, ...) {
330 char buffer[MAX_MESSAGE];
333 va_start(ap, format);
334 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335 p2, (unsigned long)size, format);
336 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337 __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
341 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342 if (__kmp_storage_map_verbose) {
343 node = __kmp_get_host_node(p1);
344 if (node < 0) /* doesn't work, so don't try this next time */
345 __kmp_storage_map_verbose = FALSE;
349 int localProc = __kmp_get_cpu_from_gtid(gtid);
351 const int page_size = KMP_GET_PAGE_SIZE();
/* Round p1 down and p2 down to page boundaries before per-page node
   queries. */
353 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
356 __kmp_printf_no_lock("    GTID %d localNode %d\n", gtid,
359 __kmp_printf_no_lock("    GTID %d\n", gtid);
361 /* The more elaborate format is disabled for now because of the prctl
366 /* This loop collates adjacent pages with the same host node. */
368 (char *)p1 += page_size;
369 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370 __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
374 __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
375 (char *)p1 + (page_size - 1),
376 __kmp_get_host_node(p1));
378 __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379 (char *)p2 + (page_size - 1),
380 __kmp_get_host_node(p2));
386 __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
/* Emit an "OMP warning:" message built from the caller's printf-style
   format, serialized on __kmp_stdio_lock.  Suppressed entirely when
   warnings are disabled (kmp_warnings_off).
   NOTE(review): gap-sampled extract — early return and va_end lines are
   missing below. */
392 void __kmp_warn(char const *format, ...) {
393 char buffer[MAX_MESSAGE];
396 if (__kmp_generate_warnings == kmp_warnings_off) {
400 va_start(ap, format);
402 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404 __kmp_vprintf(kmp_err, buffer, ap);
405 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
/* Terminate the whole process abnormally.  Dumps the debug buffer if one
   is active, then on Windows sets g_abort and simulates abort() via
   _exit(3) to avoid the abort() pop-up dialog; on other platforms
   (presumably — the non-Windows branch is outside this extract) a plain
   abort() path is taken.  Holds __kmp_exit_lock so racing aborters stall
   until the process dies. */
410 void __kmp_abort_process() {
411 // Later threads may stall here, but that's ok because abort() will kill them.
412 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
414 if (__kmp_debug_buf) {
415 __kmp_dump_debug_buffer();
418 if (KMP_OS_WINDOWS) {
419 // Let other threads know of abnormal termination and prevent deadlock
420 // if abort happened during library initialization or shutdown
421 __kmp_global.g.g_abort = SIGABRT;
423 /* On Windows* OS by default abort() causes pop-up error box, which stalls
424 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425 boxes. _set_abort_behavior() works well, but this function is not
426 available in VS7 (this is not problem for DLL, but it is a problem for
427 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
428 help, at least in some versions of MS C RTL.
430 It seems following sequence is the only way to simulate abort() and
431 avoid pop-up error box. */
433 _exit(3); // Just in case, if signal ignored, exit anyway.
438 __kmp_infinite_loop();
439 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
441 } // __kmp_abort_process
443 void __kmp_abort_thread(void) {
444 // TODO: Eliminate g_abort global variable and this function.
445 // In case of abort just call abort(), it will kill all the threads.
446 __kmp_infinite_loop();
447 } // __kmp_abort_thread
449 /* Print out the storage map for the major kmp_info_t thread data structures
450 that are allocated together. */
/* Print the storage map for the major kmp_info_t thread data structures
   that are allocated together: the whole kmp_info_t, then th_info,
   th_local, and the per-barrier-type th_bar entries.
   NOTE(review): gap-sampled extract — trailing argument lines of several
   calls are missing below. */
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
456 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
459 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460 sizeof(kmp_local_t), "th_%d.th_local", gtid);
462 __kmp_print_storage_map_gtid(
463 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
466 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467 &thr->th.th_bar[bs_plain_barrier + 1],
468 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
471 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472 &thr->th.th_bar[bs_forkjoin_barrier + 1],
473 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
476 #if KMP_FAST_REDUCTION_BARRIER
477 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478 &thr->th.th_bar[bs_reduction_barrier + 1],
479 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
481 #endif // KMP_FAST_REDUCTION_BARRIER
484 /* Print out the storage map for the major kmp_team_t team data structures
485 that are allocated together. */
/* Print the storage map for the major kmp_team_t team data structures
   that are allocated together: the kmp_team_t itself, the per-barrier-type
   t_bar entries, t_dispatch, t_threads, and t_disp_buffer.
   NOTE(review): gap-sampled extract — trailing argument lines of some
   calls and the closing brace are missing below. */
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488 int team_id, int num_thr) {
489 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
493 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494 &team->t.t_bar[bs_last_barrier],
495 sizeof(kmp_balign_team_t) * bs_last_barrier,
496 "%s_%d.t_bar", header, team_id);
498 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499 &team->t.t_bar[bs_plain_barrier + 1],
500 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
503 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504 &team->t.t_bar[bs_forkjoin_barrier + 1],
505 sizeof(kmp_balign_team_t),
506 "%s_%d.t_bar[forkjoin]", header, team_id);
508 #if KMP_FAST_REDUCTION_BARRIER
509 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510 &team->t.t_bar[bs_reduction_barrier + 1],
511 sizeof(kmp_balign_team_t),
512 "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
515 __kmp_print_storage_map_gtid(
516 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
519 __kmp_print_storage_map_gtid(
520 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
523 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524 &team->t.t_disp_buffer[num_disp_buff],
525 sizeof(dispatch_shared_info_t) * num_disp_buff,
526 "%s_%d.t_disp_buffer", header, team_id);
529 static void __kmp_init_allocator() { __kmp_init_memkind(); }
530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
532 /* ------------------------------------------------------------------------ */
/* Force a bootstrap lock back to the released state by re-initializing it.
   Used during Windows process detach, where the owner may already be dead. */
537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538 // TODO: Change to __kmp_break_bootstrap_lock().
539 __kmp_init_bootstrap_lock(lck); // make the lock released
/* On Windows PROCESS_DETACH: if no other registered OMP thread (besides
   gtid_req) appears to be alive, reset the bootstrap locks that a killed
   worker may have left held (__kmp_forkjoin_lock, __kmp_stdio_lock), so
   shutdown can proceed.
   NOTE(review): gap-sampled extract — several loop-body and count lines
   are missing below. */
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
546 // PROCESS_DETACH is expected to be called by a thread that executes
547 // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548 // calling ProcessExit or FreeLibrary). So, it might be safe to access the
549 // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
550 // threads can be still alive here, although being about to be terminated. The
551 // threads in the array with ds_thread==0 are most suspicious. Actually, it
552 // can be not safe to access the __kmp_threads[].
554 // TODO: does it make sense to check __kmp_roots[] ?
556 // Let's check that there are no other alive threads registered with the OMP
560 for (i = 0; i < __kmp_threads_capacity; ++i) {
563 kmp_info_t *th = __kmp_threads[i];
566 int gtid = th->th.th_info.ds.ds_gtid;
567 if (gtid == gtid_req)
572 int alive = __kmp_is_thread_alive(th, &exit_val);
577 if (thread_count == 0)
581 // Assume that I'm alone. Now it might be safe to check and reset locks.
582 // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583 __kmp_reset_lock(&__kmp_forkjoin_lock);
585 __kmp_reset_lock(&__kmp_stdio_lock);
/* Windows DLL entry point.  Handles process/thread attach and detach:
   on PROCESS_DETACH during process termination it resets possibly-orphaned
   bootstrap locks before ending the library; on THREAD_DETACH it ends the
   per-thread runtime state.
   NOTE(review): gap-sampled extract — the switch header, break/return
   statements, and closing braces are missing below. */
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
594 case DLL_PROCESS_ATTACH:
595 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
599 case DLL_PROCESS_DETACH:
600 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
602 if (lpReserved != NULL) {
603 // lpReserved is used for telling the difference:
604 // lpReserved == NULL when FreeLibrary() was called,
605 // lpReserved != NULL when the process terminates.
606 // When FreeLibrary() is called, worker threads remain alive. So they will
607 // release the forkjoin lock by themselves. When the process terminates,
608 // worker threads disappear triggering the problem of unreleased forkjoin
609 // lock as described below.
611 // A worker thread can take the forkjoin lock. The problem comes up if
612 // that worker thread becomes dead before it releases the forkjoin lock.
613 // The forkjoin lock remains taken, while the thread executing
614 // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615 // to take the forkjoin lock and will always fail, so that the application
616 // will never finish [normally]. This scenario is possible if
617 // __kmpc_end() has not been executed. It looks like it's not a corner
618 // case, but common cases:
619 // - the main function was compiled by an alternative compiler;
620 // - the main function was compiled by icl but without /Qopenmp
621 // (application with plugins);
622 // - application terminates by calling C exit(), Fortran CALL EXIT() or
624 // - alive foreign thread prevented __kmpc_end from doing cleanup.
626 // This is a hack to work around the problem.
627 // TODO: !!! figure out something better.
628 __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
631 __kmp_internal_end_library(__kmp_gtid_get_specific());
635 case DLL_THREAD_ATTACH:
636 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
638 /* if we want to register new siblings all the time here call
639 * __kmp_get_gtid(); */
642 case DLL_THREAD_DETACH:
643 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
645 __kmp_internal_end_thread(__kmp_gtid_get_specific());
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
655 /* __kmp_parallel_deo -- Wait until it's our turn.
   Ordered-section entry: optionally records the construct for consistency
   checking, then (when BUILD_PARALLEL_ORDERED and the team is not
   serialized) spins via KMP_WAIT until t_ordered.dt.t_value equals this
   thread's tid.
   NOTE(review): gap-sampled extract — #else/#endif and closing braces are
   missing below. */
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657 int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659 kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
662 if (__kmp_env_consistency_check) {
663 if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
667 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
670 #ifdef BUILD_PARALLEL_ORDERED
671 if (!team->t.t_serialized) {
673 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
677 #endif /* BUILD_PARALLEL_ORDERED */
680 /* __kmp_parallel_dxo -- Signal the next task.
   Ordered-section exit: pops the consistency-check record, then (when
   BUILD_PARALLEL_ORDERED and the team is not serialized) hands the turn to
   the next thread by writing (tid + 1) % nproc into t_ordered.dt.t_value,
   with memory fences (KMP_MB) around the release.
   NOTE(review): gap-sampled extract — some closing braces are missing
   below. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682 int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684 int tid = __kmp_tid_from_gtid(gtid);
685 kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
688 if (__kmp_env_consistency_check) {
689 if (__kmp_threads[gtid]->th.th_root->r.r_active)
690 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
692 #ifdef BUILD_PARALLEL_ORDERED
693 if (!team->t.t_serialized) {
694 KMP_MB(); /* Flush all pending memory write invalidates. */
696 /* use the tid of the next thread in this team */
697 /* TODO replace with general release procedure */
698 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
700 KMP_MB(); /* Flush all pending memory write invalidates. */
702 #endif /* BUILD_PARALLEL_ORDERED */
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit */
/* Enter a SINGLE construct.  Ensures parallel initialization, then decides
   whether the calling thread executes the single region: for a serialized
   team the caller wins trivially; otherwise the winner is the thread whose
   compare-and-store on team->t.t_construct succeeds.  Optionally pushes a
   workshare record for consistency checking and emits ITT metadata.
   Returns nonzero iff the caller should execute the single region.
   NOTE(review): gap-sampled extract — declarations, the serialized-path
   status assignment, and several braces/returns are missing below. */
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
713 if (!TCR_4(__kmp_init_parallel))
714 __kmp_parallel_initialize();
715 __kmp_resume_if_soft_paused();
717 th = __kmp_threads[gtid];
718 team = th->th.th_team;
721 th->th.th_ident = id_ref;
723 if (team->t.t_serialized) {
726 kmp_int32 old_this = th->th.th_local.this_construct;
728 ++th->th.th_local.this_construct;
729 /* try to set team count to thread count--success means thread got the
731 /* TODO: Should this be acquire or release? */
732 if (team->t.t_construct == old_this) {
733 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734 th->th.th_local.this_construct);
737 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739 team->t.t_active_level ==
740 1) { // Only report metadata by master of active team at level 1
741 __kmp_itt_metadata_single(id_ref);
743 #endif /* USE_ITT_BUILD */
746 if (__kmp_env_consistency_check) {
747 if (status && push_ws) {
748 __kmp_push_workshare(gtid, ct_psingle, id_ref);
750 __kmp_check_workshare(gtid, ct_psingle, id_ref);
755 __kmp_itt_single_start(gtid);
757 #endif /* USE_ITT_BUILD */
/* Exit a SINGLE construct: end the ITT single event (when built with ITT)
   and pop the workshare record if consistency checking is on.
   NOTE(review): gap-sampled extract — the #if USE_ITT_BUILD guard and
   closing brace are missing below. */
761 void __kmp_exit_single(int gtid) {
763 __kmp_itt_single_end(gtid);
764 #endif /* USE_ITT_BUILD */
765 if (__kmp_env_consistency_check)
766 __kmp_pop_workshare(gtid, ct_psingle, NULL);
769 /* determine if we can go parallel or must use a serialized parallel region and
770 * how many threads we can use
771 * set_nproc is the number of threads requested for the team
772 * returns 0 if we should serialize or only use one thread,
773 * otherwise the number of threads to use
774 * The forkjoin lock is held by the caller. */
/* NOTE(review): gap-sampled extract — several branch bodies, returns and
   closing braces are missing below; the visible code shows the sequence of
   caps applied to the request: dynamic adjustment (load balance / thread
   limit / random), then KMP_DEVICE_THREAD_LIMIT, then OMP_THREAD_LIMIT,
   then the __kmp_threads[] capacity check with possible expansion. */
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776 int master_tid, int set_nthreads,
780 KMP_DEBUG_ASSERT(__kmp_init_serial);
781 KMP_DEBUG_ASSERT(root && parent_team);
782 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
784 // If dyn-var is set, dynamically adjust the number of desired threads,
785 // according to the method specified by dynamic_mode.
786 new_nthreads = set_nthreads;
787 if (!get__dynamic_2(parent_team, master_tid)) {
790 #ifdef USE_LOAD_BALANCE
791 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793 if (new_nthreads == 1) {
794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795 "reservation to 1 thread\n",
799 if (new_nthreads < set_nthreads) {
800 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801 "reservation to %d threads\n",
802 master_tid, new_nthreads));
805 #endif /* USE_LOAD_BALANCE */
806 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
// Cap by available processors, crediting back threads the root already owns.
807 new_nthreads = __kmp_avail_proc - __kmp_nth +
808 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809 if (new_nthreads <= 1) {
810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811 "reservation to 1 thread\n",
815 if (new_nthreads < set_nthreads) {
816 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817 "reservation to %d threads\n",
818 master_tid, new_nthreads));
820 new_nthreads = set_nthreads;
822 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823 if (set_nthreads > 2) {
// Pick a pseudo-random team size in [1, set_nthreads].
824 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825 new_nthreads = (new_nthreads % set_nthreads) + 1;
826 if (new_nthreads == 1) {
827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828 "reservation to 1 thread\n",
832 if (new_nthreads < set_nthreads) {
833 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834 "reservation to %d threads\n",
835 master_tid, new_nthreads));
842 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843 if (__kmp_nth + new_nthreads -
844 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
846 int tl_nthreads = __kmp_max_nth - __kmp_nth +
847 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848 if (tl_nthreads <= 0) {
852 // If dyn-var is false, emit a 1-time warning.
853 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854 __kmp_reserve_warn = 1;
855 __kmp_msg(kmp_ms_warning,
856 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
859 if (tl_nthreads == 1) {
860 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861 "reduced reservation to 1 thread\n",
865 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866 "reservation to %d threads\n",
867 master_tid, tl_nthreads));
868 new_nthreads = tl_nthreads;
871 // Respect OMP_THREAD_LIMIT
872 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874 if (cg_nthreads + new_nthreads -
875 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
877 int tl_nthreads = max_cg_threads - cg_nthreads +
878 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879 if (tl_nthreads <= 0) {
883 // If dyn-var is false, emit a 1-time warning.
884 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885 __kmp_reserve_warn = 1;
886 __kmp_msg(kmp_ms_warning,
887 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
890 if (tl_nthreads == 1) {
891 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892 "reduced reservation to 1 thread\n",
896 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897 "reservation to %d threads\n",
898 master_tid, tl_nthreads));
899 new_nthreads = tl_nthreads;
902 // Check if the threads array is large enough, or needs expanding.
903 // See comment in __kmp_register_root() about the adjustment if
904 // __kmp_threads[0] == NULL.
905 capacity = __kmp_threads_capacity;
906 if (TCR_PTR(__kmp_threads[0]) == NULL) {
909 if (__kmp_nth + new_nthreads -
910 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
912 // Expand the threads array.
913 int slotsRequired = __kmp_nth + new_nthreads -
914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
916 int slotsAdded = __kmp_expand_threads(slotsRequired);
917 if (slotsAdded < slotsRequired) {
918 // The threads array was not expanded enough.
919 new_nthreads -= (slotsRequired - slotsAdded);
920 KMP_ASSERT(new_nthreads >= 1);
922 // If dyn-var is false, emit a 1-time warning.
923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924 __kmp_reserve_warn = 1;
925 if (__kmp_tp_cached) {
926 __kmp_msg(kmp_ms_warning,
927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
931 __kmp_msg(kmp_ms_warning,
932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
940 if (new_nthreads == 1) {
942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943 "dead roots and rechecking; requested %d threads\n",
944 __kmp_get_gtid(), set_nthreads));
946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
948 __kmp_get_gtid(), new_nthreads, set_nthreads));
954 /* Allocate threads from the thread pool and assign them to the new team. We are
955 assured that there are enough threads available, because we checked on that
956 earlier within critical section forkjoin */
/* Sets up the master thread's team fields, detects whether the team is a
   reusable hot team (optionally per nesting level under
   KMP_NESTED_HOT_TEAMS), and — for non-hot teams — installs the master and
   allocates/installs the workers, aligning their barrier arrival state
   with the team's.
   NOTE(review): gap-sampled extract — closing braces and some statements
   are missing below. */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958 kmp_info_t *master_th, int master_gtid) {
962 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
966 /* first, let's setup the master thread */
967 master_th->th.th_info.ds.ds_tid = 0;
968 master_th->th.th_team = team;
969 master_th->th.th_team_nproc = team->t.t_nproc;
970 master_th->th.th_team_master = master_th;
971 master_th->th.th_team_serialized = FALSE;
972 master_th->th.th_dispatch = &team->t.t_dispatch[0];
974 /* make sure we are not the optimized hot team */
975 #if KMP_NESTED_HOT_TEAMS
977 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978 if (hot_teams) { // hot teams array is not allocated if
979 // KMP_HOT_TEAMS_MAX_LEVEL=0
980 int level = team->t.t_active_level - 1; // index in array of hot teams
981 if (master_th->th.th_teams_microtask) { // are we inside the teams?
982 if (master_th->th.th_teams_size.nteams > 1) {
983 ++level; // level was not increased in teams construct for
986 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987 master_th->th.th_teams_level == team->t.t_level) {
988 ++level; // level was not increased in teams construct for
989 // team_of_workers before the parallel
990 } // team->t.t_level will be increased inside parallel
992 if (level < __kmp_hot_teams_max_level) {
993 if (hot_teams[level].hot_team) {
994 // hot team has already been allocated for given level
995 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996 use_hot_team = 1; // the team is ready to use
998 use_hot_team = 0; // AC: threads are not allocated yet
999 hot_teams[level].hot_team = team; // remember new hot team
1000 hot_teams[level].hot_team_nth = team->t.t_nproc;
1007 use_hot_team = team == root->r.r_hot_team;
1009 if (!use_hot_team) {
1011 /* install the master thread */
1012 team->t.t_threads[0] = master_th;
1013 __kmp_initialize_info(master_th, team, 0, master_gtid);
1015 /* now, install the worker threads */
1016 for (i = 1; i < team->t.t_nproc; i++) {
1018 /* fork or reallocate a new thread and install it in team */
1019 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020 team->t.t_threads[i] = thr;
1021 KMP_DEBUG_ASSERT(thr);
1022 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023 /* align team and thread arrived states */
1024 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029 team->t.t_bar[bs_plain_barrier].b_arrived));
1030 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031 thr->th.th_teams_level = master_th->th.th_teams_level;
1032 thr->th.th_teams_size = master_th->th.th_teams_size;
1033 { // Initialize threads' barrier data.
1035 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036 for (b = 0; b < bs_last_barrier; ++b) {
1037 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1040 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1046 #if KMP_AFFINITY_SUPPORTED
1047 __kmp_partition_places(team);
/* Mark the team for affinity display if any member's previous team
   size/level differs from the current one. */
1051 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052 for (i = 0; i < team->t.t_nproc; i++) {
1053 kmp_info_t *thr = team->t.t_threads[i];
1054 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055 thr->th.th_prev_level != team->t.t_level) {
1056 team->t.t_display_affinity = 1;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
//
// NOTE(review): this excerpt elides a few interior lines (e.g. the declaration
// of 'mxcsr' -- presumably a kmp_uint32 next to x87_fpu_control_word -- and
// the '} else {' braces); confirm against the full source.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK; // keep only the bits the runtime propagates
    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
    // (else branch -- FP control is not inherited:)
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
// Do the opposite, setting the hardware registers to the updated values from
// the team structure, on exit from the parallel region.
//
// NOTE(review): this excerpt elides a few interior lines (closing braces and
// the '#else' of the architecture guard before the stub #defines).
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;
    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      // Clear any pending x87 exceptions before reloading the control word.
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
// Stub versions for architectures where FP control propagation is unsupported.
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128 int realloc); // forward declaration
1130 /* Run a parallel region that has been serialized, so runs only in a team of the
1131 single master thread. */
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133 kmp_info_t *this_thr;
1134 kmp_team_t *serial_team;
1136 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1138 /* Skip all this code for autopar serialized loops since it results in
1139 unacceptable overhead */
1140 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1143 if (!TCR_4(__kmp_init_parallel))
1144 __kmp_parallel_initialize();
1145 __kmp_resume_if_soft_paused();
1147 this_thr = __kmp_threads[global_tid];
1148 serial_team = this_thr->th.th_serial_team;
1150 /* utilize the serialized team held by this thread */
1151 KMP_DEBUG_ASSERT(serial_team);
1154 if (__kmp_tasking_mode != tskm_immediate_exec) {
1156 this_thr->th.th_task_team ==
1157 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1160 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161 "team %p, new task_team = NULL\n",
1162 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163 this_thr->th.th_task_team = NULL;
1166 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168 proc_bind = proc_bind_false;
1169 } else if (proc_bind == proc_bind_default) {
1170 // No proc_bind clause was specified, so use the current value
1171 // of proc-bind-var for this parallel region.
1172 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1174 // Reset for next parallel region
1175 this_thr->th.th_set_proc_bind = proc_bind_default;
1178 ompt_data_t ompt_parallel_data = ompt_data_none;
1179 ompt_data_t *implicit_task_data;
1180 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181 if (ompt_enabled.enabled &&
1182 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1184 ompt_task_info_t *parent_task_info;
1185 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1187 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188 if (ompt_enabled.ompt_callback_parallel_begin) {
1191 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192 &(parent_task_info->task_data), &(parent_task_info->frame),
1193 &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1197 #endif // OMPT_SUPPORT
1199 if (this_thr->th.th_team != serial_team) {
1200 // Nested level will be an index in the nested nthreads array
1201 int level = this_thr->th.th_team->t.t_level;
1203 if (serial_team->t.t_serialized) {
1204 /* this serial team was already used
1205 TODO increase performance by making this locks more specific */
1206 kmp_team_t *new_team;
1208 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1211 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1215 proc_bind, &this_thr->th.th_current_task->td_icvs,
1216 0 USE_NESTED_HOT_ARG(NULL));
1217 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218 KMP_ASSERT(new_team);
1220 /* setup new serialized team and install it */
1221 new_team->t.t_threads[0] = this_thr;
1222 new_team->t.t_parent = this_thr->th.th_team;
1223 serial_team = new_team;
1224 this_thr->th.th_serial_team = serial_team;
1228 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229 global_tid, serial_team));
1231 /* TODO the above breaks the requirement that if we run out of resources,
1232 then we can still guarantee that serialized teams are ok, since we may
1233 need to allocate a new one */
1237 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238 global_tid, serial_team));
1241 /* we have to initialize this serial team */
1242 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245 serial_team->t.t_ident = loc;
1246 serial_team->t.t_serialized = 1;
1247 serial_team->t.t_nproc = 1;
1248 serial_team->t.t_parent = this_thr->th.th_team;
1249 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250 this_thr->th.th_team = serial_team;
1251 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1253 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1254 this_thr->th.th_current_task));
1255 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256 this_thr->th.th_current_task->td_flags.executing = 0;
1258 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1260 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261 implicit task for each serialized task represented by
1262 team->t.t_serialized? */
1263 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264 &this_thr->th.th_current_task->td_parent->td_icvs);
1266 // Thread value exists in the nested nthreads array for the next nested
1268 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269 this_thr->th.th_current_task->td_icvs.nproc =
1270 __kmp_nested_nth.nth[level + 1];
1273 if (__kmp_nested_proc_bind.used &&
1274 (level + 1 < __kmp_nested_proc_bind.used)) {
1275 this_thr->th.th_current_task->td_icvs.proc_bind =
1276 __kmp_nested_proc_bind.bind_types[level + 1];
1280 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1282 this_thr->th.th_info.ds.ds_tid = 0;
1284 /* set thread cache values */
1285 this_thr->th.th_team_nproc = 1;
1286 this_thr->th.th_team_master = this_thr;
1287 this_thr->th.th_team_serialized = 1;
1289 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1293 propagateFPControl(serial_team);
1295 /* check if we need to allocate dispatch buffers stack */
1296 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298 serial_team->t.t_dispatch->th_disp_buffer =
1299 (dispatch_private_info_t *)__kmp_allocate(
1300 sizeof(dispatch_private_info_t));
1302 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1307 /* this serialized team is already being used,
1308 * that's fine, just add another nested level */
1309 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312 ++serial_team->t.t_serialized;
1313 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1315 // Nested level will be an index in the nested nthreads array
1316 int level = this_thr->th.th_team->t.t_level;
1317 // Thread value exists in the nested nthreads array for the next nested
1319 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320 this_thr->th.th_current_task->td_icvs.nproc =
1321 __kmp_nested_nth.nth[level + 1];
1323 serial_team->t.t_level++;
1324 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325 "of serial team %p to %d\n",
1326 global_tid, serial_team, serial_team->t.t_level));
1328 /* allocate/push dispatch buffers stack */
1329 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1331 dispatch_private_info_t *disp_buffer =
1332 (dispatch_private_info_t *)__kmp_allocate(
1333 sizeof(dispatch_private_info_t));
1334 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1337 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1341 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1343 // Perform the display affinity functionality for
1344 // serialized parallel regions
1345 if (__kmp_display_affinity) {
1346 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347 this_thr->th.th_prev_num_threads != 1) {
1348 // NULL means use the affinity-format-var ICV
1349 __kmp_aux_display_affinity(global_tid, NULL);
1350 this_thr->th.th_prev_level = serial_team->t.t_level;
1351 this_thr->th.th_prev_num_threads = 1;
1355 if (__kmp_env_consistency_check)
1356 __kmp_push_parallel(global_tid, NULL);
1358 serial_team->t.ompt_team_info.master_return_address = codeptr;
1359 if (ompt_enabled.enabled &&
1360 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1363 ompt_lw_taskteam_t lw_taskteam;
1364 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365 &ompt_parallel_data, codeptr);
1367 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368 // don't use lw_taskteam after linking. content was swaped
1370 /* OMPT implicit task begin */
1371 implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372 if (ompt_enabled.ompt_callback_implicit_task) {
1373 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376 OMPT_CUR_TASK_INFO(this_thr)
1377 ->thread_num = __kmp_tid_from_gtid(global_tid);
1381 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390 enum fork_context_e call_context, // Intel, GNU, ...
1391 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1393 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1402 int master_this_cons;
1404 kmp_team_t *parent_team;
1405 kmp_info_t *master_th;
1409 int master_set_numthreads;
1413 #if KMP_NESTED_HOT_TEAMS
1414 kmp_hot_team_ptr_t **p_hot_teams;
1417 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1418 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1420 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1421 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1422 /* Some systems prefer the stack for the root thread(s) to start with */
1423 /* some gap from the parent stack to prevent false sharing. */
1424 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1425 /* These 2 lines below are so this does not get optimized out */
1426 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1427 __kmp_stkpadding += (short)((kmp_int64)dummy);
1430 /* initialize if needed */
1432 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1433 if (!TCR_4(__kmp_init_parallel))
1434 __kmp_parallel_initialize();
1435 __kmp_resume_if_soft_paused();
1437 /* setup current data */
1438 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1440 parent_team = master_th->th.th_team;
1441 master_tid = master_th->th.th_info.ds.ds_tid;
1442 master_this_cons = master_th->th.th_local.this_construct;
1443 root = master_th->th.th_root;
1444 master_active = root->r.r_active;
1445 master_set_numthreads = master_th->th.th_set_nproc;
1448 ompt_data_t ompt_parallel_data = ompt_data_none;
1449 ompt_data_t *parent_task_data;
1450 ompt_frame_t *ompt_frame;
1451 ompt_data_t *implicit_task_data;
1452 void *return_address = NULL;
1454 if (ompt_enabled.enabled) {
1455 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1457 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461 // Nested level will be an index in the nested nthreads array
1462 level = parent_team->t.t_level;
1463 // used to launch non-serial teams even if nested is not allowed
1464 active_level = parent_team->t.t_active_level;
1465 // needed to check nesting inside the teams
1466 teams_level = master_th->th.th_teams_level;
1467 #if KMP_NESTED_HOT_TEAMS
1468 p_hot_teams = &master_th->th.th_hot_teams;
1469 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1470 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1471 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1472 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1473 // it is either actual or not needed (when active_level > 0)
1474 (*p_hot_teams)[0].hot_team_nth = 1;
1479 if (ompt_enabled.enabled) {
1480 if (ompt_enabled.ompt_callback_parallel_begin) {
1481 int team_size = master_set_numthreads
1482 ? master_set_numthreads
1483 : get__nproc_2(parent_team, master_tid);
1484 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1485 parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1486 OMPT_INVOKER(call_context), return_address);
1488 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1492 master_th->th.th_ident = loc;
1494 if (master_th->th.th_teams_microtask && ap &&
1495 microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1496 // AC: This is start of parallel that is nested inside teams construct.
1497 // The team is actual (hot), all workers are ready at the fork barrier.
1498 // No lock needed to initialize the team a bit, then free workers.
1499 parent_team->t.t_ident = loc;
1500 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1501 parent_team->t.t_argc = argc;
1502 argv = (void **)parent_team->t.t_argv;
1503 for (i = argc - 1; i >= 0; --i)
1504 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1505 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1506 *argv++ = va_arg(*ap, void *);
1508 *argv++ = va_arg(ap, void *);
1510 // Increment our nested depth levels, but not increase the serialization
1511 if (parent_team == master_th->th.th_serial_team) {
1512 // AC: we are in serialized parallel
1513 __kmpc_serialized_parallel(loc, gtid);
1514 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1515 // AC: need this in order enquiry functions work
1516 // correctly, will restore at join time
1517 parent_team->t.t_serialized--;
1520 void **exit_runtime_p;
1522 ompt_lw_taskteam_t lw_taskteam;
1524 if (ompt_enabled.enabled) {
1525 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1526 &ompt_parallel_data, return_address);
1527 exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1529 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1530 // don't use lw_taskteam after linking. content was swaped
1532 /* OMPT implicit task begin */
1533 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1534 if (ompt_enabled.ompt_callback_implicit_task) {
1535 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1536 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1537 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1538 OMPT_CUR_TASK_INFO(master_th)
1539 ->thread_num = __kmp_tid_from_gtid(gtid);
1543 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1545 exit_runtime_p = &dummy;
1550 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1551 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1552 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1561 *exit_runtime_p = NULL;
1562 if (ompt_enabled.enabled) {
1563 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1564 if (ompt_enabled.ompt_callback_implicit_task) {
1565 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1566 ompt_scope_end, NULL, implicit_task_data, 1,
1567 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1569 __ompt_lw_taskteam_unlink(master_th);
1571 if (ompt_enabled.ompt_callback_parallel_end) {
1572 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1573 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1574 OMPT_INVOKER(call_context), return_address);
1576 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1582 parent_team->t.t_pkfn = microtask;
1583 parent_team->t.t_invoke = invoker;
1584 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1585 parent_team->t.t_active_level++;
1586 parent_team->t.t_level++;
1587 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1589 /* Change number of threads in the team if requested */
1590 if (master_set_numthreads) { // The parallel has num_threads clause
1591 if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1592 // AC: only can reduce number of threads dynamically, can't increase
1593 kmp_info_t **other_threads = parent_team->t.t_threads;
1594 parent_team->t.t_nproc = master_set_numthreads;
1595 for (i = 0; i < master_set_numthreads; ++i) {
1596 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1598 // Keep extra threads hot in the team for possible next parallels
1600 master_th->th.th_set_nproc = 0;
1604 if (__kmp_debugging) { // Let debugger override number of threads.
1605 int nth = __kmp_omp_num_threads(loc);
1606 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1607 master_set_numthreads = nth;
1612 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1613 "master_th=%p, gtid=%d\n",
1614 root, parent_team, master_th, gtid));
1615 __kmp_internal_fork(loc, gtid, parent_team);
1616 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1617 "master_th=%p, gtid=%d\n",
1618 root, parent_team, master_th, gtid));
1620 /* Invoke microtask for MASTER thread */
1621 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1622 parent_team->t.t_id, parent_team->t.t_pkfn));
1624 if (!parent_team->t.t_invoke(gtid)) {
1625 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1627 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1628 parent_team->t.t_id, parent_team->t.t_pkfn));
1629 KMP_MB(); /* Flush all pending memory write invalidates. */
1631 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1634 } // Parallel closely nested in teams construct
1637 if (__kmp_tasking_mode != tskm_immediate_exec) {
1638 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1639 parent_team->t.t_task_team[master_th->th.th_task_state]);
1643 if (parent_team->t.t_active_level >=
1644 master_th->th.th_current_task->td_icvs.max_active_levels) {
1647 int enter_teams = ((ap == NULL && active_level == 0) ||
1648 (ap && teams_level > 0 && teams_level == level));
1650 master_set_numthreads
1651 ? master_set_numthreads
1654 master_tid); // TODO: get nproc directly from current task
1656 // Check if we need to take forkjoin lock? (no need for serialized
1657 // parallel out of teams construct). This code moved here from
1658 // __kmp_reserve_threads() to speedup nested serialized parallels.
1660 if ((get__max_active_levels(master_th) == 1 &&
1661 (root->r.r_in_parallel && !enter_teams)) ||
1662 (__kmp_library == library_serial)) {
1663 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1670 /* determine how many new threads we can use */
1671 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1672 /* AC: If we execute teams from parallel region (on host), then teams
1673 should be created but each can only have 1 thread if nesting is
1674 disabled. If teams called from serial region, then teams and their
1675 threads should be created regardless of the nesting setting. */
1676 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1677 nthreads, enter_teams);
1678 if (nthreads == 1) {
1679 // Free lock for single thread execution here; for multi-thread
1680 // execution it will be freed later after team of threads created
1682 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1686 KMP_DEBUG_ASSERT(nthreads > 0);
1688 // If we temporarily changed the set number of threads then restore it now
1689 master_th->th.th_set_nproc = 0;
1691 /* create a serialized parallel region? */
1692 if (nthreads == 1) {
1693 /* josh todo: hypothetical question: what do we do for OS X*? */
1694 #if KMP_OS_LINUX && \
1695 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1698 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1699 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1700 KMP_ARCH_AARCH64) */
1703 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1705 __kmpc_serialized_parallel(loc, gtid);
1707 if (call_context == fork_context_intel) {
1708 /* TODO this sucks, use the compiler itself to pass args! :) */
1709 master_th->th.th_serial_team->t.t_ident = loc;
1711 // revert change made in __kmpc_serialized_parallel()
1712 master_th->th.th_serial_team->t.t_level--;
1713 // Get args from parent team for teams construct
1717 void **exit_runtime_p;
1718 ompt_task_info_t *task_info;
1720 ompt_lw_taskteam_t lw_taskteam;
1722 if (ompt_enabled.enabled) {
1723 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1724 &ompt_parallel_data, return_address);
1726 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1727 // don't use lw_taskteam after linking. content was swaped
1729 task_info = OMPT_CUR_TASK_INFO(master_th);
1730 exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1731 if (ompt_enabled.ompt_callback_implicit_task) {
1732 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1733 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1734 &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1735 OMPT_CUR_TASK_INFO(master_th)
1736 ->thread_num = __kmp_tid_from_gtid(gtid);
1740 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1742 exit_runtime_p = &dummy;
1747 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1748 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1749 __kmp_invoke_microtask(microtask, gtid, 0, argc,
1750 parent_team->t.t_argv
1759 if (ompt_enabled.enabled) {
1760 exit_runtime_p = NULL;
1761 if (ompt_enabled.ompt_callback_implicit_task) {
1762 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1763 ompt_scope_end, NULL, &(task_info->task_data), 1,
1764 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1767 __ompt_lw_taskteam_unlink(master_th);
1768 if (ompt_enabled.ompt_callback_parallel_end) {
1769 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1770 OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1771 OMPT_INVOKER(call_context), return_address);
1773 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1776 } else if (microtask == (microtask_t)__kmp_teams_master) {
1777 KMP_DEBUG_ASSERT(master_th->th.th_team ==
1778 master_th->th.th_serial_team);
1779 team = master_th->th.th_team;
1780 // team->t.t_pkfn = microtask;
1781 team->t.t_invoke = invoker;
1782 __kmp_alloc_argv_entries(argc, team, TRUE);
1783 team->t.t_argc = argc;
1784 argv = (void **)team->t.t_argv;
1786 for (i = argc - 1; i >= 0; --i)
1787 // TODO: revert workaround for Intel(R) 64 tracker #96
1788 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1789 *argv++ = va_arg(*ap, void *);
1791 *argv++ = va_arg(ap, void *);
1794 for (i = 0; i < argc; ++i)
1795 // Get args from parent team for teams construct
1796 argv[i] = parent_team->t.t_argv[i];
1798 // AC: revert change made in __kmpc_serialized_parallel()
1799 // because initial code in teams should have level=0
1801 // AC: call special invoker for outer "parallel" of teams construct
1805 for (i = argc - 1; i >= 0; --i)
1806 // TODO: revert workaround for Intel(R) 64 tracker #96
1807 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1808 *argv++ = va_arg(*ap, void *);
1810 *argv++ = va_arg(ap, void *);
1816 void **exit_runtime_p;
1817 ompt_task_info_t *task_info;
1819 ompt_lw_taskteam_t lw_taskteam;
1821 if (ompt_enabled.enabled) {
1822 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1823 &ompt_parallel_data, return_address);
1824 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1825 // don't use lw_taskteam after linking. content was swaped
1826 task_info = OMPT_CUR_TASK_INFO(master_th);
1827 exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1829 /* OMPT implicit task begin */
1830 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1831 if (ompt_enabled.ompt_callback_implicit_task) {
1832 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1833 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1834 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1835 OMPT_CUR_TASK_INFO(master_th)
1836 ->thread_num = __kmp_tid_from_gtid(gtid);
1840 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1842 exit_runtime_p = &dummy;
1847 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1848 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1849 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1858 if (ompt_enabled.enabled) {
1859 *exit_runtime_p = NULL;
1860 if (ompt_enabled.ompt_callback_implicit_task) {
1861 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1862 ompt_scope_end, NULL, &(task_info->task_data), 1,
1863 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1866 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1867 __ompt_lw_taskteam_unlink(master_th);
1868 if (ompt_enabled.ompt_callback_parallel_end) {
1869 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1870 &ompt_parallel_data, parent_task_data,
1871 OMPT_INVOKER(call_context), return_address);
1873 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1877 } else if (call_context == fork_context_gnu) {
1879 ompt_lw_taskteam_t lwt;
1880 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1883 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1884 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1885 // don't use lw_taskteam after linking. content was swaped
1888 // we were called from GNU native code
1889 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1892 KMP_ASSERT2(call_context < fork_context_last,
1893 "__kmp_fork_call: unknown fork_context parameter");
1896 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1899 } // if (nthreads == 1)
1901 // GEH: only modify the executing flag in the case when not serialized
1902 // serialized case is handled in kmpc_serialized_parallel
1903 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1904 "curtask=%p, curtask_max_aclevel=%d\n",
1905 parent_team->t.t_active_level, master_th,
1906 master_th->th.th_current_task,
1907 master_th->th.th_current_task->td_icvs.max_active_levels));
1908 // TODO: GEH - cannot do this assertion because root thread not set up as
1910 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1911 master_th->th.th_current_task->td_flags.executing = 0;
1913 if (!master_th->th.th_teams_microtask || level > teams_level) {
1914 /* Increment our nested depth level */
1915 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1918 // See if we need to make a copy of the ICVs.
1919 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1920 if ((level + 1 < __kmp_nested_nth.used) &&
1921 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1922 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1924 nthreads_icv = 0; // don't update
1927 // Figure out the proc_bind_policy for the new team.
1928 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1929 kmp_proc_bind_t proc_bind_icv =
1930 proc_bind_default; // proc_bind_default means don't update
1931 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1932 proc_bind = proc_bind_false;
1934 if (proc_bind == proc_bind_default) {
1935 // No proc_bind clause specified; use current proc-bind-var for this
1937 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1939 /* else: The proc_bind policy was specified explicitly on parallel clause.
1940 This overrides proc-bind-var for this parallel region, but does not
1941 change proc-bind-var. */
1942 // Figure the value of proc-bind-var for the child threads.
1943 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1944 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1945 master_th->th.th_current_task->td_icvs.proc_bind)) {
1946 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1950 // Reset for next parallel region
1951 master_th->th.th_set_proc_bind = proc_bind_default;
1953 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1954 kmp_internal_control_t new_icvs;
1955 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1956 new_icvs.next = NULL;
1957 if (nthreads_icv > 0) {
1958 new_icvs.nproc = nthreads_icv;
1960 if (proc_bind_icv != proc_bind_default) {
1961 new_icvs.proc_bind = proc_bind_icv;
1964 /* allocate a new parallel team */
1965 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1966 team = __kmp_allocate_team(root, nthreads, nthreads,
1970 proc_bind, &new_icvs,
1971 argc USE_NESTED_HOT_ARG(master_th));
1973 /* allocate a new parallel team */
1974 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1975 team = __kmp_allocate_team(root, nthreads, nthreads,
1980 &master_th->th.th_current_task->td_icvs,
1981 argc USE_NESTED_HOT_ARG(master_th));
1984 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1986 /* setup the new team */
1987 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1988 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1989 KMP_CHECK_UPDATE(team->t.t_ident, loc);
1990 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1991 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1993 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1996 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
1997 // TODO: parent_team->t.t_level == INT_MAX ???
1998 if (!master_th->th.th_teams_microtask || level > teams_level) {
1999 int new_level = parent_team->t.t_level + 1;
2000 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2001 new_level = parent_team->t.t_active_level + 1;
2002 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2004 // AC: Do not increase parallel level at start of the teams construct
2005 int new_level = parent_team->t.t_level;
2006 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2007 new_level = parent_team->t.t_active_level;
2008 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2010 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2011 // set master's schedule as new run-time schedule
2012 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2014 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2015 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2017 // Update the floating point rounding in the team if required.
2018 propagateFPControl(team);
2020 if (__kmp_tasking_mode != tskm_immediate_exec) {
2021 // Set master's task team to team's task team. Unless this is hot team, it
2023 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2024 parent_team->t.t_task_team[master_th->th.th_task_state]);
2025 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2026 "%p, new task_team %p / team %p\n",
2027 __kmp_gtid_from_thread(master_th),
2028 master_th->th.th_task_team, parent_team,
2029 team->t.t_task_team[master_th->th.th_task_state], team));
2031 if (active_level || master_th->th.th_task_team) {
2032 // Take a memo of master's task_state
2033 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2034 if (master_th->th.th_task_state_top >=
2035 master_th->th.th_task_state_stack_sz) { // increase size
2036 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2037 kmp_uint8 *old_stack, *new_stack;
2039 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2040 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2041 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2043 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2044 ++i) { // zero-init rest of stack
2047 old_stack = master_th->th.th_task_state_memo_stack;
2048 master_th->th.th_task_state_memo_stack = new_stack;
2049 master_th->th.th_task_state_stack_sz = new_size;
2050 __kmp_free(old_stack);
2052 // Store master's task_state on stack
2054 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2055 master_th->th.th_task_state;
2056 master_th->th.th_task_state_top++;
2057 #if KMP_NESTED_HOT_TEAMS
2058 if (master_th->th.th_hot_teams &&
2059 active_level < __kmp_hot_teams_max_level &&
2060 team == master_th->th.th_hot_teams[active_level].hot_team) {
2061 // Restore master's nested state if nested hot team
2062 master_th->th.th_task_state =
2064 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2067 master_th->th.th_task_state = 0;
2068 #if KMP_NESTED_HOT_TEAMS
2072 #if !KMP_NESTED_HOT_TEAMS
2073 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2074 (team == root->r.r_hot_team));
2080 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2081 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2083 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2084 (team->t.t_master_tid == 0 &&
2085 (team->t.t_parent == root->r.r_root_team ||
2086 team->t.t_parent->t.t_serialized)));
2089 /* now, setup the arguments */
2090 argv = (void **)team->t.t_argv;
2092 for (i = argc - 1; i >= 0; --i) {
2093 // TODO: revert workaround for Intel(R) 64 tracker #96
2094 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2095 void *new_argv = va_arg(*ap, void *);
2097 void *new_argv = va_arg(ap, void *);
2099 KMP_CHECK_UPDATE(*argv, new_argv);
2103 for (i = 0; i < argc; ++i) {
2104 // Get args from parent team for teams construct
2105 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2109 /* now actually fork the threads */
2110 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2111 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2112 root->r.r_active = TRUE;
2114 __kmp_fork_team_threads(root, team, master_th, gtid);
2115 __kmp_setup_icv_copy(team, nthreads,
2116 &master_th->th.th_current_task->td_icvs, loc);
2119 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2122 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2125 if (team->t.t_active_level == 1 // only report frames at level 1
2126 && !master_th->th.th_teams_microtask) { // not in teams construct
2128 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2129 (__kmp_forkjoin_frames_mode == 3 ||
2130 __kmp_forkjoin_frames_mode == 1)) {
2131 kmp_uint64 tmp_time = 0;
2132 if (__itt_get_timestamp_ptr)
2133 tmp_time = __itt_get_timestamp();
2134 // Internal fork - report frame begin
2135 master_th->th.th_frame_time = tmp_time;
2136 if (__kmp_forkjoin_frames_mode == 3)
2137 team->t.t_region_time = tmp_time;
2139 // only one notification scheme (either "submit" or "forking/joined", not both)
2140 #endif /* USE_ITT_NOTIFY */
2141 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2142 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2143 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2144 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2147 #endif /* USE_ITT_BUILD */
2149 /* now go on and do the work */
2150 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2153 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2154 root, team, master_th, gtid));
2157 if (__itt_stack_caller_create_ptr) {
2158 team->t.t_stack_id =
2159 __kmp_itt_stack_caller_create(); // create new stack stitching id
2160 // before entering fork barrier
2162 #endif /* USE_ITT_BUILD */
2164 // AC: skip __kmp_internal_fork at teams construct, let only master
2167 __kmp_internal_fork(loc, gtid, team);
2168 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2169 "master_th=%p, gtid=%d\n",
2170 root, team, master_th, gtid));
2173 if (call_context == fork_context_gnu) {
2174 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2178 /* Invoke microtask for MASTER thread */
2179 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2180 team->t.t_id, team->t.t_pkfn));
2181 } // END of timer KMP_fork_call block
2183 #if KMP_STATS_ENABLED
2184 // If beginning a teams construct, then change thread state
2185 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2187 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2191 if (!team->t.t_invoke(gtid)) {
2192 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2195 #if KMP_STATS_ENABLED
2196 // If was beginning of a teams construct, then reset thread state
2198 KMP_SET_THREAD_STATE(previous_state);
2202 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2203 team->t.t_id, team->t.t_pkfn));
2204 KMP_MB(); /* Flush all pending memory write invalidates. */
2206 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2209 if (ompt_enabled.enabled) {
2210 master_th->th.ompt_thread_info.state = ompt_state_overhead;
// Restore the OMPT thread-state ICV after leaving a region: report the thread
// as doing serial work when the (parent) team is serialized, parallel work
// otherwise.  (Second parameter, kmp_team_t *team, is on an elided line.)
2218 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2220 // restore state outside the region
2221 thread->th.ompt_thread_info.state =
2222 ((team->t.t_serialized) ? ompt_state_work_serial
2223 : ompt_state_work_parallel);
// Fire the OMPT parallel_end callback (if a tool registered one) for the
// parallel region that just joined, clear the enclosing task's enter frame,
// and restore the thread state via __kmp_join_restore_state.
2226 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2227 kmp_team_t *team, ompt_data_t *parallel_data,
2228 fork_context_e fork_context, void *codeptr) {
2229 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2230 if (ompt_enabled.ompt_callback_parallel_end) {
2231 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2232 parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
// Mark that we are no longer inside user code for this task.
2236 task_info->frame.enter_frame = ompt_data_none;
2237 __kmp_join_restore_state(thread, team);
// Join phase of a parallel region, paired with __kmp_fork_call.  The master
// thread waits for workers at the join barrier, fires ITT/OMPT notifications,
// then restores its own state (team, dispatch, task team, ICVs, affinity)
// back to the parent team and frees/returns the finished team.
2241 void __kmp_join_call(ident_t *loc, int gtid
2244 enum fork_context_e fork_context
2248 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2250 kmp_team_t *parent_team;
2251 kmp_info_t *master_th;
2255 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2257 /* setup current data */
2258 master_th = __kmp_threads[gtid];
2259 root = master_th->th.th_root;
2260 team = master_th->th.th_team;
2261 parent_team = team->t.t_parent;
2263 master_th->th.th_ident = loc;
2266 if (ompt_enabled.enabled) {
2267 master_th->th.ompt_thread_info.state = ompt_state_overhead;
// Sanity check: the master's cached task team must match the team's task
// team for the master's current task state (skipped when exiting teams).
2272 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2273 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2274 "th_task_team = %p\n",
2275 __kmp_gtid_from_thread(master_th), team,
2276 team->t.t_task_team[master_th->th.th_task_state],
2277 master_th->th.th_task_team));
2278 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2279 team->t.t_task_team[master_th->th.th_task_state]);
// Serialized region: no join barrier needed; delegate to the serialized-
// parallel epilogue (with teams-construct level bookkeeping first).
2283 if (team->t.t_serialized) {
2284 if (master_th->th.th_teams_microtask) {
2285 // We are in teams construct
2286 int level = team->t.t_level;
2287 int tlevel = master_th->th.th_teams_level;
2288 if (level == tlevel) {
2289 // AC: we haven't incremented it earlier at start of teams construct,
2290 // so do it here - at the end of teams construct
2292 } else if (level == tlevel + 1) {
2293 // AC: we are exiting parallel inside teams, need to increment
2294 // serialization in order to restore it in the next call to
2295 // __kmpc_end_serialized_parallel
2296 team->t.t_serialized++;
2299 __kmpc_end_serialized_parallel(loc, gtid);
2302 if (ompt_enabled.enabled) {
2303 __kmp_join_restore_state(master_th, parent_team);
2310 master_active = team->t.t_master_active;
2313 // AC: No barrier for internal teams at exit from teams construct.
2314 // But there is barrier for external team (league).
2315 __kmp_internal_join(loc, gtid, team);
2317 master_th->th.th_task_state =
2318 0; // AC: no tasking in teams (out of any parallel)
// Capture OMPT data before the team structure may be reused/freed below.
2324 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2325 void *codeptr = team->t.ompt_team_info.master_return_address;
2329 if (__itt_stack_caller_create_ptr) {
2330 __kmp_itt_stack_caller_destroy(
2331 (__itt_caller)team->t
2332 .t_stack_id); // destroy the stack stitching id after join barrier
2335 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2336 if (team->t.t_active_level == 1 &&
2337 !master_th->th.th_teams_microtask) { /* not in teams construct */
2338 master_th->th.th_ident = loc;
2339 // only one notification scheme (either "submit" or "forking/joined", not
2341 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2342 __kmp_forkjoin_frames_mode == 3)
2343 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2344 master_th->th.th_frame_time, 0, loc,
2345 master_th->th.th_team_nproc, 1);
2346 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2347 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2348 __kmp_itt_region_joined(gtid);
2349 } // active_level == 1
2350 #endif /* USE_ITT_BUILD */
// Exiting a parallel nested directly inside teams: keep the (hot) team
// structure for reuse and only adjust nesting levels / thread counts.
2352 if (master_th->th.th_teams_microtask && !exit_teams &&
2353 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2354 team->t.t_level == master_th->th.th_teams_level + 1) {
2355 // AC: We need to leave the team structure intact at the end of parallel
2356 // inside the teams construct, so that at the next parallel same (hot) team
2357 // works, only adjust nesting levels
2359 /* Decrement our nested depth level */
2361 team->t.t_active_level--;
2362 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2364 // Restore number of threads in the team if needed. This code relies on
2365 // the proper adjustment of th_teams_size.nth after the fork in
2366 // __kmp_teams_master on each teams master in the case that
2367 // __kmp_reserve_threads reduced it.
2368 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2369 int old_num = master_th->th.th_team_nproc;
2370 int new_num = master_th->th.th_teams_size.nth;
2371 kmp_info_t **other_threads = team->t.t_threads;
2372 team->t.t_nproc = new_num;
2373 for (int i = 0; i < old_num; ++i) {
2374 other_threads[i]->th.th_team_nproc = new_num;
2376 // Adjust states of non-used threads of the team
2377 for (int i = old_num; i < new_num; ++i) {
2378 // Re-initialize thread's barrier data.
2379 KMP_DEBUG_ASSERT(other_threads[i]);
2380 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2381 for (int b = 0; b < bs_last_barrier; ++b) {
2382 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2383 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2385 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2388 if (__kmp_tasking_mode != tskm_immediate_exec) {
2389 // Synchronize thread's task state
2390 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2396 if (ompt_enabled.enabled) {
2397 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2405 /* do cleanup and restore the parent team */
2406 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2407 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2409 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2411 /* jc: The following lock has instructions with REL and ACQ semantics,
2412 separating the parallel user code called in this parallel region
2413 from the serial user code called after this function returns. */
2414 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2416 if (!master_th->th.th_teams_microtask ||
2417 team->t.t_level > master_th->th.th_teams_level) {
2418 /* Decrement our nested depth level */
2419 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2421 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2424 if (ompt_enabled.enabled) {
2425 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2426 if (ompt_enabled.ompt_callback_implicit_task) {
2427 int ompt_team_size = team->t.t_nproc;
2428 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2429 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2430 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2433 task_info->frame.exit_frame = ompt_data_none;
2434 task_info->task_data = ompt_data_none;
2438 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2440 __kmp_pop_current_task_from_thread(master_th);
2442 #if KMP_AFFINITY_SUPPORTED
2443 // Restore master thread's partition.
2444 master_th->th.th_first_place = team->t.t_first_place;
2445 master_th->th.th_last_place = team->t.t_last_place;
2446 #endif // KMP_AFFINITY_SUPPORTED
2447 master_th->th.th_def_allocator = team->t.t_def_allocator;
2449 updateHWFPControl(team);
2451 if (root->r.r_active != master_active)
2452 root->r.r_active = master_active;
2454 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2455 master_th)); // this will free worker threads
2457 /* this race was fun to find. make sure the following is in the critical
2458 region otherwise assertions may fail occasionally since the old team may be
2459 reallocated and the hierarchy appears inconsistent. it is actually safe to
2460 run and won't cause any bugs, but will cause those assertion failures. it's
2461 only one deref&assign so might as well put this in the critical region */
2462 master_th->th.th_team = parent_team;
2463 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2464 master_th->th.th_team_master = parent_team->t.t_threads[0];
2465 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2467 /* restore serialized team, if need be */
2468 if (parent_team->t.t_serialized &&
2469 parent_team != master_th->th.th_serial_team &&
2470 parent_team != root->r.r_root_team) {
2471 __kmp_free_team(root,
2472 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2473 master_th->th.th_serial_team = parent_team;
// Pop the master's saved task state and re-attach it to the parent team's
// task team.
2476 if (__kmp_tasking_mode != tskm_immediate_exec) {
2477 if (master_th->th.th_task_state_top >
2478 0) { // Restore task state from memo stack
2479 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2480 // Remember master's state if we re-use this nested hot team
2481 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2482 master_th->th.th_task_state;
2483 --master_th->th.th_task_state_top; // pop
2484 // Now restore state at this level
2485 master_th->th.th_task_state =
2487 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2489 // Copy the task team from the parent team to the master thread
2490 master_th->th.th_task_team =
2491 parent_team->t.t_task_team[master_th->th.th_task_state];
2493 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2494 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2498 // TODO: GEH - cannot do this assertion because root thread not set up as
2500 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2501 master_th->th.th_current_task->td_flags.executing = 1;
2503 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2506 if (ompt_enabled.enabled) {
2507 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2513 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2516 /* Check whether we should push an internal control record onto the
2517 serial team stack. If so, do it. */
// Push a copy of the thread's current ICVs onto the serial team's internal
// control stack if this is the first ICV change at the current serialization
// level; the record is restored when the serialized region ends.  (The
// branches computing the push decision are partially elided in this view.)
2518 void __kmp_save_internal_controls(kmp_info_t *thread) {
2520 if (thread->th.th_team != thread->th.th_serial_team) {
2523 if (thread->th.th_team->t.t_serialized > 1) {
2526 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2529 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2530 thread->th.th_team->t.t_serialized) {
2534 if (push) { /* push a record on the serial team's stack */
2535 kmp_internal_control_t *control =
2536 (kmp_internal_control_t *)__kmp_allocate(
2537 sizeof(kmp_internal_control_t));
2539 copy_icvs(control, &thread->th.th_current_task->td_icvs);
// Tag the record with the nesting level so it is popped at the right time.
2541 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2543 control->next = thread->th.th_team->t.t_control_stack_top;
2544 thread->th.th_team->t.t_control_stack_top = control;
2549 /* Changes set_nproc */
// Implements omp_set_num_threads(): clamps new_nth to [1, __kmp_max_nth],
// updates the calling thread's nproc ICV, and — when the root is inactive
// and the hot team is larger than the new value — shrinks the hot team
// immediately instead of waiting for the next parallel region.
2550 void __kmp_set_num_threads(int new_nth, int gtid) {
2554 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2555 KMP_DEBUG_ASSERT(__kmp_init_serial);
2559 else if (new_nth > __kmp_max_nth)
2560 new_nth = __kmp_max_nth;
2562 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2563 thread = __kmp_threads[gtid];
2564 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2565 return; // nothing to do
// Preserve current ICVs before modifying them (serialized-region semantics).
2567 __kmp_save_internal_controls(thread);
2569 set__nproc(thread, new_nth);
2571 // If this omp_set_num_threads() call will cause the hot team size to be
2572 // reduced (in the absence of a num_threads clause), then reduce it now,
2573 // rather than waiting for the next parallel region.
2574 root = thread->th.th_root;
2575 if (__kmp_init_parallel && (!root->r.r_active) &&
2576 (root->r.r_hot_team->t.t_nproc > new_nth)
2577 #if KMP_NESTED_HOT_TEAMS
2578 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2581 kmp_team_t *hot_team = root->r.r_hot_team;
2584 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2586 // Release the extra threads we don't need any more.
2587 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2588 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2589 if (__kmp_tasking_mode != tskm_immediate_exec) {
2590 // When decreasing team size, threads no longer in the team should unref
2592 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2594 __kmp_free_thread(hot_team->t.t_threads[f]);
2595 hot_team->t.t_threads[f] = NULL;
2597 hot_team->t.t_nproc = new_nth;
2598 #if KMP_NESTED_HOT_TEAMS
2599 if (thread->th.th_hot_teams) {
2600 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team)
;
2601 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2605 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2607 // Update the t_nproc field in the threads that are still active.
2608 for (f = 0; f < new_nth; f++) {
2609 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2610 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2612 // Special flag in case omp_set_num_threads() call
2613 hot_team->t.t_size_changed = -1;
2617 /* Changes max_active_levels */
// Implements omp_set_max_active_levels(): ignores negative values with a
// warning, clamps values above KMP_MAX_ACTIVE_LEVELS_LIMIT with a warning,
// then stores the validated value in the thread's max-active-levels ICV.
2618 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2621 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2623 gtid, max_active_levels));
2624 KMP_DEBUG_ASSERT(__kmp_init_serial);
2626 // validate max_active_levels
2627 if (max_active_levels < 0) {
2628 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2629 // We ignore this call if the user has specified a negative value.
2630 // The current setting won't be changed. The last valid setting will be
2631 // used. A warning will be issued (if warnings are allowed as controlled by
2632 // the KMP_WARNINGS env var).
2633 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2634 "max_active_levels for thread %d = (%d)\n",
2635 gtid, max_active_levels));
2638 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2639 // it's OK, the max_active_levels is within the valid range: [ 0;
2640 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2641 // We allow a zero value. (implementation defined behavior)
2643 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2644 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2645 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2646 // Current upper limit is MAX_INT. (implementation defined behavior)
2647 // If the input exceeds the upper limit, we correct the input to be the
2648 // upper limit. (implementation defined behavior)
2649 // Actually, the flow should never get here until we use MAX_INT limit.
2651 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2652 "max_active_levels for thread %d = (%d)\n",
2653 gtid, max_active_levels));
2655 thread = __kmp_threads[gtid];
// Preserve current ICVs before modifying them (serialized-region semantics).
2657 __kmp_save_internal_controls(thread);
2659 set__max_active_levels(thread, max_active_levels);
2662 /* Gets max_active_levels */
// Implements omp_get_max_active_levels(): returns the max-active-levels ICV
// stored in the calling thread's current task.
2663 int __kmp_get_max_active_levels(int gtid) {
2666 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2667 KMP_DEBUG_ASSERT(__kmp_init_serial);
2669 thread = __kmp_threads[gtid];
2670 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2671 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2672 "curtask_maxaclevel=%d\n",
2673 gtid, thread->th.th_current_task,
2674 thread->th.th_current_task->td_icvs.max_active_levels));
2675 return thread->th.th_current_task->td_icvs.max_active_levels;
// Compile-time checks: the schedule enums must be int-sized because the code
// below freely converts between kmp_sched_t, sched_type, and int.
2678 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2679 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2681 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
// Implements omp_set_schedule(): validates the requested schedule kind
// (falling back to the default with a warning on out-of-range kinds), maps it
// through __kmp_sch_map to an internal sched_type, re-applies any schedule
// modifiers, and stores kind+chunk in the calling thread's sched ICV.
2682 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2684 kmp_sched_t orig_kind;
2685 // kmp_team_t *team;
2687 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2688 gtid, (int)kind, chunk));
2689 KMP_DEBUG_ASSERT(__kmp_init_serial);
2691 // Check if the kind parameter is valid, correct if needed.
2692 // Valid parameters should fit in one of two intervals - standard or extended:
2693 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2694 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
// Strip monotonic/nonmonotonic modifier bits before range-checking the kind.
2696 kind = __kmp_sched_without_mods(kind);
2698 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2699 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2700 // TODO: Hint needs attention in case we change the default schedule.
2701 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2702 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2704 kind = kmp_sched_default;
2705 chunk = 0; // ignore chunk value in case of bad kind
2708 thread = __kmp_threads[gtid];
// Preserve current ICVs before modifying them (serialized-region semantics).
2710 __kmp_save_internal_controls(thread);
2712 if (kind < kmp_sched_upper_std) {
2713 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2714 // differ static chunked vs. unchunked: chunk should be invalid to
2715 // indicate unchunked schedule (which is the default)
2716 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2718 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2719 __kmp_sch_map[kind - kmp_sched_lower - 1];
2722 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2723 // kmp_sched_lower - 2 ];
2724 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2725 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2726 kmp_sched_lower - 2];
// Re-apply the modifier bits that were stripped for validation above.
2728 __kmp_sched_apply_mods_intkind(
2729 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2730 if (kind == kmp_sched_auto || chunk < 1) {
2731 // ignore parameter chunk for schedule auto
2732 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2734 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2738 /* Gets def_sched_var ICV values */
// Implements omp_get_schedule(): translates the thread's internal sched_type
// back into a user-visible kmp_sched_t (plus modifiers) and reports the
// stored chunk via the out-parameters *kind and *chunk.
2739 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2741 enum sched_type th_type;
2743 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2744 KMP_DEBUG_ASSERT(__kmp_init_serial);
2746 thread = __kmp_threads[gtid];
2748 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2749 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2750 case kmp_sch_static:
2751 case kmp_sch_static_greedy:
2752 case kmp_sch_static_balanced:
2753 *kind = kmp_sched_static;
2754 __kmp_sched_apply_mods_stdkind(kind, th_type);
2755 *chunk = 0; // chunk was not set, try to show this fact via zero value
2757 case kmp_sch_static_chunked:
2758 *kind = kmp_sched_static;
2760 case kmp_sch_dynamic_chunked:
2761 *kind = kmp_sched_dynamic;
2763 case kmp_sch_guided_chunked:
2764 case kmp_sch_guided_iterative_chunked:
2765 case kmp_sch_guided_analytical_chunked:
2766 *kind = kmp_sched_guided;
2769 *kind = kmp_sched_auto;
2771 case kmp_sch_trapezoidal:
2772 *kind = kmp_sched_trapezoidal;
2774 #if KMP_STATIC_STEAL_ENABLED
2775 case kmp_sch_static_steal:
2776 *kind = kmp_sched_static_steal;
// Unknown internal schedule type is a fatal runtime error.
2780 KMP_FATAL(UnknownSchedulingType, th_type);
2783 __kmp_sched_apply_mods_stdkind(kind, th_type);
2784 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
// Implements omp_get_ancestor_thread_num(): walks up the team hierarchy
// (counting serialized teams as levels) to find the thread number of this
// thread's ancestor at the given nesting level.  Teams constructs share a
// level, so the walk artificially adds 1 or 2 levels to pass the league.
2787 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2793 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2794 KMP_DEBUG_ASSERT(__kmp_init_serial);
2801 thr = __kmp_threads[gtid];
2802 team = thr->th.th_team;
2803 ii = team->t.t_level;
2807 if (thr->th.th_teams_microtask) {
2808 // AC: we are in teams region where multiple nested teams have same level
2809 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2811 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2812 KMP_DEBUG_ASSERT(ii >= tlevel);
2813 // AC: As we need to pass by the teams league, we need to artificially
2816 ii += 2; // three teams have same level
2818 ii++; // two teams have same level
// Asking for the innermost level: the answer is just this thread's tid.
2824 return __kmp_tid_from_gtid(gtid);
2826 dd = team->t.t_serialized;
2828 while (ii > level) {
2829 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2831 if ((team->t.t_serialized) && (!dd)) {
2832 team = team->t.t_parent;
2836 team = team->t.t_parent;
2837 dd = team->t.t_serialized;
// In a serialized team the ancestor's thread number is 0.
2842 return (dd > 1) ? (0) : (team->t.t_master_tid);
// Implements omp_get_team_size(): walks up the team hierarchy (same traversal
// as __kmp_get_ancestor_thread_num, including the teams-level adjustment) and
// returns t_nproc of the team at the requested nesting level.
2845 int __kmp_get_team_size(int gtid, int level) {
2851 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2852 KMP_DEBUG_ASSERT(__kmp_init_serial);
2859 thr = __kmp_threads[gtid];
2860 team = thr->th.th_team;
2861 ii = team->t.t_level;
2865 if (thr->th.th_teams_microtask) {
2866 // AC: we are in teams region where multiple nested teams have same level
2867 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2869 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2870 KMP_DEBUG_ASSERT(ii >= tlevel);
2871 // AC: As we need to pass by the teams league, we need to artificially
2874 ii += 2; // three teams have same level
2876 ii++; // two teams have same level
2881 while (ii > level) {
2882 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2884 if (team->t.t_serialized && (!dd)) {
2885 team = team->t.t_parent;
2889 team = team->t.t_parent;
2894 return team->t.t_nproc;
// Build the effective global run-time schedule from the four globals
// __kmp_sched/__kmp_chunk/__kmp_static/__kmp_guided, replacing the generic
// STATIC/GUIDED kinds with their configured detailed variants and
// substituting KMP_DEFAULT_CHUNK when no valid chunk was ever set.
2897 kmp_r_sched_t __kmp_get_schedule_global() {
2898 // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2899 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2900 // independently. So one can get the updated schedule here.
2902 kmp_r_sched_t r_sched;
2904 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2905 // __kmp_guided. __kmp_sched should keep original value, so that user can set
2906 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2907 // different roots (even in OMP 2.5)
2908 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2909 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2910 if (s == kmp_sch_static) {
2911 // replace STATIC with more detailed schedule (balanced or greedy)
2912 r_sched.r_sched_type = __kmp_static;
2913 } else if (s == kmp_sch_guided_chunked) {
2914 // replace GUIDED with more detailed schedule (iterative or analytical)
2915 r_sched.r_sched_type = __kmp_guided;
2916 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2917 r_sched.r_sched_type = __kmp_sched;
// Re-attach any monotonic/nonmonotonic modifiers stripped above.
2919 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2921 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2922 // __kmp_chunk may be wrong here (if it was not ever set)
2923 r_sched.chunk = KMP_DEFAULT_CHUNK;
2925 r_sched.chunk = __kmp_chunk;
2931 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2932 at least argc number of *t_argv entries for the requested team. */
// Ensure the team's t_argv array can hold at least argc entries.  Small
// argument counts reuse the inline cache-line buffer t_inline_argv; larger
// counts get a page-allocated heap array (freed first if reallocating).
2933 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2935 KMP_DEBUG_ASSERT(team);
2936 if (!realloc || argc > team->t.t_max_argc) {
2938 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2939 "current entries=%d\n",
2940 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2941 /* if previously allocated heap space for args, free them */
2942 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2943 __kmp_free((void *)team->t.t_argv);
2945 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2946 /* use unused space in the cache line for arguments */
2947 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2948 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2950 team->t.t_id, team->t.t_max_argc));
2951 team->t.t_argv = &team->t.t_inline_argv[0];
2952 if (__kmp_storage_map) {
2953 __kmp_print_storage_map_gtid(
2954 -1, &team->t.t_inline_argv[0],
2955 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2956 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2960 /* allocate space for arguments in the heap */
// Round small requests up to the minimum malloc'ed size to limit reallocs.
2961 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2962 ? KMP_MIN_MALLOC_ARGV_ENTRIES
2964 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
2966 team->t.t_id, team->t.t_max_argc));
2968 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
2969 if (__kmp_storage_map) {
2970 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
2971 &team->t.t_argv[team->t.t_max_argc],
2972 sizeof(void *) * team->t.t_max_argc,
2973 "team_%d.t_argv", team->t.t_id);
// Allocate the per-team arrays sized for max_nth threads: thread pointers,
// shared dispatch buffers, per-thread dispatch structs, and implicit task
// data; then initialize each dispatch buffer's indices.
// NOTE(review): elided listing -- the `int i;` declaration, the t_threads
// assignment line, and the closing braces are not shown here.
2979 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
// A single-thread team only needs 2 dispatch buffers.
2981 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2983 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
2984 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
2985 sizeof(dispatch_shared_info_t) * num_disp_buff);
2986 team->t.t_dispatch =
2987 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
2988 team->t.t_implicit_task_taskdata =
2989 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
2990 team->t.t_max_nproc = max_nth;
2992 /* setup dispatch buffers */
2993 for (i = 0; i < num_disp_buff; ++i) {
2994 team->t.t_disp_buffer[i].buffer_index = i;
2995 team->t.t_disp_buffer[i].doacross_buf_idx = i;
// Free everything __kmp_allocate_team_arrays allocated, plus any per-thread
// dispatch private buffers, and NULL the pointers so a later reallocation
// starts clean.
2999 static void __kmp_free_team_arrays(kmp_team_t *team) {
3000 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3002 for (i = 0; i < team->t.t_max_nproc; ++i) {
3003 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3004 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3005 team->t.t_dispatch[i].th_disp_buffer = NULL;
3008 #if KMP_USE_HIER_SCHED
// Hierarchical scheduling keeps extra per-team state; release it too.
3009 __kmp_dispatch_free_hierarchies(team);
3011 __kmp_free(team->t.t_threads);
3012 __kmp_free(team->t.t_disp_buffer);
3013 __kmp_free(team->t.t_dispatch);
3014 __kmp_free(team->t.t_implicit_task_taskdata);
3015 team->t.t_threads = NULL;
3016 team->t.t_disp_buffer = NULL;
3017 team->t.t_dispatch = NULL;
3018 team->t.t_implicit_task_taskdata = 0;
// Resize the team's arrays for max_nth threads. Only the existing thread
// pointers (t_nproc entries) are preserved; the dispatch/taskdata arrays are
// recreated from scratch by __kmp_allocate_team_arrays.
3021 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
// Keep the old thread-pointer array alive until the copy below is done.
3022 kmp_info_t **oldThreads = team->t.t_threads;
3024 __kmp_free(team->t.t_disp_buffer);
3025 __kmp_free(team->t.t_dispatch);
3026 __kmp_free(team->t.t_implicit_task_taskdata);
3027 __kmp_allocate_team_arrays(team, max_nth);
3029 KMP_MEMCPY(team->t.t_threads, oldThreads,
3030 team->t.t_nproc * sizeof(kmp_info_t *));
3032 __kmp_free(oldThreads);
// Build a snapshot of the current global internal control variables (ICVs).
// Used as the initial ICV set for new roots and teams.
3035 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3037 kmp_r_sched_t r_sched =
3038 __kmp_get_schedule_global(); // get current state of scheduling globals
3040 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
// Field order must match the declaration of kmp_internal_control_t.
3042 kmp_internal_control_t g_icvs = {
3043 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3044 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3045 // adjustment of threads (per thread)
3046 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3047 // whether blocktime is explicitly set
3048 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3050 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3053 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3054 // next parallel region (per thread)
3055 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3056 __kmp_cg_max_nth, // int thread_limit;
3057 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3058 // for max_active_levels
3059 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3060 // {sched,chunk} pair
3061 __kmp_nested_proc_bind.bind_types[0], // outermost proc-bind policy
3062 __kmp_default_device, // default device ICV
3063 NULL // struct kmp_internal_control *next;
// Snapshot the ICVs currently in effect for `team` by copying them from the
// master thread's current task, with serial_nesting_level reset to 0 and the
// `next` link cleared.
3069 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3071 kmp_internal_control_t gx_icvs;
3072 gx_icvs.serial_nesting_level =
3073 0; // probably =team->t.t_serial like in save_inter_controls
// copy_icvs overwrites all fields from the master's task ICVs...
3074 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
// ...so reset the chain link afterwards.
3075 gx_icvs.next = NULL;
// One-time initialization of a root structure: set up root state, allocate
// the (serialized, single-thread) root team, then allocate the hot team with
// capacity __kmp_dflt_team_nth_ub * 2 and clear its thread slots.
// NOTE(review): elided listing -- some original lines are not shown here.
3080 static void __kmp_initialize_root(kmp_root_t *root) {
3082 kmp_team_t *root_team;
3083 kmp_team_t *hot_team;
3084 int hot_team_max_nth;
3085 kmp_r_sched_t r_sched =
3086 __kmp_get_schedule_global(); // get current state of scheduling globals
3087 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3088 KMP_DEBUG_ASSERT(root);
3089 KMP_ASSERT(!root->r.r_begin);
3091 /* setup the root state structure */
3092 __kmp_init_lock(&root->r.r_begin_lock);
3093 root->r.r_begin = FALSE;
3094 root->r.r_active = FALSE;
3095 root->r.r_in_parallel = 0;
3096 root->r.r_blocktime = __kmp_dflt_blocktime;
3098 /* setup the root team for this task */
3099 /* allocate the root team structure */
3100 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3103 __kmp_allocate_team(root,
3107 ompt_data_none, // root parallel id
3109 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3111 USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3114 // Non-NULL value should be assigned to make the debugger display the root
3116 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3119 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3121 root->r.r_root_team = root_team;
3122 root_team->t.t_control_stack_top = NULL;
3124 /* initialize root team */
3125 root_team->t.t_threads[0] = NULL;
3126 root_team->t.t_nproc = 1;
3127 root_team->t.t_serialized = 1;
3128 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3129 root_team->t.t_sched.sched = r_sched.sched;
3132 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3133 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3135 /* setup the hot team for this task */
3136 /* allocate the hot team structure */
3137 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3140 __kmp_allocate_team(root,
3142 __kmp_dflt_team_nth_ub * 2, // max_nproc
3144 ompt_data_none, // root parallel id
3146 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3148 USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3150 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3152 root->r.r_hot_team = hot_team;
// NOTE(review): this re-clears root_team's field right after the hot team is
// created; possibly hot_team->t.t_control_stack_top was intended -- confirm
// against upstream before changing.
3153 root_team->t.t_control_stack_top = NULL;
3155 /* first-time initialization */
3156 hot_team->t.t_parent = root_team;
3158 /* initialize hot team */
3159 hot_team_max_nth = hot_team->t.t_max_nproc;
3160 for (f = 0; f < hot_team_max_nth; ++f) {
3161 hot_team->t.t_threads[f] = NULL;
3163 hot_team->t.t_nproc = 1;
3164 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3165 hot_team->t.t_sched.sched = r_sched.sched;
3166 hot_team->t.t_size_changed = 0;
// Singly-linked list of teams used only by __kmp_print_structure (debug dump).
// The list is terminated by a sentinel item whose entry and next are both NULL.
3171 typedef struct kmp_team_list_item {
3172 kmp_team_p const *entry;
3173 struct kmp_team_list_item *next;
3174 } kmp_team_list_item_t;
3175 typedef kmp_team_list_item_t *kmp_team_list_t;
3177 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3178 kmp_team_list_t list, // List of teams.
3179 kmp_team_p const *team // Team to add.
3182 // List must terminate with item where both entry and next are NULL.
3183 // Team is added to the list only once.
3184 // List is sorted in ascending order by team id.
3185 // Team id is *not* a key.
3189 KMP_DEBUG_ASSERT(list != NULL);
// Recursively accumulate ancestors and pool successors first, so the whole
// reachable team graph ends up in the list.
3194 __kmp_print_structure_team_accum(list, team->t.t_parent);
3195 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3197 // Search list for the team.
3199 while (l->next != NULL && l->entry != team) {
3202 if (l->next != NULL) {
3203 return; // Team has been added before, exit.
3206 // Team is not found. Search list again for insertion point.
// Walk until the first item with a larger team id (ids may repeat).
3208 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
// Allocate a new list item and splice it in at the insertion point.
3214 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3215 sizeof(kmp_team_list_item_t));
// Print "<title><id> <ptr>" for a team, or "- (nil)" when team is NULL.
3222 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3225 __kmp_printf("%s", title);
3227 __kmp_printf("%2x %p\n", team->t.t_id, team);
3229 __kmp_printf(" - (nil)\n");
// Print "<title><gtid> <ptr>" for a thread, or "- (nil)" when thread is NULL.
3233 static void __kmp_print_structure_thread(char const *title,
3234 kmp_info_p const *thread) {
3235 __kmp_printf("%s", title);
3236 if (thread != NULL) {
3237 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3239 __kmp_printf(" - (nil)\n");
// Debug dump of the whole runtime structure: the global thread table, each
// thread's teams, each root (uber) thread, every reachable team (collected
// into a sorted list via __kmp_print_structure_team_accum), and the thread /
// team pools. The accumulated team list is freed before returning.
3243 void __kmp_print_structure(void) {
3245 kmp_team_list_t list;
3247 // Initialize list of teams.
// Start with the NULL/NULL sentinel item.
3249 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3253 __kmp_printf("\n------------------------------\nGlobal Thread "
3254 "Table\n------------------------------\n");
3257 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3258 __kmp_printf("%2d", gtid);
3259 if (__kmp_threads != NULL) {
3260 __kmp_printf(" %p", __kmp_threads[gtid]);
3262 if (__kmp_root != NULL) {
3263 __kmp_printf(" %p", __kmp_root[gtid]);
3269 // Print out __kmp_threads array.
3270 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3272 if (__kmp_threads != NULL) {
3274 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3275 kmp_info_t const *thread = __kmp_threads[gtid];
3276 if (thread != NULL) {
3277 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3278 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3279 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3280 __kmp_print_structure_team(" Serial Team: ",
3281 thread->th.th_serial_team);
3282 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3283 __kmp_print_structure_thread(" Master: ",
3284 thread->th.th_team_master);
3285 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3286 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3287 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3288 __kmp_print_structure_thread(" Next in pool: ",
3289 thread->th.th_next_pool);
// Remember this thread's teams for the Teams section below.
3291 __kmp_print_structure_team_accum(list, thread->th.th_team);
3292 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3296 __kmp_printf("Threads array is not allocated.\n");
3299 // Print out __kmp_root array.
3300 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3302 if (__kmp_root != NULL) {
3304 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3305 kmp_root_t const *root = __kmp_root[gtid];
3307 __kmp_printf("GTID %2d %p:\n", gtid, root);
3308 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3309 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3310 __kmp_print_structure_thread(" Uber Thread: ",
3311 root->r.r_uber_thread);
3312 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3313 __kmp_printf(" In Parallel: %2d\n",
3314 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3316 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3317 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3321 __kmp_printf("Ubers array is not allocated.\n");
3324 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3326 while (list->next != NULL) {
3327 kmp_team_p const *team = list->entry;
3329 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3330 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3331 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3332 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3333 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3334 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3335 for (i = 0; i < team->t.t_nproc; ++i) {
3336 __kmp_printf(" Thread %2d: ", i);
3337 __kmp_print_structure_thread("", team->t.t_threads[i]);
3339 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3344 // Print out __kmp_thread_pool and __kmp_team_pool.
3345 __kmp_printf("\n------------------------------\nPools\n----------------------"
3347 __kmp_print_structure_thread("Thread pool: ",
3348 CCAST(kmp_info_t *, __kmp_thread_pool));
3349 __kmp_print_structure_team("Team pool: ",
3350 CCAST(kmp_team_t *, __kmp_team_pool));
// Free the accumulated team list (including the sentinel).
3354 while (list != NULL) {
3355 kmp_team_list_item_t *item = list;
3357 KMP_INTERNAL_FREE(item);
3363 //---------------------------------------------------------------------------
3364 // Stuff for per-thread fast random number generator
// Table of primes used as per-thread multipliers (th_a) for the linear
// congruential generator in __kmp_get_random; each thread picks one entry
// in __kmp_init_random based on its tid.
3366 static const unsigned __kmp_primes[] = {
3367 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3368 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3369 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3370 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3371 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3372 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3373 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3374 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3375 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3376 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3377 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3379 //---------------------------------------------------------------------------
3380 // __kmp_get_random: Get a random number using a linear congruential method.
// Returns the top 16 bits of the thread's LCG state, then advances the state
// as x = x * th_a + 1 (mod 2^32, via unsigned wraparound). Per-thread state
// means no locking is needed.
3381 unsigned short __kmp_get_random(kmp_info_t *thread) {
3382 unsigned x = thread->th.th_x;
// High bits have a longer period than low bits in an LCG, so use them.
3383 unsigned short r = x >> 16;
3385 thread->th.th_x = x * thread->th.th_a + 1;
3387 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3388 thread->th.th_info.ds.ds_tid, r));
3392 //--------------------------------------------------------
3393 // __kmp_init_random: Initialize a random number generator
// Seeds the per-thread LCG from the thread's tid: the multiplier th_a is
// chosen from the __kmp_primes table (indexed by seed mod table size), and
// the initial state th_x is derived from seed and th_a.
3394 void __kmp_init_random(kmp_info_t *thread) {
3395 unsigned seed = thread->th.th_info.ds.ds_tid;
3398 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3399 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3401 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3405 /* reclaim array entries for root threads that are already dead, returns number
// Scans the whole __kmp_threads array for uber (root) gtids whose thread has
// exited and whose root is no longer active, and unregisters each; the sum of
// freed entries is accumulated in r.
3407 static int __kmp_reclaim_dead_roots(void) {
3410 for (i = 0; i < __kmp_threads_capacity; ++i) {
3411 if (KMP_UBER_GTID(i) &&
3412 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3414 ->r.r_active) { // AC: reclaim only roots died in non-active state
3415 r += __kmp_unregister_root_other_thread(i);
3422 /* This function attempts to create free entries in __kmp_threads and
3423 __kmp_root, and returns the number of free entries generated.
3425 For Windows* OS static library, the first mechanism used is to reclaim array
3426 entries for root threads that are already dead.
3428 On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3429 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3430 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3431 threadprivate cache array has been created. Synchronization with
3432 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3434 After any dead root reclamation, if the clipping value allows array expansion
3435 to result in the generation of a total of nNeed free slots, the function does
3436 that expansion. If not, nothing is done beyond the possible initial root
3439 If any argument is negative, the behavior is undefined. */
3440 static int __kmp_expand_threads(int nNeed) {
3442 int minimumRequiredCapacity;
3444 kmp_info_t **newThreads;
3445 kmp_root_t **newRoot;
3447 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3448 // resizing __kmp_threads does not need additional protection if foreign
3449 // threads are present
3451 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3452 /* only for Windows static library */
3453 /* reclaim array entries for root threads that are already dead */
3454 added = __kmp_reclaim_dead_roots();
3465 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3466 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3467 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3468 // > __kmp_max_nth in one of two ways:
3470 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3471 // may not be reused by another thread, so we may need to increase
3472 // __kmp_threads_capacity to __kmp_max_nth + 1.
3474 // 2) New foreign root(s) are encountered. We always register new foreign
3475 // roots. This may cause a smaller # of threads to be allocated at
3476 // subsequent parallel regions, but the worker threads hang around (and
3477 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3479 // Anyway, that is the reason for moving the check to see if
3480 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3481 // instead of having it performed here. -BB
3483 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3485 /* compute expansion headroom to check if we can expand */
3486 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3487 /* possible expansion too small -- give up */
3490 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3492 newCapacity = __kmp_threads_capacity;
// Double capacity each step, clipped to the absolute system max.
3494 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3495 : __kmp_sys_max_nth;
3496 } while (newCapacity < minimumRequiredCapacity);
// Both arrays live in one allocation: threads first, then roots.
3497 newThreads = (kmp_info_t **)__kmp_allocate(
3498 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3500 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3501 KMP_MEMCPY(newThreads, __kmp_threads,
3502 __kmp_threads_capacity * sizeof(kmp_info_t *));
3503 KMP_MEMCPY(newRoot, __kmp_root,
3504 __kmp_threads_capacity * sizeof(kmp_root_t *));
// Publish the new arrays before freeing the old thread array; capacity is
// published last so readers never see a capacity larger than the arrays.
3506 kmp_info_t **temp_threads = __kmp_threads;
3507 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3508 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3509 __kmp_free(temp_threads);
3510 added += newCapacity - __kmp_threads_capacity;
3511 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3513 if (newCapacity > __kmp_tp_capacity) {
// Keep the threadprivate cache capacity in sync under its own lock.
3514 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3515 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3516 __kmp_threadprivate_resize_cache(newCapacity);
3517 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3518 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3520 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3526 /* Register the current thread as a root thread and obtain our gtid. We must
3527 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3528 thread that calls from __kmp_do_serial_initialize() */
// Full registration sequence: find/expand a thread slot, update global
// counts, create or reuse the root/uber thread structures, set up the serial
// team, install TLS gtid, init barrier data, affinity, and the contention
// group root; fires OMPT thread-begin/implicit-task callbacks when enabled.
// Returns the assigned gtid. NOTE(review): elided listing -- some original
// lines are not shown here.
3529 int __kmp_register_root(int initial_thread) {
3530 kmp_info_t *root_thread;
3534 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3535 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3539 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3540 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3541 work as expected -- it may return false (that means there is at least one
3542 empty slot in __kmp_threads array), but it is possible the only free slot
3543 is #0, which is reserved for initial thread and so cannot be used for this
3544 one. Following code workarounds this bug.
3546 However, right solution seems to be not reserving slot #0 for initial
3548 (1) there is no magic in slot #0,
3549 (2) we cannot detect initial thread reliably (the first thread which does
3550 serial initialization may be not a real initial thread).
3552 capacity = __kmp_threads_capacity;
// Slot #0 is reserved for the initial thread, so discount it here.
3553 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3557 /* see if there are too many threads */
3558 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3559 if (__kmp_tp_cached) {
3560 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3561 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3562 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3564 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3569 /* find an available thread slot */
3570 /* Don't reassign the zero slot since we need that to only be used by initial
3572 for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3576 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3577 KMP_ASSERT(gtid < __kmp_threads_capacity);
3579 /* update global accounting */
3581 TCW_4(__kmp_nth, __kmp_nth + 1);
3583 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3584 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3585 if (__kmp_adjust_gtid_mode) {
3586 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3587 if (TCR_4(__kmp_gtid_mode) != 2) {
3588 TCW_4(__kmp_gtid_mode, 2);
3591 if (TCR_4(__kmp_gtid_mode) != 1) {
3592 TCW_4(__kmp_gtid_mode, 1);
3597 #ifdef KMP_ADJUST_BLOCKTIME
3598 /* Adjust blocktime to zero if necessary */
3599 /* Middle initialization might not have occurred yet */
3600 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3601 if (__kmp_nth > __kmp_avail_proc) {
3602 __kmp_zero_bt = TRUE;
3605 #endif /* KMP_ADJUST_BLOCKTIME */
3607 /* setup this new hierarchy */
// Allocate a root structure for this gtid on first use.
3608 if (!(root = __kmp_root[gtid])) {
3609 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3610 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3613 #if KMP_STATS_ENABLED
3614 // Initialize stats as soon as possible (right after gtid assignment).
3615 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3616 __kmp_stats_thread_ptr->startLife();
3617 KMP_SET_THREAD_STATE(SERIAL_REGION);
3618 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3620 __kmp_initialize_root(root);
3622 /* setup new root thread structure */
// Reuse the existing uber thread if this root was registered before.
3623 if (root->r.r_uber_thread) {
3624 root_thread = root->r.r_uber_thread;
3626 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3627 if (__kmp_storage_map) {
3628 __kmp_print_thread_storage_map(root_thread, gtid);
3630 root_thread->th.th_info.ds.ds_gtid = gtid;
3632 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3634 root_thread->th.th_root = root;
3635 if (__kmp_env_consistency_check) {
3636 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3639 __kmp_initialize_fast_memory(root_thread);
3640 #endif /* USE_FAST_MEMORY */
3643 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3644 __kmp_initialize_bget(root_thread);
3646 __kmp_init_random(root_thread); // Initialize random number generator
3649 /* setup the serial team held in reserve by the root thread */
3650 if (!root_thread->th.th_serial_team) {
3651 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3652 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3653 root_thread->th.th_serial_team = __kmp_allocate_team(
3656 ompt_data_none, // root parallel id
3658 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3660 KMP_ASSERT(root_thread->th.th_serial_team);
3661 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3662 root_thread->th.th_serial_team));
3664 /* drop root_thread into place */
3665 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3667 root->r.r_root_team->t.t_threads[0] = root_thread;
3668 root->r.r_hot_team->t.t_threads[0] = root_thread;
3669 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3670 // AC: the team created in reserve, not for execution (it is unused for now).
3671 root_thread->th.th_serial_team->t.t_serialized = 0;
3672 root->r.r_uber_thread = root_thread;
3674 /* initialize the thread, get it ready to go */
3675 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3676 TCW_4(__kmp_init_gtid, TRUE);
3678 /* prepare the master thread for get_gtid() */
3679 __kmp_gtid_set_specific(gtid);
3682 __kmp_itt_thread_name(gtid);
3683 #endif /* USE_ITT_BUILD */
3685 #ifdef KMP_TDATA_GTID
3688 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3689 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3691 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3693 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3694 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3695 KMP_INIT_BARRIER_STATE));
3696 { // Initialize barrier data.
3698 for (b = 0; b < bs_last_barrier; ++b) {
3699 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3701 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3705 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3706 KMP_INIT_BARRIER_STATE);
3708 #if KMP_AFFINITY_SUPPORTED
3709 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3710 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3711 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3712 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3713 if (TCR_4(__kmp_init_middle)) {
3714 __kmp_affinity_set_init_mask(gtid, TRUE);
3716 #endif /* KMP_AFFINITY_SUPPORTED */
3717 root_thread->th.th_def_allocator = __kmp_def_allocator;
3718 root_thread->th.th_prev_level = 0;
3719 root_thread->th.th_prev_num_threads = 1;
// Each root starts its own contention group (OpenMP thread_limit scope).
3721 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3722 tmp->cg_root = root_thread;
3723 tmp->cg_thread_limit = __kmp_cg_max_nth;
3724 tmp->cg_nthreads = 1;
3725 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3726 " cg_nthreads init to 1\n",
3729 root_thread->th.th_cg_roots = tmp;
3731 __kmp_root_counter++;
3734 if (!initial_thread && ompt_enabled.enabled) {
3736 kmp_info_t *root_thread = ompt_get_thread();
3738 ompt_set_thread_state(root_thread, ompt_state_overhead);
3740 if (ompt_enabled.ompt_callback_thread_begin) {
3741 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3742 ompt_thread_initial, __ompt_get_thread_data_internal());
3744 ompt_data_t *task_data;
3745 ompt_data_t *parallel_data;
3746 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3747 if (ompt_enabled.ompt_callback_implicit_task) {
3748 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3749 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3752 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3757 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3762 #if KMP_NESTED_HOT_TEAMS
// Recursively free the nested hot team kept by `thr` at `level` (and below),
// returning the number of threads released. The master thread itself is not
// freed (n starts at nth - 1).
3763 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3764 const int max_level) {
3766 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3767 if (!hot_teams || !hot_teams[level].hot_team) {
3770 KMP_DEBUG_ASSERT(level < max_level);
3771 kmp_team_t *team = hot_teams[level].hot_team;
3772 nth = hot_teams[level].hot_team_nth;
3773 n = nth - 1; // master is not freed
// Descend first so deeper hot teams are freed before their parents.
3774 if (level < max_level - 1) {
3775 for (i = 0; i < nth; ++i) {
3776 kmp_info_t *th = team->t.t_threads[i];
3777 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
// Free the non-master workers' hot-team arrays once their teams are gone.
3778 if (i > 0 && th->th.th_hot_teams) {
3779 __kmp_free(th->th.th_hot_teams);
3780 th->th.th_hot_teams = NULL;
3784 __kmp_free_team(root, team, NULL);
3789 // Resets a root thread and clear its root and hot teams.
3790 // Returns the number of __kmp_threads entries directly and indirectly freed.
3791 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3792 kmp_team_t *root_team = root->r.r_root_team;
3793 kmp_team_t *hot_team = root->r.r_hot_team;
3794 int n = hot_team->t.t_nproc;
3797 KMP_DEBUG_ASSERT(!root->r.r_active);
3799 root->r.r_root_team = NULL;
3800 root->r.r_hot_team = NULL;
3801 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3802 // before call to __kmp_free_team().
3803 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3804 #if KMP_NESTED_HOT_TEAMS
3805 if (__kmp_hot_teams_max_level >
3806 0) { // need to free nested hot teams and their threads if any
3807 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3808 kmp_info_t *th = hot_team->t.t_threads[i];
3809 if (__kmp_hot_teams_max_level > 1) {
3810 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3812 if (th->th.th_hot_teams) {
3813 __kmp_free(th->th.th_hot_teams);
3814 th->th.th_hot_teams = NULL;
3819 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3821 // Before we can reap the thread, we need to make certain that all other
3822 // threads in the teams that had this root as ancestor have stopped trying to
3824 if (__kmp_tasking_mode != tskm_immediate_exec) {
3825 __kmp_wait_to_unref_task_teams();
3829 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3831 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3833 (LPVOID) & (root->r.r_uber_thread->th),
3834 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3835 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3836 #endif /* KMP_OS_WINDOWS */
3839 ompt_data_t *task_data;
3840 ompt_data_t *parallel_data;
3841 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3842 if (ompt_enabled.ompt_callback_implicit_task) {
3843 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3844 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3846 if (ompt_enabled.ompt_callback_thread_end) {
3847 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3848 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3853 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
// Drop this root's reference on its contention group node.
3854 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3855 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3857 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3858 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3860 // need to free contention group structure
3861 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3862 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3863 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3864 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3865 root->r.r_uber_thread->th.th_cg_roots = NULL;
3867 __kmp_reap_thread(root->r.r_uber_thread, 1);
3869 // We cannot put root thread to __kmp_thread_pool, so we have to reap it
3870 // instead.
3871 root->r.r_uber_thread = NULL;
3872 /* mark root as no longer in use */
3873 root->r.r_begin = FALSE;
// Unregister the calling root thread: under the forkjoin lock, wait for any
// outstanding proxy tasks, reset the root (freeing its teams and uber
// thread), and clear the thread's gtid from TLS. No-op if the runtime has
// already shut down.
3878 void __kmp_unregister_root_current_thread(int gtid) {
3879 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3880 /* this lock should be ok, since unregister_root_current_thread is never
3881 called during an abort, only during a normal close. furthermore, if you
3882 have the forkjoin lock, you should never try to get the initz lock */
3883 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3884 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3885 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3888 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3891 kmp_root_t *root = __kmp_root[gtid];
3893 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3894 KMP_ASSERT(KMP_UBER_GTID(gtid));
3895 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3896 KMP_ASSERT(root->r.r_active == FALSE);
3900 kmp_info_t *thread = __kmp_threads[gtid];
3901 kmp_team_t *team = thread->th.th_team;
3902 kmp_task_team_t *task_team = thread->th.th_task_team;
3904 // we need to wait for the proxy tasks before finishing the thread
3905 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3907 // the runtime is shutting down so we won't report any events
3908 thread->th.ompt_thread_info.state = ompt_state_undefined;
3910 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3913 __kmp_reset_root(gtid, root);
3915 /* free up this thread slot */
3916 __kmp_gtid_set_specific(KMP_GTID_DNE);
3917 #ifdef KMP_TDATA_GTID
3918 __kmp_gtid = KMP_GTID_DNE;
3923 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3925 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3929 /* __kmp_forkjoin_lock must be already held
3930 Unregisters a root thread that is not the current thread. Returns the number
3931 of __kmp_threads entries freed as a result. */
// Used by dead-root reclamation; delegates the actual teardown to
// __kmp_reset_root after sanity-checking the root's state.
3932 static int __kmp_unregister_root_other_thread(int gtid) {
3933 kmp_root_t *root = __kmp_root[gtid];
3936 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3937 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3938 KMP_ASSERT(KMP_UBER_GTID(gtid));
3939 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3940 KMP_ASSERT(root->r.r_active == FALSE);
3942 r = __kmp_reset_root(gtid, root);
3944 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
// Debug helper: print the calling thread's identity (gtid/tid) together with
// its team, serial team, current task, and the parent of its implicit task.
// No side effects beyond the printout.
3950 void __kmp_task_info() {
3952 kmp_int32 gtid = __kmp_entry_gtid();
3953 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3954 kmp_info_t *this_thr = __kmp_threads[gtid];
// 'steam' = the thread's serial team, distinct from its current 'team'.
3955 kmp_team_t *steam = this_thr->th.th_serial_team;
3956 kmp_team_t *team = this_thr->th.th_team;
3959 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
3961 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
3962 team->t.t_implicit_task_taskdata[tid].td_parent);
3966 /* TODO optimize with one big memclr, take out what isn't needed, split
3967 responsibility to workers as much as possible, and delay initialization of
3968 features as much as possible */
// Wire a (new or pooled) thread into 'team' at position 'tid'/'gtid': cache
// the team layout in the thread, set up its implicit task, contention-group
// root, dynamic-dispatch buffers, and task-state memo stack. Called from the
// thread-allocation path, which holds the forkjoin lock.
3969 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
3970 int tid, int gtid) {
3971 /* this_thr->th.th_info.ds.ds_gtid is setup in
3972 kmp_allocate_thread/create_worker.
3973 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3974 kmp_info_t *master = team->t.t_threads[0];
3975 KMP_DEBUG_ASSERT(this_thr != NULL);
3976 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
3977 KMP_DEBUG_ASSERT(team);
3978 KMP_DEBUG_ASSERT(team->t.t_threads);
3979 KMP_DEBUG_ASSERT(team->t.t_dispatch);
3980 KMP_DEBUG_ASSERT(master);
3981 KMP_DEBUG_ASSERT(master->th.th_root);
3985 TCW_SYNC_PTR(this_thr->th.th_team, team);
3987 this_thr->th.th_info.ds.ds_tid = tid;
3988 this_thr->th.th_set_nproc = 0;
3989 if (__kmp_tasking_mode != tskm_immediate_exec)
3990 // When tasking is possible, threads are not safe to reap until they are
3991 // done tasking; this will be set when tasking code is exited in wait
3992 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3993 else // no tasking --> always safe to reap
3994 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
3995 this_thr->th.th_set_proc_bind = proc_bind_default;
3996 #if KMP_AFFINITY_SUPPORTED
3997 this_thr->th.th_new_place = this_thr->th.th_current_place;
3999 this_thr->th.th_root = master->th.th_root;
4001 /* setup the thread's cache of the team structure */
4002 this_thr->th.th_team_nproc = team->t.t_nproc;
4003 this_thr->th.th_team_master = master;
4004 this_thr->th.th_team_serialized = team->t.t_serialized;
4005 TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4007 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4009 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4010 tid, gtid, this_thr, this_thr->th.th_current_task));
4012 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4015 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4016 tid, gtid, this_thr, this_thr->th.th_current_task));
4017 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4018 // __kmp_initialize_team()?
4020 /* TODO no worksharing in speculative threads */
4021 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4023 this_thr->th.th_local.this_construct = 0;
// Lazily allocate the per-thread private-common table (threadprivate data).
4025 if (!this_thr->th.th_pri_common) {
4026 this_thr->th.th_pri_common =
4027 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4028 if (__kmp_storage_map) {
4029 __kmp_print_storage_map_gtid(
4030 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4031 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4033 this_thr->th.th_pri_head = NULL;
// Contention-group bookkeeping: attach this worker to the master's CG root,
// releasing its previous CG node if it was the last member.
4036 if (this_thr != master && // Master's CG root is initialized elsewhere
4037 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4038 // Make new thread's CG root same as master's
4039 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4040 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4042 // worker changes CG, need to check if old CG should be freed
4043 int i = tmp->cg_nthreads--;
4044 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4045 " on node %p of thread %p to %d\n",
4046 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4048 __kmp_free(tmp); // last thread left CG --> free it
4051 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4052 // Increment new thread's CG root's counter to add the new thread
4053 this_thr->th.th_cg_roots->cg_nthreads++;
4054 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4055 " node %p of thread %p to %d\n",
4056 this_thr, this_thr->th.th_cg_roots,
4057 this_thr->th.th_cg_roots->cg_root,
4058 this_thr->th.th_cg_roots->cg_nthreads));
// The thread-limit ICV follows the (possibly new) contention group.
4059 this_thr->th.th_current_task->td_icvs.thread_limit =
4060 this_thr->th.th_cg_roots->cg_thread_limit;
4063 /* Initialize dynamic dispatch */
4065 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4066 // Use team max_nproc since this will never change for the team.
// NOTE(review): the declaration of 'disp_size' is on a line elided from this
// excerpt; a serialized team (max_nproc == 1) uses a single buffer, otherwise
// __kmp_dispatch_num_buffers of them.
4068 sizeof(dispatch_private_info_t) *
4069 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4070 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4071 team->t.t_max_nproc));
4072 KMP_ASSERT(dispatch);
4073 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4074 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4076 dispatch->th_disp_index = 0;
4077 dispatch->th_doacross_buf_idx = 0;
// Lazily allocate and zero the loop-dispatch buffers.
4078 if (!dispatch->th_disp_buffer) {
4079 dispatch->th_disp_buffer =
4080 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4082 if (__kmp_storage_map) {
4083 __kmp_print_storage_map_gtid(
4084 gtid, &dispatch->th_disp_buffer[0],
4085 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4087 : __kmp_dispatch_num_buffers],
4088 disp_size, "th_%d.th_dispatch.th_disp_buffer "
4089 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4090 gtid, team->t.t_id, gtid);
4093 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4096 dispatch->th_dispatch_pr_current = 0;
4097 dispatch->th_dispatch_sh_current = 0;
4099 dispatch->th_deo_fcn = 0; /* ORDERED */
4100 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4103 this_thr->th.th_next_pool = NULL;
// Lazily allocate the task-state memo stack (initial depth 4), used to
// save/restore th_task_state across nested regions; zero-initialize it.
4105 if (!this_thr->th.th_task_state_memo_stack) {
4107 this_thr->th.th_task_state_memo_stack =
4108 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4109 this_thr->th.th_task_state_top = 0;
4110 this_thr->th.th_task_state_stack_sz = 4;
4111 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4112 ++i) // zero init the stack
4113 this_thr->th.th_task_state_memo_stack[i] = 0;
4116 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4117 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4122 /* allocate a new thread for the requesting team. this is only called from
4123 within a forkjoin critical section. we will first try to get an available
4124 thread from the thread pool. if none is available, we will fork a new one
4125 assuming we are able to create a new one. this should be assured, as the
4126 caller should check on this first. */
4127 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4129 kmp_team_t *serial_team;
4130 kmp_info_t *new_thr;
4133 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4134 KMP_DEBUG_ASSERT(root && team);
4135 #if !KMP_NESTED_HOT_TEAMS
4136 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
// --- Fast path: reuse a thread parked in the global thread pool. ---
4140 /* first, try to get one from the thread pool */
4141 if (__kmp_thread_pool) {
4142 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4143 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
// Keep the pool's insertion hint valid after unlinking the head.
4144 if (new_thr == __kmp_thread_pool_insert_pt) {
4145 __kmp_thread_pool_insert_pt = NULL;
4147 TCW_4(new_thr->th.th_in_pool, FALSE);
4148 __kmp_suspend_initialize_thread(new_thr);
// Under the suspend mutex, drop the thread from the pool's active count.
4149 __kmp_lock_suspend_mx(new_thr);
4150 if (new_thr->th.th_active_in_pool == TRUE) {
4151 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4152 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4153 new_thr->th.th_active_in_pool = FALSE;
4155 __kmp_unlock_suspend_mx(new_thr);
4157 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4158 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4159 KMP_ASSERT(!new_thr->th.th_team);
4160 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4162 /* setup the thread structure */
4163 __kmp_initialize_info(new_thr, team, new_tid,
4164 new_thr->th.th_info.ds.ds_gtid);
4165 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4167 TCW_4(__kmp_nth, __kmp_nth + 1);
// Reset task-state bookkeeping for the reused thread.
4169 new_thr->th.th_task_state = 0;
4170 new_thr->th.th_task_state_top = 0;
4171 new_thr->th.th_task_state_stack_sz = 4;
4173 #ifdef KMP_ADJUST_BLOCKTIME
4174 /* Adjust blocktime back to zero if necessary */
4175 /* Middle initialization might not have occurred yet */
4176 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4177 if (__kmp_nth > __kmp_avail_proc) {
4178 __kmp_zero_bt = TRUE;
4181 #endif /* KMP_ADJUST_BLOCKTIME */
4184 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4185 // KMP_BARRIER_PARENT_FLAG.
4187 kmp_balign_t *balign = new_thr->th.th_bar;
4188 for (b = 0; b < bs_last_barrier; ++b)
4189 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4192 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4193 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
// --- Slow path: pool is empty, create a brand-new worker thread. ---
4199 /* no, well fork a new one */
4200 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4201 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4204 // If this is the first worker thread the RTL is creating, then also
4205 // launch the monitor thread. We try to do this as early as possible.
// Double-checked under the monitor bootstrap lock so only one thread
// creates the monitor.
4206 if (!TCR_4(__kmp_init_monitor)) {
4207 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4208 if (!TCR_4(__kmp_init_monitor)) {
4209 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4210 TCW_4(__kmp_init_monitor, 1);
4211 __kmp_create_monitor(&__kmp_monitor);
4212 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4214 // AC: wait until monitor has started. This is a fix for CQ232808.
4215 // The reason is that if the library is loaded/unloaded in a loop with
4216 // small (parallel) work in between, then there is high probability that
4217 // monitor thread started after the library shutdown. At shutdown it is
4218 // too late to cope with the problem, because when the master is in
4219 // DllMain (process detach) the monitor has no chances to start (it is
4220 // blocked), and master has no means to inform the monitor that the
4221 // library has gone, because all the memory which the monitor can access
4222 // is going to be released/reset.
4223 while (TCR_4(__kmp_init_monitor) < 2) {
4226 KF_TRACE(10, ("after monitor thread has started\n"));
4229 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
// Find the first free gtid slot (slot 0 is the initial root).
4234 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4235 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4238 /* allocate space for it. */
4239 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4241 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4243 if (__kmp_storage_map) {
4244 __kmp_print_thread_storage_map(new_thr, new_gtid);
4247 // add the reserve serialized team, initialized from the team's master thread
4249 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4250 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4251 new_thr->th.th_serial_team = serial_team =
4252 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4254 ompt_data_none, // root parallel id
4256 proc_bind_default, &r_icvs,
4257 0 USE_NESTED_HOT_ARG(NULL));
4259 KMP_ASSERT(serial_team);
4260 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4261 // execution (it is unused for now).
4262 serial_team->t.t_threads[0] = new_thr;
4264 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4267 /* setup the thread structures */
4268 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4271 __kmp_initialize_fast_memory(new_thr);
4272 #endif /* USE_FAST_MEMORY */
4275 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4276 __kmp_initialize_bget(new_thr);
4279 __kmp_init_random(new_thr); // Initialize random number generator
4281 /* Initialize these only once when thread is grabbed for a team allocation */
4283 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4284 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
// Reset all per-barrier-kind state for the fresh thread.
4287 kmp_balign_t *balign = new_thr->th.th_bar;
4288 for (b = 0; b < bs_last_barrier; ++b) {
4289 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4290 balign[b].bb.team = NULL;
4291 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4292 balign[b].bb.use_oncore_barrier = 0;
4295 new_thr->th.th_spin_here = FALSE;
4296 new_thr->th.th_next_waiting = 0;
4298 new_thr->th.th_blocking = false;
4301 #if KMP_AFFINITY_SUPPORTED
4302 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4303 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4304 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4305 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4307 new_thr->th.th_def_allocator = __kmp_def_allocator;
4308 new_thr->th.th_prev_level = 0;
4309 new_thr->th.th_prev_num_threads = 1;
4311 TCW_4(new_thr->th.th_in_pool, FALSE);
4312 new_thr->th.th_active_in_pool = FALSE;
4313 TCW_4(new_thr->th.th_active, TRUE);
4315 /* adjust the global counters */
4319 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4320 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4321 if (__kmp_adjust_gtid_mode) {
4322 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4323 if (TCR_4(__kmp_gtid_mode) != 2) {
4324 TCW_4(__kmp_gtid_mode, 2);
4327 if (TCR_4(__kmp_gtid_mode) != 1) {
4328 TCW_4(__kmp_gtid_mode, 1);
4333 #ifdef KMP_ADJUST_BLOCKTIME
4334 /* Adjust blocktime back to zero if necessary */
4335 /* Middle initialization might not have occurred yet */
4336 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4337 if (__kmp_nth > __kmp_avail_proc) {
4338 __kmp_zero_bt = TRUE;
4341 #endif /* KMP_ADJUST_BLOCKTIME */
4343 /* actually fork it and create the new worker thread */
4345 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4346 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4348 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4350 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4356 /* Reinitialize team for reuse.
4357 The hot team code calls this case at every fork barrier, so EPCC barrier
4358 test are extremely sensitive to changes in it, esp. writes to the team
4359 struct, which cause a cache invalidation in all threads.
4360 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4361 static void __kmp_reinitialize_team(kmp_team_t *team,
4362 kmp_internal_control_t *new_icvs,
4364 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4365 team->t.t_threads[0], team));
4366 KMP_DEBUG_ASSERT(team && new_icvs);
4367 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
// KMP_CHECK_UPDATE writes only when the value actually changes, avoiding
// needless cache-line invalidation on the shared team struct (see the EPCC
// note in the header comment).
4368 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4370 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4371 // Copy ICVs to the master thread's implicit taskdata
4372 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4373 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4375 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4376 team->t.t_threads[0], team));
4379 /* Initialize the team data structure.
4380 This assumes the t_threads and t_max_nproc are already set.
4381 Also, we don't touch the arguments */
4382 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4383 kmp_internal_control_t *new_icvs,
4385 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4388 KMP_DEBUG_ASSERT(team);
4389 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4390 KMP_DEBUG_ASSERT(team->t.t_threads);
4393 team->t.t_master_tid = 0; /* not needed */
4394 /* team->t.t_master_bar; not needed */
// A one-thread team is serialized by definition.
4395 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4396 team->t.t_nproc = new_nproc;
4398 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4399 team->t.t_next_pool = NULL;
4400 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4403 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4404 team->t.t_invoke = NULL; /* not needed */
4406 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
// Adopt the run-time schedule from the incoming ICVs.
4407 team->t.t_sched.sched = new_icvs->sched.sched;
4409 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4410 team->t.t_fp_control_saved = FALSE; /* not needed */
4411 team->t.t_x87_fpu_control_word = 0; /* not needed */
4412 team->t.t_mxcsr = 0; /* not needed */
4413 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4415 team->t.t_construct = 0;
4417 team->t.t_ordered.dt.t_value = 0;
4418 team->t.t_master_active = FALSE;
4421 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4424 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4427 team->t.t_control_stack_top = NULL;
// Finish by propagating ICVs/location into the team and its master's
// implicit task.
4429 __kmp_reinitialize_team(team, new_icvs, loc);
4432 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4435 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4436 /* Sets full mask for thread and returns old mask, no changes to structures. */
4438 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
// No-op unless affinity is actually usable on this system.
4439 if (KMP_AFFINITY_CAPABLE()) {
// Optionally capture the caller's current affinity (into old_mask) so it
// can be restored later; a failed query is fatal.
4441 if (old_mask != NULL) {
4442 status = __kmp_get_system_affinity(old_mask, TRUE);
4445 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
// Temporarily bind the calling thread to the full machine mask.
4449 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4454 #if KMP_AFFINITY_SUPPORTED
4456 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4457 // It calculates the worker + master thread's partition based upon the parent
4458 // thread's partition, and binds each worker to a thread in their partition.
4459 // The master thread's partition should already include its current binding.
4460 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4461 // Copy the master thread's place partition to the team struct
4462 kmp_info_t *master_th = team->t.t_threads[0];
4463 KMP_DEBUG_ASSERT(master_th != NULL);
4464 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4465 int first_place = master_th->th.th_first_place;
4466 int last_place = master_th->th.th_last_place;
4467 int masters_place = master_th->th.th_current_place;
4468 team->t.t_first_place = first_place;
4469 team->t.t_last_place = last_place;
4471 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4472 "bound to place %d partition = [%d,%d]\n",
4473 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4474 team->t.t_id, masters_place, first_place, last_place));
4476 switch (proc_bind) {
4478 case proc_bind_default:
4479 // serial teams might have the proc_bind policy set to proc_bind_default. It
4480 // doesn't matter, as we don't rebind master thread for any proc_bind policy
4481 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
// proc_bind_master: every worker inherits the master's partition and is
// bound to the master's own place.
4484 case proc_bind_master: {
4486 int n_th = team->t.t_nproc;
4487 for (f = 1; f < n_th; f++) {
4488 kmp_info_t *th = team->t.t_threads[f];
4489 KMP_DEBUG_ASSERT(th != NULL);
4490 th->th.th_first_place = first_place;
4491 th->th.th_last_place = last_place;
4492 th->th.th_new_place = masters_place;
4493 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4494 team->t.t_display_affinity != 1) {
4495 team->t.t_display_affinity = 1;
4498 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4499 "partition = [%d,%d]\n",
4500 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4501 f, masters_place, first_place, last_place));
// proc_bind_close: pack workers onto consecutive places starting from the
// master's place, wrapping within [first_place, last_place].
4505 case proc_bind_close: {
4507 int n_th = team->t.t_nproc;
// The partition may wrap around the end of the place list, hence the two
// ways of counting n_places.
4509 if (first_place <= last_place) {
4510 n_places = last_place - first_place + 1;
4512 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
// Enough places for one thread each: advance one place per worker.
4514 if (n_th <= n_places) {
4515 int place = masters_place;
4516 for (f = 1; f < n_th; f++) {
4517 kmp_info_t *th = team->t.t_threads[f];
4518 KMP_DEBUG_ASSERT(th != NULL);
4520 if (place == last_place) {
4521 place = first_place;
4522 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4527 th->th.th_first_place = first_place;
4528 th->th.th_last_place = last_place;
4529 th->th.th_new_place = place;
4530 if (__kmp_display_affinity && place != th->th.th_current_place &&
4531 team->t.t_display_affinity != 1) {
4532 team->t.t_display_affinity = 1;
4535 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4536 "partition = [%d,%d]\n",
4537 __kmp_gtid_from_thread(team->t.t_threads[f]),
4538 team->t.t_id, f, place, first_place, last_place));
// More threads than places: S threads per place plus 'rem' extras, one
// extra every 'gap' places.
4541 int S, rem, gap, s_count;
4542 S = n_th / n_places;
4544 rem = n_th - (S * n_places);
4545 gap = rem > 0 ? n_places / rem : n_places;
4546 int place = masters_place;
4548 for (f = 0; f < n_th; f++) {
4549 kmp_info_t *th = team->t.t_threads[f];
4550 KMP_DEBUG_ASSERT(th != NULL);
4552 th->th.th_first_place = first_place;
4553 th->th.th_last_place = last_place;
4554 th->th.th_new_place = place;
4555 if (__kmp_display_affinity && place != th->th.th_current_place &&
4556 team->t.t_display_affinity != 1) {
4557 team->t.t_display_affinity = 1;
4561 if ((s_count == S) && rem && (gap_ct == gap)) {
4562 // do nothing, add an extra thread to place on next iteration
4563 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4564 // we added an extra thread to this place; move to next place
4565 if (place == last_place) {
4566 place = first_place;
4567 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4575 } else if (s_count == S) { // place full; don't add extra
4576 if (place == last_place) {
4577 place = first_place;
4578 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4588 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4589 "partition = [%d,%d]\n",
4590 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4591 th->th.th_new_place, first_place, last_place));
// The walk must end back at the master's place.
4593 KMP_DEBUG_ASSERT(place == masters_place);
// proc_bind_spread: give each worker its own sub-partition of the master's
// partition, spreading threads as far apart as possible.
4597 case proc_bind_spread: {
4599 int n_th = team->t.t_nproc;
// The partition may wrap around the end of the place list.
4602 if (first_place <= last_place) {
4603 n_places = last_place - first_place + 1;
4605 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4607 if (n_th <= n_places) {
// Sub-case: master's partition is a strict subset of all places — divide
// it into contiguous chunks of ~S places per thread.
4610 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4611 int S = n_places / n_th;
4612 int s_count, rem, gap, gap_ct;
4614 place = masters_place;
4615 rem = n_places - n_th * S;
4616 gap = rem ? n_th / rem : 1;
4619 if (update_master_only == 1)
4621 for (f = 0; f < thidx; f++) {
4622 kmp_info_t *th = team->t.t_threads[f];
4623 KMP_DEBUG_ASSERT(th != NULL);
4625 th->th.th_first_place = place;
4626 th->th.th_new_place = place;
4627 if (__kmp_display_affinity && place != th->th.th_current_place &&
4628 team->t.t_display_affinity != 1) {
4629 team->t.t_display_affinity = 1;
// Walk S places forward to find the end of this thread's chunk.
4632 while (s_count < S) {
4633 if (place == last_place) {
4634 place = first_place;
4635 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
// Distribute the 'rem' leftover places one per 'gap' threads.
4642 if (rem && (gap_ct == gap)) {
4643 if (place == last_place) {
4644 place = first_place;
4645 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4653 th->th.th_last_place = place;
4656 if (place == last_place) {
4657 place = first_place;
4658 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4665 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4666 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4667 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4668 f, th->th.th_new_place, th->th.th_first_place,
4669 th->th.th_last_place, __kmp_affinity_num_masks));
// Sub-case: master's partition covers all places — compute each thread's
// chunk boundaries arithmetically with floating-point spacing.
4672 /* Having uniform space of available computation places I can create
4673 T partitions of round(P/T) size and put threads into the first
4674 place of each partition. */
4675 double current = static_cast<double>(masters_place);
4677 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4682 if (update_master_only == 1)
4684 for (f = 0; f < thidx; f++) {
4685 first = static_cast<int>(current);
4686 last = static_cast<int>(current + spacing) - 1;
4687 KMP_DEBUG_ASSERT(last >= first);
// Clamp/adjust chunks that run past the end of the place list; the
// asserts pin down which loop iteration can reach each branch.
4688 if (first >= n_places) {
4689 if (masters_place) {
4692 if (first == (masters_place + 1)) {
4693 KMP_DEBUG_ASSERT(f == n_th);
4696 if (last == masters_place) {
4697 KMP_DEBUG_ASSERT(f == (n_th - 1));
4701 KMP_DEBUG_ASSERT(f == n_th);
4706 if (last >= n_places) {
4707 last = (n_places - 1);
4712 KMP_DEBUG_ASSERT(0 <= first);
4713 KMP_DEBUG_ASSERT(n_places > first);
4714 KMP_DEBUG_ASSERT(0 <= last);
4715 KMP_DEBUG_ASSERT(n_places > last);
4716 KMP_DEBUG_ASSERT(last_place >= first_place);
4717 th = team->t.t_threads[f];
4718 KMP_DEBUG_ASSERT(th);
4719 th->th.th_first_place = first;
4720 th->th.th_new_place = place;
4721 th->th.th_last_place = last;
4722 if (__kmp_display_affinity && place != th->th.th_current_place &&
4723 team->t.t_display_affinity != 1) {
4724 team->t.t_display_affinity = 1;
4727 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4728 "partition = [%d,%d], spacing = %.4f\n",
4729 __kmp_gtid_from_thread(team->t.t_threads[f]),
4730 team->t.t_id, f, th->th.th_new_place,
4731 th->th.th_first_place, th->th.th_last_place, spacing));
4735 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
// More threads than places: single-place partitions, S threads per place
// plus 'rem' extras, one extra every 'gap' places.
4737 int S, rem, gap, s_count;
4738 S = n_th / n_places;
4740 rem = n_th - (S * n_places);
4741 gap = rem > 0 ? n_places / rem : n_places;
4742 int place = masters_place;
4745 if (update_master_only == 1)
4747 for (f = 0; f < thidx; f++) {
4748 kmp_info_t *th = team->t.t_threads[f];
4749 KMP_DEBUG_ASSERT(th != NULL);
4751 th->th.th_first_place = place;
4752 th->th.th_last_place = place;
4753 th->th.th_new_place = place;
4754 if (__kmp_display_affinity && place != th->th.th_current_place &&
4755 team->t.t_display_affinity != 1) {
4756 team->t.t_display_affinity = 1;
4760 if ((s_count == S) && rem && (gap_ct == gap)) {
4761 // do nothing, add an extra thread to place on next iteration
4762 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4763 // we added an extra thread to this place; move on to next place
4764 if (place == last_place) {
4765 place = first_place;
4766 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4774 } else if (s_count == S) { // place is full; don't add extra thread
4775 if (place == last_place) {
4776 place = first_place;
4777 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4786 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4787 "partition = [%d,%d]\n",
4788 __kmp_gtid_from_thread(team->t.t_threads[f]),
4789 team->t.t_id, f, th->th.th_new_place,
4790 th->th.th_first_place, th->th.th_last_place));
4792 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4800 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4803 #endif // KMP_AFFINITY_SUPPORTED
4805 /* allocate a new team data structure to use. take one off of the free pool if
4808 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4810 ompt_data_t ompt_parallel_data,
4812 kmp_proc_bind_t new_proc_bind,
4813 kmp_internal_control_t *new_icvs,
4814 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4815 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4818 int use_hot_team = !root->r.r_active;
4821 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4822 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4823 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4826 #if KMP_NESTED_HOT_TEAMS
4827 kmp_hot_team_ptr_t *hot_teams;
4829 team = master->th.th_team;
4830 level = team->t.t_active_level;
4831 if (master->th.th_teams_microtask) { // in teams construct?
4832 if (master->th.th_teams_size.nteams > 1 &&
4835 (microtask_t)__kmp_teams_master || // inner fork of the teams
4836 master->th.th_teams_level <
4837 team->t.t_level)) { // or nested parallel inside the teams
4838 ++level; // not increment if #teams==1, or for outer fork of the teams;
4839 // increment otherwise
4842 hot_teams = master->th.th_hot_teams;
4843 if (level < __kmp_hot_teams_max_level && hot_teams &&
4845 .hot_team) { // hot team has already been allocated for given level
4852 // Optimization to use a "hot" team
4853 if (use_hot_team && new_nproc > 1) {
4854 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4855 #if KMP_NESTED_HOT_TEAMS
4856 team = hot_teams[level].hot_team;
4858 team = root->r.r_hot_team;
4861 if (__kmp_tasking_mode != tskm_immediate_exec) {
4862 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4863 "task_team[1] = %p before reinit\n",
4864 team->t.t_task_team[0], team->t.t_task_team[1]));
4868 // Has the number of threads changed?
4869 /* Let's assume the most common case is that the number of threads is
4870 unchanged, and put that case first. */
4871 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4872 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4873 // This case can mean that omp_set_num_threads() was called and the hot
4874 // team size was already reduced, so we check the special flag
4875 if (team->t.t_size_changed == -1) {
4876 team->t.t_size_changed = 1;
4878 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4881 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4882 kmp_r_sched_t new_sched = new_icvs->sched;
4883 // set master's schedule as new run-time schedule
4884 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4886 __kmp_reinitialize_team(team, new_icvs,
4887 root->r.r_uber_thread->th.th_ident);
4889 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4890 team->t.t_threads[0], team));
4891 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4893 #if KMP_AFFINITY_SUPPORTED
4894 if ((team->t.t_size_changed == 0) &&
4895 (team->t.t_proc_bind == new_proc_bind)) {
4896 if (new_proc_bind == proc_bind_spread) {
4897 __kmp_partition_places(
4898 team, 1); // add flag to update only master for spread
4900 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4901 "proc_bind = %d, partition = [%d,%d]\n",
4902 team->t.t_id, new_proc_bind, team->t.t_first_place,
4903 team->t.t_last_place));
4905 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4906 __kmp_partition_places(team);
4909 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4910 #endif /* KMP_AFFINITY_SUPPORTED */
4911 } else if (team->t.t_nproc > new_nproc) {
4913 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4916 team->t.t_size_changed = 1;
4917 #if KMP_NESTED_HOT_TEAMS
4918 if (__kmp_hot_teams_mode == 0) {
4919 // AC: saved number of threads should correspond to team's value in this
4920 // mode, can be bigger in mode 1, when hot team has threads in reserve
4921 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4922 hot_teams[level].hot_team_nth = new_nproc;
4923 #endif // KMP_NESTED_HOT_TEAMS
4924 /* release the extra threads we don't need any more */
4925 for (f = new_nproc; f < team->t.t_nproc; f++) {
4926 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4927 if (__kmp_tasking_mode != tskm_immediate_exec) {
4928 // When decreasing team size, threads no longer in the team should
4930 team->t.t_threads[f]->th.th_task_team = NULL;
4932 __kmp_free_thread(team->t.t_threads[f]);
4933 team->t.t_threads[f] = NULL;
4935 #if KMP_NESTED_HOT_TEAMS
4936 } // (__kmp_hot_teams_mode == 0)
4938 // When keeping extra threads in team, switch threads to wait on own
4940 for (f = new_nproc; f < team->t.t_nproc; ++f) {
4941 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4942 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4943 for (int b = 0; b < bs_last_barrier; ++b) {
4944 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4945 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4947 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4951 #endif // KMP_NESTED_HOT_TEAMS
4952 team->t.t_nproc = new_nproc;
4953 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4954 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4955 __kmp_reinitialize_team(team, new_icvs,
4956 root->r.r_uber_thread->th.th_ident);
4958 // Update remaining threads
4959 for (f = 0; f < new_nproc; ++f) {
4960 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4963 // restore the current task state of the master thread: should be the
4965 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
4966 team->t.t_threads[0], team));
4968 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4971 for (f = 0; f < team->t.t_nproc; f++) {
4972 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
4973 team->t.t_threads[f]->th.th_team_nproc ==
4978 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4979 #if KMP_AFFINITY_SUPPORTED
4980 __kmp_partition_places(team);
4982 } else { // team->t.t_nproc < new_nproc
4983 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4984 kmp_affin_mask_t *old_mask;
4985 if (KMP_AFFINITY_CAPABLE()) {
4986 KMP_CPU_ALLOC(old_mask);
4991 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
4994 team->t.t_size_changed = 1;
4996 #if KMP_NESTED_HOT_TEAMS
4997 int avail_threads = hot_teams[level].hot_team_nth;
4998 if (new_nproc < avail_threads)
4999 avail_threads = new_nproc;
5000 kmp_info_t **other_threads = team->t.t_threads;
5001 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5002 // Adjust barrier data of reserved threads (if any) of the team
5003 // Other data will be set in __kmp_initialize_info() below.
5005 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5006 for (b = 0; b < bs_last_barrier; ++b) {
5007 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5008 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5010 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5014 if (hot_teams[level].hot_team_nth >= new_nproc) {
5015 // we have all needed threads in reserve, no need to allocate any
5016 // this only possible in mode 1, cannot have reserved threads in mode 0
5017 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5018 team->t.t_nproc = new_nproc; // just get reserved threads involved
5020 // we may have some threads in reserve, but not enough
5023 .hot_team_nth; // get reserved threads involved if any
5024 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5025 #endif // KMP_NESTED_HOT_TEAMS
5026 if (team->t.t_max_nproc < new_nproc) {
5027 /* reallocate larger arrays */
5028 __kmp_reallocate_team_arrays(team, new_nproc);
5029 __kmp_reinitialize_team(team, new_icvs, NULL);
5032 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5033 /* Temporarily set full mask for master thread before creation of
5034 workers. The reason is that workers inherit the affinity from master,
5035 so if a lot of workers are created on the single core quickly, they
5036 don't get a chance to set their own affinity for a long time. */
5037 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5040 /* allocate new threads for the hot team */
5041 for (f = team->t.t_nproc; f < new_nproc; f++) {
5042 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5043 KMP_DEBUG_ASSERT(new_worker);
5044 team->t.t_threads[f] = new_worker;
5047 ("__kmp_allocate_team: team %d init T#%d arrived: "
5048 "join=%llu, plain=%llu\n",
5049 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5050 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5051 team->t.t_bar[bs_plain_barrier].b_arrived));
5053 { // Initialize barrier data for new threads.
5055 kmp_balign_t *balign = new_worker->th.th_bar;
5056 for (b = 0; b < bs_last_barrier; ++b) {
5057 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5058 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5059 KMP_BARRIER_PARENT_FLAG);
5061 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5067 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5068 if (KMP_AFFINITY_CAPABLE()) {
5069 /* Restore initial master thread's affinity mask */
5070 __kmp_set_system_affinity(old_mask, TRUE);
5071 KMP_CPU_FREE(old_mask);
5074 #if KMP_NESTED_HOT_TEAMS
5075 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5076 #endif // KMP_NESTED_HOT_TEAMS
5077 /* make sure everyone is synchronized */
5078 int old_nproc = team->t.t_nproc; // save old value and use to update only
5079 // new threads below
5080 __kmp_initialize_team(team, new_nproc, new_icvs,
5081 root->r.r_uber_thread->th.th_ident);
5083 /* reinitialize the threads */
5084 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5085 for (f = 0; f < team->t.t_nproc; ++f)
5086 __kmp_initialize_info(team->t.t_threads[f], team, f,
5087 __kmp_gtid_from_tid(f, team));
5089 if (level) { // set th_task_state for new threads in nested hot team
5090 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5091 // only need to set the th_task_state for the new threads. th_task_state
5092 // for master thread will not be accurate until after this in
5093 // __kmp_fork_call(), so we look to the master's memo_stack to get the
5095 for (f = old_nproc; f < team->t.t_nproc; ++f)
5096 team->t.t_threads[f]->th.th_task_state =
5097 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5098 } else { // set th_task_state for new threads in non-nested hot team
5100 team->t.t_threads[0]->th.th_task_state; // copy master's state
5101 for (f = old_nproc; f < team->t.t_nproc; ++f)
5102 team->t.t_threads[f]->th.th_task_state = old_state;
5106 for (f = 0; f < team->t.t_nproc; ++f) {
5107 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5108 team->t.t_threads[f]->th.th_team_nproc ==
5113 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5114 #if KMP_AFFINITY_SUPPORTED
5115 __kmp_partition_places(team);
5117 } // Check changes in number of threads
5119 kmp_info_t *master = team->t.t_threads[0];
5120 if (master->th.th_teams_microtask) {
5121 for (f = 1; f < new_nproc; ++f) {
5122 // propagate teams construct specific info to workers
5123 kmp_info_t *thr = team->t.t_threads[f];
5124 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5125 thr->th.th_teams_level = master->th.th_teams_level;
5126 thr->th.th_teams_size = master->th.th_teams_size;
5129 #if KMP_NESTED_HOT_TEAMS
5131 // Sync barrier state for nested hot teams, not needed for outermost hot
5133 for (f = 1; f < new_nproc; ++f) {
5134 kmp_info_t *thr = team->t.t_threads[f];
5136 kmp_balign_t *balign = thr->th.th_bar;
5137 for (b = 0; b < bs_last_barrier; ++b) {
5138 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5139 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5141 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5146 #endif // KMP_NESTED_HOT_TEAMS
5148 /* reallocate space for arguments if necessary */
5149 __kmp_alloc_argv_entries(argc, team, TRUE);
5150 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5151 // The hot team re-uses the previous task team,
5152 // if untouched during the previous release->gather phase.
5154 KF_TRACE(10, (" hot_team = %p\n", team));
5157 if (__kmp_tasking_mode != tskm_immediate_exec) {
5158 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5159 "task_team[1] = %p after reinit\n",
5160 team->t.t_task_team[0], team->t.t_task_team[1]));
5165 __ompt_team_assign_id(team, ompt_parallel_data);
5173 /* next, let's try to take one from the team pool */
5175 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5176 /* TODO: consider resizing undersized teams instead of reaping them, now
5177 that we have a resizing mechanism */
5178 if (team->t.t_max_nproc >= max_nproc) {
5179 /* take this team from the team pool */
5180 __kmp_team_pool = team->t.t_next_pool;
5182 /* setup the team for fresh use */
5183 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5185 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5186 "task_team[1] %p to NULL\n",
5187 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5188 team->t.t_task_team[0] = NULL;
5189 team->t.t_task_team[1] = NULL;
5191 /* reallocate space for arguments if necessary */
5192 __kmp_alloc_argv_entries(argc, team, TRUE);
5193 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5196 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5197 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5198 { // Initialize barrier data.
5200 for (b = 0; b < bs_last_barrier; ++b) {
5201 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5203 team->t.t_bar[b].b_master_arrived = 0;
5204 team->t.t_bar[b].b_team_arrived = 0;
5209 team->t.t_proc_bind = new_proc_bind;
5211 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5215 __ompt_team_assign_id(team, ompt_parallel_data);
5223 /* reap team if it is too small, then loop back and check the next one */
5224 // not sure if this is wise, but, will be redone during the hot-teams
5226 /* TODO: Use technique to find the right size hot-team, don't reap them */
5227 team = __kmp_reap_team(team);
5228 __kmp_team_pool = team;
5231 /* nothing available in the pool, no matter, make a new team! */
5233 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5236 team->t.t_max_nproc = max_nproc;
5237 /* NOTE well, for some reason allocating one big buffer and dividing it up
5238 seems to really hurt performance a lot on the P4, so, let's not use this */
5239 __kmp_allocate_team_arrays(team, max_nproc);
5241 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5242 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5244 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5246 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5247 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5248 // memory, no need to duplicate
5249 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5250 // memory, no need to duplicate
5252 if (__kmp_storage_map) {
5253 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5256 /* allocate space for arguments */
5257 __kmp_alloc_argv_entries(argc, team, FALSE);
5258 team->t.t_argc = argc;
5261 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5262 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5263 { // Initialize barrier data.
5265 for (b = 0; b < bs_last_barrier; ++b) {
5266 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5268 team->t.t_bar[b].b_master_arrived = 0;
5269 team->t.t_bar[b].b_team_arrived = 0;
5274 team->t.t_proc_bind = new_proc_bind;
5277 __ompt_team_assign_id(team, ompt_parallel_data);
5278 team->t.ompt_serialized_team_info = NULL;
5283 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5289 /* TODO implement hot-teams at all levels */
5290 /* TODO implement lazy thread release on demand (disband request) */
5292 /* free the team. return it to the team pool. release all the threads
5293 * associated with it */
// Release a team that has finished a parallel region.
// Hot teams (root->r.r_hot_team, and nested hot teams when enabled) are kept
// alive for reuse; non-hot teams wait for their workers to become reapable,
// detach both task teams, free the worker threads, and push the team onto
// the global __kmp_team_pool free list.
// NOTE(review): this chunk is a partial extraction -- several original lines
// (including closing braces) are missing; code below is left byte-identical.
5294 void __kmp_free_team(kmp_root_t *root,
5295 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5297 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5301 KMP_DEBUG_ASSERT(root);
5302 KMP_DEBUG_ASSERT(team);
5303 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5304 KMP_DEBUG_ASSERT(team->t.t_threads);
// A team is "hot" if it is the root's persistent hot team; hot teams are
// recycled rather than disbanded.
5306 int use_hot_team = team == root->r.r_hot_team;
5307 #if KMP_NESTED_HOT_TEAMS
5309 kmp_hot_team_ptr_t *hot_teams;
// Compute the nesting level this team occupies in the hot-teams array.
// The teams construct does not bump t_active_level for its inner teams,
// hence the manual adjustments below.
5311 level = team->t.t_active_level - 1;
5312 if (master->th.th_teams_microtask) { // in teams construct?
5313 if (master->th.th_teams_size.nteams > 1) {
5314 ++level; // level was not increased in teams construct for
5317 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5318 master->th.th_teams_level == team->t.t_level) {
5319 ++level; // level was not increased in teams construct for
5320 // team_of_workers before the parallel
5321 } // team->t.t_level will be increased inside parallel
5323 hot_teams = master->th.th_hot_teams;
5324 if (level < __kmp_hot_teams_max_level) {
5325 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5329 #endif // KMP_NESTED_HOT_TEAMS
5331 /* team is done working */
// Clearing t_pkfn signals "no microtask" to debugger support and to workers.
5332 TCW_SYNC_PTR(team->t.t_pkfn,
5333 NULL); // Important for Debugging Support Library.
5335 team->t.t_copyin_counter = 0; // init counter for possible reuse
5337 // Do not reset pointer to parent team to NULL for hot teams.
5339 /* if we are non-hot team, release our threads */
5340 if (!use_hot_team) {
5341 if (__kmp_tasking_mode != tskm_immediate_exec) {
5342 // Wait for threads to reach reapable state
5343 for (f = 1; f < team->t.t_nproc; ++f) {
5344 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5345 kmp_info_t *th = team->t.t_threads[f];
5346 volatile kmp_uint32 *state = &th->th.th_reap_state;
// Spin until the worker marks itself safe to reap; wake it if it went
// to sleep on the fork/join barrier so it can make progress.
5347 while (*state != KMP_SAFE_TO_REAP) {
5349 // On Windows a thread can be killed at any time, check this
5351 if (!__kmp_is_thread_alive(th, &ecode)) {
5352 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5356 // first check if thread is sleeping
5357 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5358 if (fl.is_sleeping())
5359 fl.resume(__kmp_gtid_from_thread(th));
5364 // Delete task teams
5366 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5367 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5368 if (task_team != NULL) {
5369 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5370 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5371 team->t.t_threads[f]->th.th_task_team = NULL;
5375 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5376 __kmp_get_gtid(), task_team, team->t.t_id));
5377 #if KMP_NESTED_HOT_TEAMS
5378 __kmp_free_task_team(master, task_team);
5380 team->t.t_task_team[tt_idx] = NULL;
5385 // Reset pointer to parent team only for non-hot teams.
5386 team->t.t_parent = NULL;
5387 team->t.t_level = 0;
5388 team->t.t_active_level = 0;
5390 /* free the worker threads */
// Index starts at 1: slot 0 is the master thread, which is not freed here.
5391 for (f = 1; f < team->t.t_nproc; ++f) {
5392 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5393 __kmp_free_thread(team->t.t_threads[f]);
5394 team->t.t_threads[f] = NULL;
5397 /* put the team back in the team pool */
5398 /* TODO limit size of team pool, call reap_team if pool too large */
5399 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5400 __kmp_team_pool = (volatile kmp_team_t *)team;
5401 } else { // Check if team was created for the masters in a teams construct
5402 // See if first worker is a CG root
5403 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5404 team->t.t_threads[1]->th.th_cg_roots);
5405 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5406 // Clean up the CG root nodes on workers so that this team can be re-used
5407 for (f = 1; f < team->t.t_nproc; ++f) {
5408 kmp_info_t *thr = team->t.t_threads[f];
5409 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5410 thr->th.th_cg_roots->cg_root == thr);
5411 // Pop current CG root off list
5412 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5413 thr->th.th_cg_roots = tmp->up;
5414 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5415 " up to node %p. cg_nthreads was %d\n",
5416 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5417 int i = tmp->cg_nthreads--;
5419 __kmp_free(tmp); // free CG if we are the last thread in it
5421 // Restore current task's thread_limit from CG root
5422 if (thr->th.th_cg_roots)
5423 thr->th.th_current_task->td_icvs.thread_limit =
5424 thr->th.th_cg_roots->cg_thread_limit;
5432 /* reap the team. destroy it, reclaim all its resources and free its memory */
// Destroys a pooled team: frees its per-team arrays and any heap-allocated
// argv buffer, then (in the full source) returns the next team in the pool
// so the caller can continue walking the free list.
// NOTE(review): the tail of this function (return statement / closing brace)
// was dropped by the extraction; body left byte-identical.
5433 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
// Capture the next-pool link before the team's memory is released.
5434 kmp_team_t *next_pool = team->t.t_next_pool;
5436 KMP_DEBUG_ASSERT(team);
5437 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5438 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5439 KMP_DEBUG_ASSERT(team->t.t_threads);
5440 KMP_DEBUG_ASSERT(team->t.t_argv);
5442 /* TODO clean the threads that are a part of this? */
5445 __kmp_free_team_arrays(team);
// t_argv may point at the small inline buffer inside the team struct;
// only heap-allocated argv storage is freed.
5446 if (team->t.t_argv != &team->t.t_inline_argv[0])
5447 __kmp_free((void *)team->t.t_argv);
5454 // Free the thread. Don't reap it, just place it on the pool of available
5457 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5458 // binding for the affinity mechanism to be useful.
5460 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5461 // However, we want to avoid a potential performance problem by always
5462 // scanning through the list to find the correct point at which to insert
5463 // the thread (potential N**2 behavior). To do this we keep track of the
5464 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5465 // With single-level parallelism, threads will always be added to the tail
5466 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5467 // parallelism, all bets are off and we may need to scan through the entire
5470 // This change also has a potentially large performance benefit, for some
5471 // applications. Previously, as threads were freed from the hot team, they
5472 // would be placed back on the free list in inverse order. If the hot team
5473 // grew back to it's original size, then the freed thread would be placed
5474 // back on the hot team in reverse order. This could cause bad cache
5475 // locality problems on programs where the size of the hot team regularly
5478 // Now, for single-level parallelism, the OMP tid is always == gtid.
// Return a thread to the global free pool (__kmp_thread_pool) without
// reaping it. The pool is kept sorted by gtid (see the block comment above
// this function); __kmp_thread_pool_insert_pt caches the last insertion
// point to avoid O(N^2) scans in the common single-level case.
// NOTE(review): partial extraction -- some original lines are missing;
// code below is left byte-identical.
5479 void __kmp_free_thread(kmp_info_t *this_th) {
5483 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5484 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5486 KMP_DEBUG_ASSERT(this_th);
5488 // When moving thread to pool, switch thread to wait on own b_go flag, and
5489 // uninitialized (NULL team).
5491 kmp_balign_t *balign = this_th->th.th_bar;
5492 for (b = 0; b < bs_last_barrier; ++b) {
5493 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5494 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5495 balign[b].bb.team = NULL;
5496 balign[b].bb.leaf_kids = 0;
5498 this_th->th.th_task_state = 0;
5499 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5501 /* put thread back on the free pool */
5502 TCW_PTR(this_th->th.th_team, NULL);
5503 TCW_PTR(this_th->th.th_root, NULL);
5504 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
// Unwind the thread's contention-group (CG) root chain, dropping this
// thread's reference at each node.
5506 while (this_th->th.th_cg_roots) {
5507 this_th->th.th_cg_roots->cg_nthreads--;
5508 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5509 " %p of thread %p to %d\n",
5510 this_th, this_th->th.th_cg_roots,
5511 this_th->th.th_cg_roots->cg_root,
5512 this_th->th.th_cg_roots->cg_nthreads));
5513 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5514 if (tmp->cg_root == this_th) { // Thread is a cg_root
5515 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5517 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5518 this_th->th.th_cg_roots = tmp->up;
5520 } else { // Worker thread
5521 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5524 this_th->th.th_cg_roots = NULL;
5529 /* If the implicit task assigned to this thread can be used by other threads
5530 * -> multiple threads can share the data and try to free the task at
5531 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5532 * with higher probability when hot team is disabled but can occurs even when
5533 * the hot team is enabled */
5534 __kmp_free_implicit_task(this_th);
5535 this_th->th.th_current_task = NULL;
5537 // If the __kmp_thread_pool_insert_pt is already past the new insert
5538 // point, then we need to re-scan the entire list.
5539 gtid = this_th->th.th_info.ds.ds_gtid;
5540 if (__kmp_thread_pool_insert_pt != NULL) {
5541 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5542 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5543 __kmp_thread_pool_insert_pt = NULL;
5547 // Scan down the list to find the place to insert the thread.
5548 // scan is the address of a link in the list, possibly the address of
5549 // __kmp_thread_pool itself.
5551 // In the absence of nested parallelism, the for loop will have 0 iterations.
5552 if (__kmp_thread_pool_insert_pt != NULL) {
5553 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5555 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5557 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5558 scan = &((*scan)->th.th_next_pool))
5561 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5563 TCW_PTR(this_th->th.th_next_pool, *scan);
5564 __kmp_thread_pool_insert_pt = *scan = this_th;
// Post-insert invariant: the list remains sorted by gtid.
5565 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5566 (this_th->th.th_info.ds.ds_gtid <
5567 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5568 TCW_4(this_th->th.th_in_pool, TRUE);
5569 __kmp_suspend_initialize_thread(this_th);
5570 __kmp_lock_suspend_mx(this_th);
// Track how many pooled threads are still actively spinning, under the
// thread's suspend mutex.
5571 if (this_th->th.th_active == TRUE) {
5572 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5573 this_th->th.th_active_in_pool = TRUE;
5577 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5580 __kmp_unlock_suspend_mx(this_th);
5582 TCW_4(__kmp_nth, __kmp_nth - 1);
5584 #ifdef KMP_ADJUST_BLOCKTIME
5585 /* Adjust blocktime back to user setting or default if necessary */
5586 /* Middle initialization might never have occurred */
5587 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5588 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5589 if (__kmp_nth <= __kmp_avail_proc) {
5590 __kmp_zero_bt = FALSE;
5593 #endif /* KMP_ADJUST_BLOCKTIME */
5598 /* ------------------------------------------------------------------------ */
// Main loop for an OpenMP worker thread: wait at the fork barrier for work,
// invoke the team's microtask when woken, join, and repeat until global
// shutdown (__kmp_global.g.g_done). Also emits OMPT thread-begin/end and
// state-transition events when OMPT is enabled.
// NOTE(review): partial extraction -- some original lines are missing;
// code below is left byte-identical.
5600 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5601 int gtid = this_thr->th.th_info.ds.ds_gtid;
5602 /* void *stack_data;*/
// Volatile pointer-to-team-pointer: the team field is written by other
// threads when this worker is assigned to a team.
5603 kmp_team_t *(*volatile pteam);
5606 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5608 if (__kmp_env_consistency_check) {
5609 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5613 ompt_data_t *thread_data;
5614 if (ompt_enabled.enabled) {
5615 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5616 *thread_data = ompt_data_none;
5618 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5619 this_thr->th.ompt_thread_info.wait_id = 0;
5620 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5621 if (ompt_enabled.ompt_callback_thread_begin) {
5622 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5623 ompt_thread_worker, thread_data);
5629 if (ompt_enabled.enabled) {
5630 this_thr->th.ompt_thread_info.state = ompt_state_idle;
5633 /* This is the place where threads wait for work */
5634 while (!TCR_4(__kmp_global.g.g_done)) {
5635 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5638 /* wait for work to do */
5639 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5641 /* No tid yet since not part of a team */
5642 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5645 if (ompt_enabled.enabled) {
5646 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5650 pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5652 /* have we been allocated? */
5653 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5654 /* we were just woken up, so run our new task */
5655 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5658 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5659 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5660 (*pteam)->t.t_pkfn));
5662 updateHWFPControl(*pteam);
5665 if (ompt_enabled.enabled) {
5666 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
// Execute the parallel region's microtask for this thread.
5670 rc = (*pteam)->t.t_invoke(gtid);
5674 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5675 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5676 (*pteam)->t.t_pkfn));
5679 if (ompt_enabled.enabled) {
5680 /* no frame set while outside task */
5681 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5683 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5686 /* join barrier after parallel region */
5687 __kmp_join_barrier(gtid);
5690 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5693 if (ompt_enabled.ompt_callback_thread_end) {
5694 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5698 this_thr->th.th_task_team = NULL;
5699 /* run the destructors for the threadprivate data for this thread */
5700 __kmp_common_destroy_gtid(gtid);
5702 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5707 /* ------------------------------------------------------------------------ */
// Thread-specific-data destructor (pthread TSD key destructor): recovers the
// gtid from the TLS value (stored biased by +1) and runs per-thread library
// shutdown via __kmp_internal_end_thread.
// NOTE(review): partial extraction -- some original lines are missing;
// code below is left byte-identical.
5709 void __kmp_internal_end_dest(void *specific_gtid) {
5710 #if KMP_COMPILER_ICC
5711 #pragma warning(push)
5712 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5715 // Make sure no significant bits are lost
// Undo the +1 bias used when the gtid was stored in TLS.
5716 int gtid = (kmp_intptr_t)specific_gtid - 1;
5717 #if KMP_COMPILER_ICC
5718 #pragma warning(pop)
5721 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5722 /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5723 * this is because 0 is reserved for the nothing-stored case */
5725 /* josh: One reason for setting the gtid specific data even when it is being
5726 destroyed by pthread is to allow gtid lookup through thread specific data
5727 (__kmp_gtid_get_specific). Some of the code, especially stat code,
5728 that gets executed in the call to __kmp_internal_end_thread, actually
5729 gets the gtid through the thread specific data. Setting it here seems
5730 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5732 todo: get rid of this after we remove the dependence on
5733 __kmp_gtid_get_specific */
5734 if (gtid >= 0 && KMP_UBER_GTID(gtid))
5735 __kmp_gtid_set_specific(gtid);
5736 #ifdef KMP_TDATA_GTID
5739 __kmp_internal_end_thread(gtid);
5742 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5744 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases
5745 // destructors work perfectly, but in real libomp.so I have no evidence it is
5746 // ever called. However, -fini linker option in makefile.mk works fine.
// ELF destructor for the dynamic library on UNIX: delegates to the common
// atexit-style shutdown path. (See the comment above: the -fini linker
// option is the path known to work reliably.)
// NOTE(review): the closing brace was dropped by the extraction; code left
// byte-identical.
5748 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5749 __kmp_internal_end_atexit();
5752 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5756 /* [Windows] josh: when the atexit handler is called, there may still be more
5757 than one thread alive */
// atexit-time shutdown: performs per-thread teardown via
// __kmp_internal_end_library(-1) and closes the console.
// NOTE(review): the extraction lost the opening "/*" of the long historical
// comment below (lines 5761-5782 were originally inside one block comment);
// code and comment text left byte-identical.
5758 void __kmp_internal_end_atexit(void) {
5759 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5761 josh: ideally, we want to completely shutdown the library in this atexit
5762 handler, but stat code that depends on thread specific data for gtid fails
5763 because that data becomes unavailable at some point during the shutdown, so
5764 we call __kmp_internal_end_thread instead. We should eventually remove the
5765 dependency on __kmp_get_specific_gtid in the stat code and use
5766 __kmp_internal_end_library to cleanly shutdown the library.
5768 // TODO: Can some of this comment about GVS be removed?
5769 I suspect that the offending stat code is executed when the calling thread
5770 tries to clean up a dead root thread's data structures, resulting in GVS
5771 code trying to close the GVS structures for that thread, but since the stat
5772 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5773 the calling thread is cleaning up itself instead of another thread, it get
5774 confused. This happens because allowing a thread to unregister and cleanup
5775 another thread is a recent modification for addressing an issue.
5776 Based on the current design (20050722), a thread may end up
5777 trying to unregister another thread only if thread death does not trigger
5778 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5779 thread specific data destructor function to detect thread death. For
5780 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5781 is nothing. Thus, the workaround is applicable only for Windows static
5783 __kmp_internal_end_library(-1);
5785 __kmp_close_console();
// Fully destroy a worker thread: wake it from the fork barrier if needed,
// join/terminate the OS thread, release its per-thread resources (implicit
// task, fast memory, consistency stack, private commons, task-state stack,
// bget data, affinity mask, hierarchical-barrier data, serial team), and
// clear its __kmp_threads[] slot. Caller must hold __kmp_forkjoin_lock.
// NOTE(review): partial extraction -- some original lines are missing;
// code below is left byte-identical.
5789 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5790 // It is assumed __kmp_forkjoin_lock is acquired.
5794 KMP_DEBUG_ASSERT(thread != NULL);
5796 gtid = thread->th.th_info.ds.ds_gtid;
// If blocktime is finite, the thread is assumed to be parked at the fork
// barrier; release its b_go flag so it can proceed to termination.
5799 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5800 /* Assume the threads are at the fork barrier here */
5802 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5804 /* Need release fence here to prevent seg faults for tree forkjoin barrier
5806 ANNOTATE_HAPPENS_BEFORE(thread);
5807 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5808 __kmp_release_64(&flag);
5811 // Terminate OS thread.
5812 __kmp_reap_worker(thread);
5814 // The thread was killed asynchronously. If it was actively
5815 // spinning in the thread pool, decrement the global count.
5817 // There is a small timing hole here - if the worker thread was just waking
5818 // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5819 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5820 // the global counter might not get updated.
5822 // Currently, this can only happen as the library is unloaded,
5823 // so there are no harmful side effects.
5824 if (thread->th.th_active_in_pool) {
5825 thread->th.th_active_in_pool = FALSE;
5826 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5827 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5831 __kmp_free_implicit_task(thread);
5833 // Free the fast memory for tasking
5835 __kmp_free_fast_memory(thread);
5836 #endif /* USE_FAST_MEMORY */
5838 __kmp_suspend_uninitialize_thread(thread);
// Remove the thread from the global registry before freeing its data.
5840 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5841 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5844 // __kmp_nth was decremented when thread is added to the pool.
5846 #ifdef KMP_ADJUST_BLOCKTIME
5847 /* Adjust blocktime back to user setting or default if necessary */
5848 /* Middle initialization might never have occurred */
5849 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5850 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5851 if (__kmp_nth <= __kmp_avail_proc) {
5852 __kmp_zero_bt = FALSE;
5855 #endif /* KMP_ADJUST_BLOCKTIME */
5857 /* free the memory being used */
5858 if (__kmp_env_consistency_check) {
5859 if (thread->th.th_cons) {
5860 __kmp_free_cons_stack(thread->th.th_cons);
5861 thread->th.th_cons = NULL;
5865 if (thread->th.th_pri_common != NULL) {
5866 __kmp_free(thread->th.th_pri_common);
5867 thread->th.th_pri_common = NULL;
5870 if (thread->th.th_task_state_memo_stack != NULL) {
5871 __kmp_free(thread->th.th_task_state_memo_stack);
5872 thread->th.th_task_state_memo_stack = NULL;
5876 if (thread->th.th_local.bget_data != NULL) {
5877 __kmp_finalize_bget(thread);
5881 #if KMP_AFFINITY_SUPPORTED
5882 if (thread->th.th_affin_mask != NULL) {
5883 KMP_CPU_FREE(thread->th.th_affin_mask);
5884 thread->th.th_affin_mask = NULL;
5886 #endif /* KMP_AFFINITY_SUPPORTED */
5888 #if KMP_USE_HIER_SCHED
5889 if (thread->th.th_hier_bar_data != NULL) {
5890 __kmp_free(thread->th.th_hier_bar_data);
5891 thread->th.th_hier_bar_data = NULL;
// The thread's private serial team is reaped along with the thread.
5895 __kmp_reap_team(thread->th.th_serial_team);
5896 thread->th.th_serial_team = NULL;
5901 } // __kmp_reap_thread
// Core shutdown routine: unregisters the library, reclaims dead roots, reaps
// the monitor thread (when KMP_USE_MONITOR), reaps pooled worker threads and
// teams, waits for unreaped threads to stop spinning, and clears init flags.
// Callers must hold __kmp_initz_lock and __kmp_forkjoin_lock.
// NOTE(review): the embedded line numbers jump (e.g. 5914 -> 5918), so some
// interior lines (declarations, braces) were lost in extraction — verify
// against upstream kmp_runtime.cpp before editing.
5903 static void __kmp_internal_end(void) {
5906 /* First, unregister the library */
5907 __kmp_unregister_library();
5910 /* In Win static library, we can't tell when a root actually dies, so we
5911 reclaim the data structures for any root threads that have died but not
5912 unregistered themselves, in order to shut down cleanly.
5913 In Win dynamic library we also can't tell when a thread dies. */
5914 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5918 for (i = 0; i < __kmp_threads_capacity; i++)
5920 if (__kmp_root[i]->r.r_active)
5922 KMP_MB(); /* Flush all pending memory write invalidates. */
5923 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
// If the scan above found a still-active root (i stopped early), only the
// monitor is reaped; full teardown is deferred.
5925 if (i < __kmp_threads_capacity) {
5927 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5928 KMP_MB(); /* Flush all pending memory write invalidates. */
5930 // Need to check that monitor was initialized before reaping it. If we are
5931 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5932 // __kmp_monitor will appear to contain valid data, but it is only valid in
5933 // the parent process, not the child.
5934 // New behavior (201008): instead of keying off of the flag
5935 // __kmp_init_parallel, the monitor thread creation is keyed off
5936 // of the new flag __kmp_init_monitor.
5937 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5938 if (TCR_4(__kmp_init_monitor)) {
5939 __kmp_reap_monitor(&__kmp_monitor);
5940 TCW_4(__kmp_init_monitor, 0);
5942 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5943 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5944 #endif // KMP_USE_MONITOR
5946 /* TODO move this to cleanup code */
5948 /* make sure that everything has properly ended */
5949 for (i = 0; i < __kmp_threads_capacity; i++) {
5950 if (__kmp_root[i]) {
5951 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5952 // there can be uber threads alive here
5953 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5960 // Reap the worker threads.
5961 // This is valid for now, but be careful if threads are reaped sooner.
5962 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
5963 // Get the next thread from the pool.
5964 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5965 __kmp_thread_pool = thread->th.th_next_pool;
5967 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5968 thread->th.th_next_pool = NULL;
5969 thread->th.th_in_pool = FALSE;
5970 __kmp_reap_thread(thread, 0);
5972 __kmp_thread_pool_insert_pt = NULL;
// Reap pooled teams the same way the thread pool was drained above.
5975 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
5976 // Get the next team from the pool.
5977 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
5978 __kmp_team_pool = team->t.t_next_pool;
5980 team->t.t_next_pool = NULL;
5981 __kmp_reap_team(team);
5984 __kmp_reap_task_teams();
5987 // Threads that are not reaped should not access any resources since they
5988 // are going to be deallocated soon, so the shutdown sequence should wait
5989 // until all threads either exit the final spin-waiting loop or begin
5990 // sleeping after the given blocktime.
5991 for (i = 0; i < __kmp_threads_capacity; i++) {
5992 kmp_info_t *thr = __kmp_threads[i];
5993 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
5998 for (i = 0; i < __kmp_threads_capacity; ++i) {
5999 // TBD: Add some checking...
6000 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6003 /* Make sure all threadprivate destructors get run by joining with all
6004 worker threads before resetting this flag */
6005 TCW_SYNC_4(__kmp_init_common, FALSE);
6007 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6011 // See note above: One of the possible fixes for CQ138434 / CQ140126
6013 // FIXME: push both code fragments down and CSE them?
6014 // push them into __kmp_cleanup() ?
6015 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6016 if (TCR_4(__kmp_init_monitor)) {
6017 __kmp_reap_monitor(&__kmp_monitor);
6018 TCW_4(__kmp_init_monitor, 0);
6020 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6021 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6023 } /* else !__kmp_global.t_active */
6024 TCW_4(__kmp_init_gtid, FALSE);
6025 KMP_MB(); /* Flush all pending memory write invalidates. */
// Library-level shutdown entry point (used e.g. at process exit). Identifies
// the calling thread via gtid_req (or TLS lookup when gtid_req < 0), handles
// the special KMP_GTID_* sentinel values, unregisters an uber root, and then
// performs the real termination under __kmp_initz_lock + __kmp_forkjoin_lock
// via __kmp_internal_end().
// NOTE(review): numbering gaps indicate lines were lost in extraction
// (early returns, closing braces) — verify against upstream before editing.
6033 void __kmp_internal_end_library(int gtid_req) {
6034 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6035 /* this shouldn't be a race condition because __kmp_internal_end() is the
6036 only place to clear __kmp_serial_init */
6037 /* we'll check this later too, after we get the lock */
6038 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6039 // redundant, because the next check will work in any case.
6040 if (__kmp_global.g.g_abort) {
6041 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6045 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6046 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6050 KMP_MB(); /* Flush all pending memory write invalidates. */
6052 /* find out who we are and what we should do */
6054 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6056 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6057 if (gtid == KMP_GTID_SHUTDOWN) {
6058 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6059 "already shutdown\n"));
6061 } else if (gtid == KMP_GTID_MONITOR) {
6062 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6063 "registered, or system shutdown\n"));
6065 } else if (gtid == KMP_GTID_DNE) {
6066 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6068 /* we don't know who we are, but we may still shutdown the library */
6069 } else if (KMP_UBER_GTID(gtid)) {
6070 /* unregister ourselves as an uber thread. gtid is no longer valid */
// A still-active root means shutdown was requested mid-parallel-region:
// flag abort (-1) and done, rather than tearing down under active work.
6071 if (__kmp_root[gtid]->r.r_active) {
6072 __kmp_global.g.g_abort = -1;
6073 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6075 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6081 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6082 __kmp_unregister_root_current_thread(gtid);
6085 /* worker threads may call this function through the atexit handler, if they
6087 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6088 TODO: do a thorough shutdown instead */
6089 #ifdef DUMP_DEBUG_ON_EXIT
6090 if (__kmp_debug_buf)
6091 __kmp_dump_debug_buffer();
6096 /* synchronize the termination process */
6097 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6099 /* have we already finished */
6100 if (__kmp_global.g.g_abort) {
6101 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6103 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6106 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6107 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6111 /* We need this lock to enforce mutex between this reading of
6112 __kmp_threads_capacity and the writing by __kmp_register_root.
6113 Alternatively, we can use a counter of roots that is atomically updated by
6114 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6115 __kmp_internal_end_*. */
6116 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6118 /* now we can safely conduct the actual termination */
6119 __kmp_internal_end();
6121 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6122 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6124 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6126 #ifdef DUMP_DEBUG_ON_EXIT
6127 if (__kmp_debug_buf)
6128 __kmp_dump_debug_buffer();
6132 __kmp_close_console();
6135 __kmp_fini_allocator();
6137 } // __kmp_internal_end_library
// Per-thread shutdown entry point. Similar to __kmp_internal_end_library but
// called from thread-exit paths: worker threads just detach their task team
// and return; an uber thread only completes full teardown when no sibling
// uber roots remain (checked under __kmp_forkjoin_lock) and the runtime is
// not hard-paused.
// NOTE(review): numbering gaps indicate lines were lost in extraction —
// verify against upstream kmp_runtime.cpp before editing.
6139 void __kmp_internal_end_thread(int gtid_req) {
6142 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6143 /* this shouldn't be a race condition because __kmp_internal_end() is the
6144 * only place to clear __kmp_serial_init */
6145 /* we'll check this later too, after we get the lock */
6146 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6147 // redundant, because the next check will work in any case.
6148 if (__kmp_global.g.g_abort) {
6149 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6153 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6154 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6158 KMP_MB(); /* Flush all pending memory write invalidates. */
6160 /* find out who we are and what we should do */
6162 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6164 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6165 if (gtid == KMP_GTID_SHUTDOWN) {
6166 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6167 "already shutdown\n"));
6169 } else if (gtid == KMP_GTID_MONITOR) {
6170 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6171 "registered, or system shutdown\n"));
6173 } else if (gtid == KMP_GTID_DNE) {
6174 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6177 /* we don't know who we are */
6178 } else if (KMP_UBER_GTID(gtid)) {
6179 /* unregister ourselves as an uber thread. gtid is no longer valid */
6180 if (__kmp_root[gtid]->r.r_active) {
6181 __kmp_global.g.g_abort = -1;
6182 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6184 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6188 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6190 __kmp_unregister_root_current_thread(gtid);
6193 /* just a worker thread, let's leave */
6194 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
// Detach the worker from its task team; the rest of teardown is left to
// the owning root / library destructor.
6197 __kmp_threads[gtid]->th.th_task_team = NULL;
6201 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6207 if (__kmp_pause_status != kmp_hard_paused)
6208 // AC: lets not shutdown the dynamic library at the exit of uber thread,
6209 // because we will better shutdown later in the library destructor.
6211 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6215 /* synchronize the termination process */
6216 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6218 /* have we already finished */
6219 if (__kmp_global.g.g_abort) {
6220 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6222 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6225 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6226 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6230 /* We need this lock to enforce mutex between this reading of
6231 __kmp_threads_capacity and the writing by __kmp_register_root.
6232 Alternatively, we can use a counter of roots that is atomically updated by
6233 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6234 __kmp_internal_end_*. */
6236 /* should we finish the run-time? are all siblings done? */
6237 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6239 for (i = 0; i < __kmp_threads_capacity; ++i) {
6240 if (KMP_UBER_GTID(i)) {
// A sibling uber root is still registered: bail out and let the last
// root (or the library destructor) complete the teardown.
6243 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6244 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6245 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6250 /* now we can safely conduct the actual termination */
6252 __kmp_internal_end();
6254 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6255 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6257 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6259 #ifdef DUMP_DEBUG_ON_EXIT
6260 if (__kmp_debug_buf)
6261 __kmp_dump_debug_buffer();
6263 } // __kmp_internal_end_thread
6265 // -----------------------------------------------------------------------------
6266 // Library registration stuff.
// State for the one-copy-of-the-runtime registration protocol. Each trailing
// comment below documents the declaration on the line above it.
6268 static long __kmp_registration_flag = 0;
6269 // Random value used to indicate library initialization.
6270 static char *__kmp_registration_str = NULL;
6271 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
// Build the per-process name of the registration environment variable:
// "__KMP_REGISTERED_LIB_<pid>". Caller owns the returned string
// (allocated by __kmp_str_format).
6273 static inline char *__kmp_reg_status_name() {
6274 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6275 each thread. If registration and unregistration go in different threads
6276 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6277 env var can not be found, because the name will contain different pid. */
6278 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6279 } // __kmp_reg_status_name
// Detect whether another copy of the OpenMP runtime is already loaded in this
// process. Publishes "<flag address>-<flag value>-<library file>" in the
// __KMP_REGISTERED_LIB_<pid> env var; on collision, parses the existing value
// to decide whether the other copy is alive (fatal unless
// KMP_DUPLICATE_LIB_OK) or dead (unset the variable and retry).
// NOTE(review): numbering gaps (switch braces, loop header) indicate lines
// lost in extraction — verify against upstream before editing.
6281 void __kmp_register_library_startup(void) {
6283 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6289 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6290 __kmp_initialize_system_tick();
6292 __kmp_read_system_time(&time.dtime);
// Mix a time-derived low word into a recognizable 0xCAFE.... flag value.
6293 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6294 __kmp_registration_str =
6295 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6296 __kmp_registration_flag, KMP_LIBRARY_FILE);
6298 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6299 __kmp_registration_str));
6303 char *value = NULL; // Actual value of the environment variable.
6305 // Set environment variable, but do not overwrite if it is exist.
6306 __kmp_env_set(name, __kmp_registration_str, 0);
6307 // Check the variable is written.
6308 value = __kmp_env_get(name);
6309 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6311 done = 1; // Ok, environment variable set successfully, exit the loop.
6315 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6316 // Check whether it alive or dead.
6317 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6319 char *flag_addr_str = NULL;
6320 char *flag_val_str = NULL;
6321 char const *file_name = NULL;
6322 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6323 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6326 long *flag_addr = 0;
6328 KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6329 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6330 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6331 // First, check whether environment-encoded address is mapped into
6333 // If so, dereference it to see if it still has the right value.
6334 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6337 // If not, then we know the other copy of the library is no longer
6344 case 0: // Cannot parse environment variable -- neighbor status unknown.
6345 // Assume it is the incompatible format of future version of the
6346 // library. Assume the other library is alive.
6347 // WARN( ... ); // TODO: Issue a warning.
6348 file_name = "unknown library";
6350 // Attention! Falling to the next case. That's intentional.
6351 case 1: { // Neighbor is alive.
6352 // Check it is allowed.
6353 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6354 if (!__kmp_str_match_true(duplicate_ok)) {
6355 // That's not allowed. Issue fatal error.
6356 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6357 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6359 KMP_INTERNAL_FREE(duplicate_ok);
6360 __kmp_duplicate_library_ok = 1;
6361 done = 1; // Exit the loop.
6363 case 2: { // Neighbor is dead.
6364 // Clear the variable and try to register library again.
6365 __kmp_env_unset(name);
6367 default: { KMP_DEBUG_ASSERT(0); } break;
6370 KMP_INTERNAL_FREE((void *)value);
6372 KMP_INTERNAL_FREE((void *)name);
// Undo __kmp_register_library_startup: if the __KMP_REGISTERED_LIB_<pid>
// env var still holds our registration string, unset it; then free the
// registration state and reset the flag/string globals.
6376 void __kmp_unregister_library(void) {
6378 char *name = __kmp_reg_status_name();
6379 char *value = __kmp_env_get(name);
6381 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6382 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6383 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6384 // Ok, this is our variable. Delete it.
6385 __kmp_env_unset(name);
6388 KMP_INTERNAL_FREE(__kmp_registration_str);
6389 KMP_INTERNAL_FREE(value);
6390 KMP_INTERNAL_FREE(name);
// Clear globals so a later re-registration starts from a clean state.
6392 __kmp_registration_flag = 0;
6393 __kmp_registration_str = NULL;
6397 // End of Library registration stuff.
6398 // -----------------------------------------------------------------------------
6400 #if KMP_MIC_SUPPORTED
// Classify the processor as KNC (mic2), KNL-family (mic3), or non-MIC by
// matching CPUID leaf 1 EAX signature bits; result stored in __kmp_mic_type.
6402 static void __kmp_check_mic_type() {
6403 kmp_cpuid_t cpuid_state = {0};
6404 kmp_cpuid_t *cs_p = &cpuid_state;
6405 __kmp_x86_cpuid(1, 0, cs_p);
6406 // We don't support mic1 at the moment
6407 if ((cs_p->eax & 0xff0) == 0xB10) {
6408 __kmp_mic_type = mic2;
6409 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6410 __kmp_mic_type = mic3;
6412 __kmp_mic_type = non_mic;
6416 #endif /* KMP_MIC_SUPPORTED */
// One-time serial initialization of the runtime. Validates type sizes,
// initializes the allocator and locks, registers the library copy, sets
// default thread-count / blocktime / barrier globals, reads the environment,
// allocates the __kmp_threads / __kmp_root arrays, registers the initial
// (uber) root, and installs atexit/signal handlers. Caller must hold
// __kmp_initz_lock (see __kmp_serial_initialize).
// NOTE(review): numbering gaps indicate lines lost in extraction (e.g. the
// declarations of i/gtid/size, several closing braces) — verify against
// upstream kmp_runtime.cpp before editing.
6418 static void __kmp_do_serial_initialize(void) {
6422 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
// Sanity-check the fixed-width kmp_* typedefs on this platform.
6424 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6425 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6426 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6427 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6428 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6434 __kmp_validate_locks();
6436 /* Initialize internal memory allocator */
6437 __kmp_init_allocator();
6439 /* Register the library startup via an environment variable and check to see
6440 whether another copy of the library is already registered. */
6442 __kmp_register_library_startup();
6444 /* TODO reinitialization of library */
6445 if (TCR_4(__kmp_global.g.g_done)) {
6446 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6449 __kmp_global.g.g_abort = 0;
6450 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6452 /* initialize the locks */
6453 #if KMP_USE_ADAPTIVE_LOCKS
6454 #if KMP_DEBUG_ADAPTIVE_LOCKS
6455 __kmp_init_speculative_stats();
6458 #if KMP_STATS_ENABLED
6461 __kmp_init_lock(&__kmp_global_lock);
6462 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6463 __kmp_init_lock(&__kmp_debug_lock);
// Per-size atomic locks used by the kmp_atomic reduction/atomic support.
6464 __kmp_init_atomic_lock(&__kmp_atomic_lock);
6465 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6466 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6467 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6468 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6469 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6470 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6471 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6472 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6473 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6474 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6475 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6476 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6477 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6478 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6480 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6482 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6484 /* conduct initialization and initial setup of configuration */
6486 __kmp_runtime_initialize();
6488 #if KMP_MIC_SUPPORTED
6489 __kmp_check_mic_type();
6492 // Some global variable initialization moved here from kmp_env_initialize()
6496 __kmp_abort_delay = 0;
6498 // From __kmp_init_dflt_team_nth()
6499 /* assume the entire machine will be used */
6500 __kmp_dflt_team_nth_ub = __kmp_xproc;
6501 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6502 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6504 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6505 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6507 __kmp_max_nth = __kmp_sys_max_nth;
6508 __kmp_cg_max_nth = __kmp_sys_max_nth;
6509 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6510 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6511 __kmp_teams_max_nth = __kmp_sys_max_nth;
6514 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6516 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6518 __kmp_monitor_wakeups =
6519 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6520 __kmp_bt_intervals =
6521 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6523 // From "KMP_LIBRARY" part of __kmp_env_initialize()
6524 __kmp_library = library_throughput;
6525 // From KMP_SCHEDULE initialization
6526 __kmp_static = kmp_sch_static_balanced;
6527 // AC: do not use analytical here, because it is non-monotonous
6528 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6529 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6530 // need to repeat assignment
6531 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6532 // bit control and barrier method control parts
6533 #if KMP_FAST_REDUCTION_BARRIER
6534 #define kmp_reduction_barrier_gather_bb ((int)1)
6535 #define kmp_reduction_barrier_release_bb ((int)1)
6536 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6537 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6538 #endif // KMP_FAST_REDUCTION_BARRIER
// Install default branch bits and patterns for every barrier kind, then
// override the reduction barrier with the tuned values above.
6539 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6540 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6541 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6542 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6543 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6544 #if KMP_FAST_REDUCTION_BARRIER
6545 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6546 // lin_64 ): hyper,1
6547 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6548 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6549 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6550 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6552 #endif // KMP_FAST_REDUCTION_BARRIER
6554 #if KMP_FAST_REDUCTION_BARRIER
6555 #undef kmp_reduction_barrier_release_pat
6556 #undef kmp_reduction_barrier_gather_pat
6557 #undef kmp_reduction_barrier_release_bb
6558 #undef kmp_reduction_barrier_gather_bb
6559 #endif // KMP_FAST_REDUCTION_BARRIER
6560 #if KMP_MIC_SUPPORTED
6561 if (__kmp_mic_type == mic2) { // KNC
6562 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6563 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6564 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6565 1; // forkjoin release
6566 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6567 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6569 #if KMP_FAST_REDUCTION_BARRIER
6570 if (__kmp_mic_type == mic2) { // KNC
6571 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6572 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6574 #endif // KMP_FAST_REDUCTION_BARRIER
6575 #endif // KMP_MIC_SUPPORTED
6577 // From KMP_CHECKS initialization
6579 __kmp_env_checks = TRUE; /* development versions have the extra checks */
6581 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6584 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6585 __kmp_foreign_tp = TRUE;
6587 __kmp_global.g.g_dynamic = FALSE;
6588 __kmp_global.g.g_dynamic_mode = dynamic_default;
6590 __kmp_env_initialize(NULL);
6592 // Print all messages in message catalog for testing purposes.
6594 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6595 if (__kmp_str_match_true(val)) {
6596 kmp_str_buf_t buffer;
6597 __kmp_str_buf_init(&buffer);
6598 __kmp_i18n_dump_catalog(&buffer);
6599 __kmp_printf("%s", buffer.str);
6600 __kmp_str_buf_free(&buffer);
6602 __kmp_env_free(&val);
6605 __kmp_threads_capacity =
6606 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6607 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6608 __kmp_tp_capacity = __kmp_default_tp_capacity(
6609 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6611 // If the library is shut down properly, both pools must be NULL. Just in
6612 // case, set them to NULL -- some memory may leak, but subsequent code will
6613 // work even if pools are not freed.
6614 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6615 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6616 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6617 __kmp_thread_pool = NULL;
6618 __kmp_thread_pool_insert_pt = NULL;
6619 __kmp_team_pool = NULL;
6621 /* Allocate all of the variable sized records */
6622 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6624 /* Since allocation is cache-aligned, just add extra padding at the end */
6626 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
// __kmp_threads and __kmp_root share one cache-aligned allocation;
// __kmp_root starts right after the threads array.
6628 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6629 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6630 sizeof(kmp_info_t *) * __kmp_threads_capacity);
6632 /* init thread counts */
6633 KMP_DEBUG_ASSERT(__kmp_all_nth ==
6634 0); // Asserts fail if the library is reinitializing and
6635 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6639 /* setup the uber master thread and hierarchy */
6640 gtid = __kmp_register_root(TRUE);
6641 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6642 KMP_ASSERT(KMP_UBER_GTID(gtid));
6643 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6645 KMP_MB(); /* Flush all pending memory write invalidates. */
6647 __kmp_common_initialize();
6650 /* invoke the child fork handler */
6651 __kmp_register_atfork();
6654 #if !KMP_DYNAMIC_LIB
6656 /* Invoke the exit handler when the program finishes, only for static
6657 library. For dynamic library, we already have _fini and DllMain. */
6658 int rc = atexit(__kmp_internal_end_atexit);
6660 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6666 #if KMP_HANDLE_SIGNALS
6668 /* NOTE: make sure that this is called before the user installs their own
6669 signal handlers so that the user handlers are called first. this way they
6670 can return false, not call our handler, avoid terminating the library, and
6671 continue execution where they left off. */
6672 __kmp_install_signals(FALSE);
6673 #endif /* KMP_OS_UNIX */
6675 __kmp_install_signals(TRUE);
6676 #endif /* KMP_OS_WINDOWS */
6679 /* we have finished the serial initialization */
6680 __kmp_init_counter++;
6682 __kmp_init_serial = TRUE;
6684 if (__kmp_settings) {
6688 if (__kmp_display_env || __kmp_display_env_verbose) {
6689 __kmp_env_print_2();
6698 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
// Thread-safe wrapper: double-checks __kmp_init_serial around
// __kmp_initz_lock and runs __kmp_do_serial_initialize() at most once.
6701 void __kmp_serial_initialize(void) {
6702 if (__kmp_init_serial) {
6705 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6706 if (__kmp_init_serial) {
6707 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6710 __kmp_do_serial_initialize();
6711 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
// Middle-phase initialization: runs serial init if needed, initializes
// affinity (when supported) and applies masks to registered roots, fills in
// default team size (__kmp_dflt_team_nth) from cores/procs, clamps it to
// [KMP_MIN_NTH, __kmp_sys_max_nth], propagates it to existing roots, and
// adjusts blocktime. Caller must hold __kmp_initz_lock.
// NOTE(review): numbering gaps indicate lines lost in extraction — verify
// against upstream kmp_runtime.cpp before editing.
6714 static void __kmp_do_middle_initialize(void) {
6716 int prev_dflt_team_nth;
6718 if (!__kmp_init_serial) {
6719 __kmp_do_serial_initialize();
6722 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6724 // Save the previous value for the __kmp_dflt_team_nth so that
6725 // we can avoid some reinitialization if it hasn't changed.
6726 prev_dflt_team_nth = __kmp_dflt_team_nth;
6728 #if KMP_AFFINITY_SUPPORTED
6729 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6730 // number of cores on the machine.
6731 __kmp_affinity_initialize();
6733 // Run through the __kmp_threads array and set the affinity mask
6734 // for each root thread that is currently registered with the RTL.
6735 for (i = 0; i < __kmp_threads_capacity; i++) {
6736 if (TCR_PTR(__kmp_threads[i]) != NULL) {
6737 __kmp_affinity_set_init_mask(i, TRUE);
6740 #endif /* KMP_AFFINITY_SUPPORTED */
6742 KMP_ASSERT(__kmp_xproc > 0);
6743 if (__kmp_avail_proc == 0) {
6744 __kmp_avail_proc = __kmp_xproc;
6747 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6750 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6751 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6756 if (__kmp_dflt_team_nth == 0) {
6757 #ifdef KMP_DFLT_NTH_CORES
6758 // Default #threads = #cores
6759 __kmp_dflt_team_nth = __kmp_ncores;
6760 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6761 "__kmp_ncores (%d)\n",
6762 __kmp_dflt_team_nth));
6764 // Default #threads = #available OS procs
6765 __kmp_dflt_team_nth = __kmp_avail_proc;
6766 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6767 "__kmp_avail_proc(%d)\n",
6768 __kmp_dflt_team_nth));
6769 #endif /* KMP_DFLT_NTH_CORES */
6772 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6773 __kmp_dflt_team_nth = KMP_MIN_NTH;
6775 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6776 __kmp_dflt_team_nth = __kmp_sys_max_nth;
6779 // There's no harm in continuing if the following check fails,
6780 // but it indicates an error in the previous logic.
6781 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6783 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6784 // Run through the __kmp_threads array and set the num threads icv for each
6785 // root thread that is currently registered with the RTL (which has not
6786 // already explicitly set its nthreads-var with a call to
6787 // omp_set_num_threads()).
6788 for (i = 0; i < __kmp_threads_capacity; i++) {
6789 kmp_info_t *thread = __kmp_threads[i];
6792 if (thread->th.th_current_task->td_icvs.nproc != 0)
6795 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6800 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6801 __kmp_dflt_team_nth));
6803 #ifdef KMP_ADJUST_BLOCKTIME
6804 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6805 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6806 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6807 if (__kmp_nth > __kmp_avail_proc) {
6808 __kmp_zero_bt = TRUE;
6811 #endif /* KMP_ADJUST_BLOCKTIME */
6813 /* we have finished middle initialization */
6814 TCW_SYNC_4(__kmp_init_middle, TRUE);
6816 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
// Thread-safe wrapper: double-checks __kmp_init_middle around
// __kmp_initz_lock and runs __kmp_do_middle_initialize() at most once.
6819 void __kmp_middle_initialize(void) {
6820 if (__kmp_init_middle) {
6823 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6824 if (__kmp_init_middle) {
6825 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6828 __kmp_do_middle_initialize();
6829 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
// Final initialization phase, run before the first parallel region: ensures
// middle init, resumes from hard pause, captures x87/MXCSR control words on
// x86 (workers copy these at startup), installs signal handlers, initializes
// suspension, and selects the dynamic adjustment mode. Sets
// __kmp_init_parallel under __kmp_initz_lock.
// NOTE(review): numbering gaps indicate lines lost in extraction (early
// returns, closing braces) — verify against upstream before editing.
6832 void __kmp_parallel_initialize(void) {
6833 int gtid = __kmp_entry_gtid(); // this might be a new root
6835 /* synchronize parallel initialization (for sibling) */
6836 if (TCR_4(__kmp_init_parallel))
6838 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6839 if (TCR_4(__kmp_init_parallel)) {
6840 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6844 /* TODO reinitialization after we have already shut down */
6845 if (TCR_4(__kmp_global.g.g_done)) {
6848 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6849 __kmp_infinite_loop();
6852 /* jc: The lock __kmp_initz_lock is already held, so calling
6853 __kmp_serial_initialize would cause a deadlock. So we call
6854 __kmp_do_serial_initialize directly. */
6855 if (!__kmp_init_middle) {
6856 __kmp_do_middle_initialize();
6858 __kmp_resume_if_hard_paused();
6860 /* begin initialization */
6861 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6862 KMP_ASSERT(KMP_UBER_GTID(gtid));
6864 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6865 // Save the FP control regs.
6866 // Worker threads will set theirs to these values at thread startup.
6867 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6868 __kmp_store_mxcsr(&__kmp_init_mxcsr);
6869 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6870 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6873 #if KMP_HANDLE_SIGNALS
6874 /* must be after __kmp_serial_initialize */
6875 __kmp_install_signals(TRUE);
6879 __kmp_suspend_initialize();
6881 #if defined(USE_LOAD_BALANCE)
6882 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6883 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6886 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6887 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6891 if (__kmp_version) {
6892 __kmp_print_version_2();
6895 /* we have finished parallel initialization */
6896 TCW_SYNC_4(__kmp_init_parallel, TRUE);
6899 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6901 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6904 /* ------------------------------------------------------------------------ */
// Per-thread setup executed just before invoking the microtask for a
// parallel region: resets the construct counter and dispatch buffer
// indices, and pushes a consistency-check frame when enabled.
// NOTE(review): the parameter list is truncated here (gap after 6906);
// the team parameter used in the body is declared on a lost line.
6906 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6908 kmp_disp_t *dispatch;
6912 /* none of the threads have encountered any constructs, yet. */
6913 this_thr->th.th_local.this_construct = 0;
6914 #if KMP_CACHE_MANAGE
6915 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6916 #endif /* KMP_CACHE_MANAGE */
6917 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6918 KMP_DEBUG_ASSERT(dispatch);
6919 KMP_DEBUG_ASSERT(team->t.t_dispatch);
6920 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6921 // this_thr->th.th_info.ds.ds_tid ] );
6923 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6924 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
6925 if (__kmp_env_consistency_check)
6926 __kmp_push_parallel(gtid, team->t.t_ident);
6928 KMP_MB(); /* Flush all pending memory write invalidates. */
// Per-thread teardown executed just after the microtask returns: pops the
// consistency-check frame (when enabled) and finishes the implicit task.
// NOTE(review): parameter list truncated (gap after 6931) — the team
// parameter is declared on a lost line.
6931 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6933 if (__kmp_env_consistency_check)
6934 __kmp_pop_parallel(gtid, team->t.t_ident);
6936 __kmp_finish_implicit_task(this_thr);
// Invoke the team's microtask (t_pkfn) for the calling thread: runs the
// before/after hooks, notifies ittnotify of entry/exit into user code,
// emits OMPT implicit-task callbacks when enabled, and maintains the
// KMP_STATS timers/state. Returns the microtask's result (rc).
// NOTE(review): numbering gaps indicate lines lost in extraction (e.g. the
// declaration of rc, OMPT dummy, closing braces, return) — verify against
// upstream kmp_runtime.cpp before editing.
6939 int __kmp_invoke_task_func(int gtid) {
6941 int tid = __kmp_tid_from_gtid(gtid);
6942 kmp_info_t *this_thr = __kmp_threads[gtid];
6943 kmp_team_t *team = this_thr->th.th_team;
6945 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6947 if (__itt_stack_caller_create_ptr) {
6948 __kmp_itt_stack_callee_enter(
6950 team->t.t_stack_id); // inform ittnotify about entering user's code
6952 #endif /* USE_ITT_BUILD */
6953 #if INCLUDE_SSC_MARKS
6954 SSC_MARK_INVOKING();
6959 void **exit_runtime_p;
6960 ompt_data_t *my_task_data;
6961 ompt_data_t *my_parallel_data;
6964 if (ompt_enabled.enabled) {
6966 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
6968 exit_runtime_p = &dummy;
6972 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6973 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
6974 if (ompt_enabled.ompt_callback_implicit_task) {
6975 ompt_team_size = team->t.t_nproc;
6976 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
6977 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
6978 __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
6979 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
6983 #if KMP_STATS_ENABLED
6984 stats_state_e previous_state = KMP_GET_THREAD_STATE();
6985 if (previous_state == stats_state_e::TEAMS_REGION) {
6986 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
6988 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
6990 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
6993 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
6994 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7001 *exit_runtime_p = NULL;
7004 #if KMP_STATS_ENABLED
7005 if (previous_state == stats_state_e::TEAMS_REGION) {
7006 KMP_SET_THREAD_STATE(previous_state);
7008 KMP_POP_PARTITIONED_TIMER();
7012 if (__itt_stack_caller_create_ptr) {
7013 __kmp_itt_stack_callee_leave(
7015 team->t.t_stack_id); // inform ittnotify about leaving user's code
7017 #endif /* USE_ITT_BUILD */
7018 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
// Body executed by each league master in a teams construct: makes this
// thread a new contention-group (CG) root, then forks the team's wrapped
// microtask via __kmp_fork_call and joins without a join barrier (workers
// remain parked on the fork barrier until the next parallel region).
7023 void __kmp_teams_master(int gtid) {
7024 // This routine is called by all master threads in teams construct
7025 kmp_info_t *thr = __kmp_threads[gtid];
7026 kmp_team_t *team = thr->th.th_team;
7027 ident_t *loc = team->t.t_ident;
7028 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7029 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7030 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7031 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7032 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7034 // This thread is a new CG root. Set up the proper variables.
7035 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7036 tmp->cg_root = thr; // Make thr the CG root
7037 // Init to thread limit that was stored when league masters were forked
7038 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7039 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7040 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7041 " cg_nthreads to 1\n",
7043 tmp->up = thr->th.th_cg_roots; // push new node onto the CG-root stack
7044 thr->th.th_cg_roots = tmp;
7046 // Launch league of teams now, but not let workers execute
7047 // (they hang on fork barrier until next parallel)
7048 #if INCLUDE_SSC_MARKS
7051 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7052 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7053 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7054 #if INCLUDE_SSC_MARKS
7057 // If the team size was reduced from the limit, set it to the new size
7058 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7059 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7060 // AC: last parameter "1" eliminates join barrier which won't work because
7061 // worker threads are in a fork barrier waiting for more parallel regions
7062 __kmp_join_call(loc, gtid
// Entry point used when invoking the teams-construct master: sanity-checks
// that the team's pkfn is __kmp_teams_master (unless serialized), then runs
// __kmp_teams_master between the standard pre/post invocation hooks
// (tid is always 0 here — this thread is the team master).
7071 int __kmp_invoke_teams_master(int gtid) {
7072 kmp_info_t *this_thr = __kmp_threads[gtid];
7073 kmp_team_t *team = this_thr->th.th_team;
7075 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7076 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7077 (void *)__kmp_teams_master);
7079 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7080 __kmp_teams_master(gtid);
7081 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7085 /* this sets the requested number of threads for the next parallel region
7086 encountered by this team. since this should be enclosed in the forkjoin
7087 critical section it should avoid race conditions with assymmetrical nested
// Record the requested thread count (e.g. from a num_threads clause) for
// the next parallel region this thread encounters. Non-positive requests
// are ignored, leaving th_set_nproc unchanged.
7090 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7091 kmp_info_t *thr = __kmp_threads[gtid];
7093 if (num_threads > 0)
7094 thr->th.th_set_nproc = num_threads;
7097 /* this sets the requested number of teams for the teams region and/or
7098 the number of threads for the next parallel region encountered */
// Record num_teams / num_threads requests for an upcoming teams region.
// Clamps both against __kmp_teams_max_nth (warning once via
// __kmp_reserve_warn for user-visible clamps), derives a default
// num_threads from __kmp_avail_proc when the user gave 0, and stores the
// results in th_teams_size and the current task's thread_limit ICV.
7099 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7101 kmp_info_t *thr = __kmp_threads[gtid];
7102 KMP_DEBUG_ASSERT(num_teams >= 0);
7103 KMP_DEBUG_ASSERT(num_threads >= 0);
7106 num_teams = 1; // default number of teams is 1.
7107 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7108 if (!__kmp_reserve_warn) {
7109 __kmp_reserve_warn = 1; // warn only once per process
7110 __kmp_msg(kmp_ms_warning,
7111 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7112 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7114 num_teams = __kmp_teams_max_nth;
7116 // Set number of teams (number of threads in the outer "parallel" of the
7118 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7120 // Remember the number of threads for inner parallel regions
7121 if (num_threads == 0) {
7122 if (!TCR_4(__kmp_init_middle))
7123 __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7124 num_threads = __kmp_avail_proc / num_teams;
7125 if (num_teams * num_threads > __kmp_teams_max_nth) {
7126 // adjust num_threads w/o warning as it is not user setting
7127 num_threads = __kmp_teams_max_nth / num_teams;
7130 // This thread will be the master of the league masters
7131 // Store new thread limit; old limit is saved in th_cg_roots list
7132 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7134 if (num_teams * num_threads > __kmp_teams_max_nth) {
7135 int new_threads = __kmp_teams_max_nth / num_teams;
7136 if (!__kmp_reserve_warn) { // user asked for too many threads
7137 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7138 __kmp_msg(kmp_ms_warning,
7139 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7140 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7142 num_threads = new_threads;
7145 thr->th.th_teams_size.nth = num_threads;
7148 // Set the proc_bind var to use in the following parallel region.
// Record the proc_bind policy to apply to the next parallel region
// encountered by this thread.
7149 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7150 kmp_info_t *thr = __kmp_threads[gtid];
7151 thr->th.th_set_proc_bind = proc_bind;
7154 /* Launch the worker threads into the microtask. */
// Master-side fork: reset per-team construct/ordered state and the
// dispatch buffer identifiers, then release the workers into the
// microtask via the fork barrier. Must be called by the team master
// (asserted below).
7156 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7157 kmp_info_t *this_thr = __kmp_threads[gtid];
7161 #endif /* KMP_DEBUG */
7163 KMP_DEBUG_ASSERT(team);
7164 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7165 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7166 KMP_MB(); /* Flush all pending memory write invalidates. */
7168 team->t.t_construct = 0; /* no single directives seen yet */
7169 team->t.t_ordered.dt.t_value =
7170 0; /* thread 0 enters the ordered section first */
7172 /* Reset the identifiers on the dispatch buffer */
7173 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7174 if (team->t.t_max_nproc > 1) {
7176 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7177 team->t.t_disp_buffer[i].buffer_index = i;
7178 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7181 team->t.t_disp_buffer[0].buffer_index = 0;
7182 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7185 KMP_MB(); /* Flush all pending memory write invalidates. */
7186 KMP_ASSERT(this_thr->th.th_team == team);
// Debug-only check: every team member already sees the final team size.
7189 for (f = 0; f < team->t.t_nproc; f++) {
7190 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7191 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7193 #endif /* KMP_DEBUG */
7195 /* release the worker threads so they may begin working */
7196 __kmp_fork_barrier(gtid, 0);
// Master-side join: wait for all team members at the join barrier, then
// (if an OMPT tool is attached and this thread was waiting at the implicit
// barrier) report barrier scope_end and implicit-task scope_end events.
7199 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7200 kmp_info_t *this_thr = __kmp_threads[gtid];
7202 KMP_DEBUG_ASSERT(team);
7203 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7204 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7205 KMP_MB(); /* Flush all pending memory write invalidates. */
7207 /* Join barrier after fork */
// Debug aid: dump runtime structures when the recorded team size disagrees
// with this thread's view before asserting.
7210 if (__kmp_threads[gtid] &&
7211 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7212 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7213 __kmp_threads[gtid]);
7214 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7215 "team->t.t_nproc=%d\n",
7216 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7218 __kmp_print_structure();
7220 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7221 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7222 #endif /* KMP_DEBUG */
7224 __kmp_join_barrier(gtid); /* wait for everyone */
7226 if (ompt_enabled.enabled &&
7227 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7228 int ds_tid = this_thr->th.th_info.ds.ds_tid;
7229 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7230 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7232 void *codeptr = NULL;
// Only the master has a meaningful return address to report to the tool.
7233 if (KMP_MASTER_TID(ds_tid) &&
7234 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7235 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7236 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7238 if (ompt_enabled.ompt_callback_sync_region_wait) {
7239 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7240 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7243 if (ompt_enabled.ompt_callback_sync_region) {
7244 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7245 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
// Workers end their implicit task here; the master's ends elsewhere.
7249 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7250 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7251 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7256 KMP_MB(); /* Flush all pending memory write invalidates. */
7257 KMP_ASSERT(this_thr->th.th_team == team);
7260 /* ------------------------------------------------------------------------ */
7262 #ifdef USE_LOAD_BALANCE
7264 // Return the worker threads actively spinning in the hot team, if we
7265 // are at the outermost level of parallelism. Otherwise, return 0.
// Count workers in this root's hot team that are contributing load.
// With infinite blocktime (KMP_MAX_BLOCKTIME) all workers spin, so every
// thread except the master counts; otherwise only threads whose th_active
// flag is set are counted.
7266 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7269 kmp_team_t *hot_team;
7271 if (root->r.r_active) {
7274 hot_team = root->r.r_hot_team;
7275 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7276 return hot_team->t.t_nproc - 1; // Don't count master thread
7279 // Skip the master thread - it is accounted for elsewhere.
7281 for (i = 1; i < hot_team->t.t_nproc; i++) {
7282 if (hot_team->t.t_threads[i]->th.th_active) {
7289 // Perform an automatic adjustment to the number of
7290 // threads used by the next parallel region.
// Dynamic-mode load-balance heuristic: pick a thread count for the next
// parallel region based on current system load (from __kmp_get_load_balance)
// versus the threads this runtime already contributes. Falls back to the
// thread-limit algorithm permanently if /proc load info is unavailable.
// Result is clamped to [KMP_MIN_NTH, set_nproc].
7291 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7294 int hot_team_active;
7295 int team_curr_active;
7298 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7300 KMP_DEBUG_ASSERT(root);
7301 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7302 ->th.th_current_task->td_icvs.dynamic == TRUE);
7303 KMP_DEBUG_ASSERT(set_nproc > 1);
// NOTE(review): debug builds assert set_nproc > 1 above, yet the release
// path below still handles set_nproc == 1 — intentional belt-and-braces?
7305 if (set_nproc == 1) {
7306 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7310 // Threads that are active in the thread pool, active in the hot team for this
7311 // particular root (if we are at the outer par level), and the currently
7312 // executing thread (to become the master) are available to add to the new
7313 // team, but are currently contributing to the system load, and must be
7315 pool_active = __kmp_thread_pool_active_nth;
7316 hot_team_active = __kmp_active_hot_team_nproc(root);
7317 team_curr_active = pool_active + hot_team_active + 1;
7319 // Check the system load.
7320 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7321 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7322 "hot team active = %d\n",
7323 system_active, pool_active, hot_team_active));
7325 if (system_active < 0) {
7326 // There was an error reading the necessary info from /proc, so use the
7327 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7328 // = dynamic_thread_limit, we shouldn't wind up getting back here.
7329 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7330 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7332 // Make this call behave like the thread limit algorithm.
7333 retval = __kmp_avail_proc - __kmp_nth +
7334 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7335 if (retval > set_nproc) {
7338 if (retval < KMP_MIN_NTH) {
7339 retval = KMP_MIN_NTH;
7342 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7347 // There is a slight delay in the load balance algorithm in detecting new
7348 // running procs. The real system load at this instant should be at least as
7349 // large as the #active omp thread that are available to add to the team.
7350 if (system_active < team_curr_active) {
7351 system_active = team_curr_active;
7353 retval = __kmp_avail_proc - system_active + team_curr_active;
7354 if (retval > set_nproc) {
7357 if (retval < KMP_MIN_NTH) {
7358 retval = KMP_MIN_NTH;
7361 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7363 } // __kmp_load_balance_nproc()
7365 #endif /* USE_LOAD_BALANCE */
7367 /* ------------------------------------------------------------------------ */
7369 /* NOTE: this is called with the __kmp_init_lock held */
// Tear down the runtime in reverse initialization order (parallel ->
// middle -> serial), then free thread/root tables, locks, nested-nth and
// proc-bind arrays, the affinity format string, and close the message
// catalog. Caller holds __kmp_init_lock (see NOTE above this function).
7370 void __kmp_cleanup(void) {
7373 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7375 if (TCR_4(__kmp_init_parallel)) {
7376 #if KMP_HANDLE_SIGNALS
7377 __kmp_remove_signals();
7379 TCW_4(__kmp_init_parallel, FALSE);
7382 if (TCR_4(__kmp_init_middle)) {
7383 #if KMP_AFFINITY_SUPPORTED
7384 __kmp_affinity_uninitialize();
7385 #endif /* KMP_AFFINITY_SUPPORTED */
7386 __kmp_cleanup_hierarchy();
7387 TCW_4(__kmp_init_middle, FALSE);
7390 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7392 if (__kmp_init_serial) {
7393 __kmp_runtime_destroy();
7394 __kmp_init_serial = FALSE;
7397 __kmp_cleanup_threadprivate_caches();
// Free per-slot root structures; the threads/root arrays themselves were
// allocated as one block (see comment below).
7399 for (f = 0; f < __kmp_threads_capacity; f++) {
7400 if (__kmp_root[f] != NULL) {
7401 __kmp_free(__kmp_root[f]);
7402 __kmp_root[f] = NULL;
7405 __kmp_free(__kmp_threads);
7406 // __kmp_threads and __kmp_root were allocated at once, as single block, so
7407 // there is no need in freeing __kmp_root.
7408 __kmp_threads = NULL;
7410 __kmp_threads_capacity = 0;
7412 #if KMP_USE_DYNAMIC_LOCK
7413 __kmp_cleanup_indirect_user_locks();
7415 __kmp_cleanup_user_locks();
7418 #if KMP_AFFINITY_SUPPORTED
7419 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7420 __kmp_cpuinfo_file = NULL;
7421 #endif /* KMP_AFFINITY_SUPPORTED */
7423 #if KMP_USE_ADAPTIVE_LOCKS
7424 #if KMP_DEBUG_ADAPTIVE_LOCKS
7425 __kmp_print_speculative_stats();
7428 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7429 __kmp_nested_nth.nth = NULL;
7430 __kmp_nested_nth.size = 0;
7431 __kmp_nested_nth.used = 0;
7432 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7433 __kmp_nested_proc_bind.bind_types = NULL;
7434 __kmp_nested_proc_bind.size = 0;
7435 __kmp_nested_proc_bind.used = 0;
7436 if (__kmp_affinity_format) {
7437 KMP_INTERNAL_FREE(__kmp_affinity_format);
7438 __kmp_affinity_format = NULL;
7441 __kmp_i18n_catclose();
7443 #if KMP_USE_HIER_SCHED
7444 __kmp_hier_scheds.deallocate();
7447 #if KMP_STATS_ENABLED
7451 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7454 /* ------------------------------------------------------------------------ */
// Decide whether __kmpc_begin() should be a no-op, controlled by the
// KMP_IGNORE_MPPBEG environment variable.
7456 int __kmp_ignore_mppbeg(void) {
7459 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7460 if (__kmp_str_match_false(env))
7463 // By default __kmpc_begin() is no-op.
// Decide whether __kmpc_end() should be a no-op, controlled by the
// KMP_IGNORE_MPPEND environment variable.
7467 int __kmp_ignore_mppend(void) {
7470 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7471 if (__kmp_str_match_false(env))
7474 // By default __kmpc_end() is no-op.
// One-time per-root begin: registers the calling (uber) thread and marks
// the root as begun. Uses a check / lock / re-check sequence on
// root->r.r_begin so concurrent callers run the body only once.
7478 void __kmp_internal_begin(void) {
7482 /* this is a very important step as it will register new sibling threads
7483 and assign these new uber threads a new gtid */
7484 gtid = __kmp_entry_gtid();
7485 root = __kmp_threads[gtid]->th.th_root;
7486 KMP_ASSERT(KMP_UBER_GTID(gtid));
7488 if (root->r.r_begin) // fast path: already begun, no lock needed
7490 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7491 if (root->r.r_begin) { // re-check under the lock
7492 __kmp_release_lock(&root->r.r_begin_lock, gtid);
7496 root->r.r_begin = TRUE;
7498 __kmp_release_lock(&root->r.r_begin_lock, gtid);
7501 /* ------------------------------------------------------------------------ */
// User-facing entry (kmp_set_library): set the runtime execution mode and
// the corresponding default nproc for this thread's team, then delegate the
// global mode switch to __kmp_aux_set_library. Warns and ignores the call
// when made from inside a parallel region.
7503 void __kmp_user_set_library(enum library_type arg) {
7508 /* first, make sure we are initialized so we can get our gtid */
7510 gtid = __kmp_entry_gtid();
7511 thread = __kmp_threads[gtid];
7513 root = thread->th.th_root;
7515 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7517 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7519 KMP_WARNING(SetLibraryIncorrectCall);
7524 case library_serial:
7525 thread->th.th_set_nproc = 0;
7526 set__nproc(thread, 1); // serial mode: one thread
7528 case library_turnaround:
7529 thread->th.th_set_nproc = 0;
7530 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7531 : __kmp_dflt_team_nth_ub);
7533 case library_throughput:
7534 thread->th.th_set_nproc = 0;
7535 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7536 : __kmp_dflt_team_nth_ub);
7539 KMP_FATAL(UnknownLibraryType, arg);
7542 __kmp_aux_set_library(arg);
// Set the default worker stack size (bytes). Rounds up to a 4K (0x1000)
// boundary, clamps to [__kmp_sys_min_stksize, KMP_MAX_STKSIZE], and only
// takes effect before the first parallel region. Guarded by the
// initialization bootstrap lock.
7545 void __kmp_aux_set_stacksize(size_t arg) {
7546 if (!__kmp_init_serial)
7547 __kmp_serial_initialize();
// Round a non-page-multiple request up to the next 0x1000 boundary.
7550 if (arg & (0x1000 - 1)) {
7551 arg &= ~(0x1000 - 1);
7552 if (arg + 0x1000) /* check for overflow if we round up */
7556 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7558 /* only change the default stacksize before the first parallel region */
7559 if (!TCR_4(__kmp_init_parallel)) {
7560 size_t value = arg; /* argument is in bytes */
7562 if (value < __kmp_sys_min_stksize)
7563 value = __kmp_sys_min_stksize;
7564 else if (value > KMP_MAX_STKSIZE)
7565 value = KMP_MAX_STKSIZE;
7567 __kmp_stksize = value;
7569 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7572 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7575 /* set the behaviour of the runtime library */
7576 /* TODO this can cause some odd behaviour with sibling parallelism... */
// Apply the global library mode: adjusts the yield behavior for
// turnaround mode and the default blocktime (200ms unless already set)
// for throughput mode. Unknown values are fatal.
7577 void __kmp_aux_set_library(enum library_type arg) {
7578 __kmp_library = arg;
7580 switch (__kmp_library) {
7581 case library_serial: {
7582 KMP_INFORM(LibraryIsSerial);
7584 case library_turnaround:
7585 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7586 __kmp_use_yield = 2; // only yield when oversubscribed
7588 case library_throughput:
7589 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7590 __kmp_dflt_blocktime = 200; // ms; leave user-set blocktime alone
7593 KMP_FATAL(UnknownLibraryType, arg);
7597 /* Getting team information common for all team API */
7598 // Returns NULL if not in teams construct
// Shared helper for the team-number/num-teams APIs: walk up from the
// current team toward the level just above the enclosing teams construct,
// tracking how many of the intervening levels are serialized
// (out-param teams_serialized). Returns NULL when not inside a teams
// construct (per the comment above the function).
7599 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7600 kmp_info_t *thr = __kmp_entry_thread();
7601 teams_serialized = 0;
7602 if (thr->th.th_teams_microtask) {
7603 kmp_team_t *team = thr->th.th_team;
7604 int tlevel = thr->th.th_teams_level; // the level of the teams construct
7605 int ii = team->t.t_level;
7606 teams_serialized = team->t.t_serialized;
7607 int level = tlevel + 1;
7608 KMP_DEBUG_ASSERT(ii >= tlevel);
7609 while (ii > level) {
// Consume serialized levels of this team before moving to its parent.
7610 for (teams_serialized = team->t.t_serialized;
7611 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7613 if (team->t.t_serialized && (!teams_serialized)) {
7614 team = team->t.t_parent;
7618 team = team->t.t_parent;
// omp_get_team_num() backend: the team number is the master tid within the
// league; a serialized teams region (serialized > 1) is team 0 of 1.
7627 int __kmp_aux_get_team_num() {
7629 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7631 if (serialized > 1) {
7632 return 0; // teams region is serialized ( 1 team of 1 thread ).
7634 return team->t.t_master_tid;
// omp_get_num_teams() backend: the league size is the parent team's nproc;
// a serialized teams region reports accordingly (branch body elided here).
7640 int __kmp_aux_get_num_teams() {
7642 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7644 if (serialized > 1) {
7647 return team->t.t_parent->t.t_nproc;
7653 /* ------------------------------------------------------------------------ */
7656 * Affinity Format Parser
7658 * Field is in form of: %[[[0].]size]type
7659 * % and type are required (%% means print a literal '%')
7660 * type is either single char or long name surrounded by {},
7661 * e.g., N or {num_threads}
7662 * 0 => leading zeros
7663 * . => right justified when size is specified
7664 * by default output is left justified
7665 * size is the *minimum* field length
7666 * All other characters are printed as is
7668 * Available field types:
7669 * L {thread_level} - omp_get_level()
7670 * n {thread_num} - omp_get_thread_num()
7671 * h {host} - name of host machine
7672 * P {process_id} - process id (integer)
7673 * T {thread_identifier} - native thread identifier (integer)
7674 * N {num_threads} - omp_get_num_threads()
7675 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
7676 * a {thread_affinity} - comma separated list of integers or integer ranges
7677 * (values of affinity mask)
7679 * Implementation-specific field types can be added
7680 * If a type is unknown, print "undefined"
7683 // Structure holding the short name, long name, and corresponding data type
7684 // for snprintf. A table of these will represent the entire valid keyword
// One row of the affinity-format keyword table: maps a short field name
// and its {long_name} spelling to the snprintf conversion used to print it.
7686 typedef struct kmp_affinity_format_field_t {
7687 char short_name; // from spec e.g., L -> thread level
7688 const char *long_name; // from spec thread_level -> thread level
7689 char field_format; // data type for snprintf (typically 'd' or 's'
7690 // for integer or string)
7691 } kmp_affinity_format_field_t;
// Keyword table driving __kmp_aux_capture_affinity_field. 'A' is only
// available when affinity is supported.
// NOTE(review): here 'T' maps to num_teams and 'i' to native_thread_id,
// while the format description earlier in the file lists T as
// {thread_identifier} — verify which matches the intended spec behavior.
7693 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7694 #if KMP_AFFINITY_SUPPORTED
7695 {'A', "thread_affinity", 's'},
7697 {'t', "team_num", 'd'},
7698 {'T', "num_teams", 'd'},
7699 {'L', "nesting_level", 'd'},
7700 {'n', "thread_num", 'd'},
7701 {'N', "num_threads", 'd'},
7702 {'a', "ancestor_tnum", 'd'},
7704 {'P', "process_id", 'd'},
7705 {'i', "native_thread_id", 'd'}};
7707 // Return the number of characters it takes to hold field
// Parse one %-field of an affinity format string starting at *ptr
// (*ptr is advanced past the field) and print its value into field_buffer.
// Handles "%%", the 0/. modifiers, a width of up to 8 digits, and both
// short and {long} field names looked up in __kmp_affinity_format_table.
// Unknown fields print "undefined" per the spec comment below. Returns rc,
// the character count from __kmp_str_buf_print.
7708 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7710 kmp_str_buf_t *field_buffer) {
7711 int rc, format_index, field_value;
7712 const char *width_left, *width_right;
7713 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7714 static const int FORMAT_SIZE = 20;
7715 char format[FORMAT_SIZE] = {0};
7716 char absolute_short_name = 0;
7718 KMP_DEBUG_ASSERT(gtid >= 0);
7719 KMP_DEBUG_ASSERT(th);
7720 KMP_DEBUG_ASSERT(**ptr == '%');
7721 KMP_DEBUG_ASSERT(field_buffer);
7723 __kmp_str_buf_clear(field_buffer);
7725 // Skip the initial %
7728 // Check for %% first
7730 __kmp_str_buf_cat(field_buffer, "%", 1);
7731 (*ptr)++; // skip over the second %
7735 // Parse field modifiers if they are present
7739 (*ptr)++; // skip over 0
7741 right_justify = false;
7743 right_justify = true;
7744 (*ptr)++; // skip over .
7746 // Parse width of field: [width_left, width_right)
7747 width_left = width_right = NULL;
7748 if (**ptr >= '0' && **ptr <= '9') {
7754 // Create the format for KMP_SNPRINTF based on flags parsed above
7756 format[format_index++] = '%';
7758 format[format_index++] = '-';
7760 format[format_index++] = '0';
7761 if (width_left && width_right) {
7763 // Only allow 8 digit number widths.
7764 // This also prevents overflowing format variable
7765 while (i < 8 && width_left < width_right) {
7766 format[format_index++] = *width_left;
7772 // Parse a name (long or short)
7773 // Canonicalize the name into absolute_short_name
7774 found_valid_name = false;
7775 parse_long_name = (**ptr == '{');
7776 if (parse_long_name)
7777 (*ptr)++; // skip initial left brace
7778 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7779 sizeof(__kmp_affinity_format_table[0]);
7781 char short_name = __kmp_affinity_format_table[i].short_name;
7782 const char *long_name = __kmp_affinity_format_table[i].long_name;
7783 char field_format = __kmp_affinity_format_table[i].field_format;
7784 if (parse_long_name) {
7785 int length = KMP_STRLEN(long_name);
7786 if (strncmp(*ptr, long_name, length) == 0) {
7787 found_valid_name = true;
7788 (*ptr) += length; // skip the long name
7790 } else if (**ptr == short_name) {
7791 found_valid_name = true;
7792 (*ptr)++; // skip the short name
7794 if (found_valid_name) {
7795 format[format_index++] = field_format;
7796 format[format_index++] = '\0';
7797 absolute_short_name = short_name;
7801 if (parse_long_name) {
7803 absolute_short_name = 0;
7805 (*ptr)++; // skip over the right brace
7809 // Attempt to fill the buffer with the requested
7810 // value using snprintf within __kmp_str_buf_print()
7811 switch (absolute_short_name) {
7813 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7816 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7819 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7822 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7825 static const int BUFFER_SIZE = 256;
7826 char buf[BUFFER_SIZE];
7827 __kmp_expand_host_name(buf, BUFFER_SIZE);
7828 rc = __kmp_str_buf_print(field_buffer, format, buf);
7831 rc = __kmp_str_buf_print(field_buffer, format, getpid());
7834 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7837 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7841 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7842 rc = __kmp_str_buf_print(field_buffer, format, field_value);
7844 #if KMP_AFFINITY_SUPPORTED
// Affinity mask is rendered into a temporary string buffer first.
7847 __kmp_str_buf_init(&buf);
7848 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7849 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7850 __kmp_str_buf_free(&buf);
7854 // According to spec, If an implementation does not have info for field
7855 // type, then "undefined" is printed
7856 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7858 if (parse_long_name) {
7867 KMP_ASSERT(format_index <= FORMAT_SIZE);
7872 * Return number of characters needed to hold the affinity string
7873 * (not including null byte character)
7874 * The resultant string is printed to buffer, which the caller can then
// Render an affinity format string for thread gtid into `buffer`:
// '%' starts a field handled by __kmp_aux_capture_affinity_field, every
// other character is copied literally. A NULL/empty format falls back to
// the affinity-format-var ICV (__kmp_affinity_format). Returns the length
// needed (see the comment block above the function).
7877 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7878 kmp_str_buf_t *buffer) {
7879 const char *parse_ptr;
7881 const kmp_info_t *th;
7882 kmp_str_buf_t field;
7884 KMP_DEBUG_ASSERT(buffer);
7885 KMP_DEBUG_ASSERT(gtid >= 0);
7887 __kmp_str_buf_init(&field);
7888 __kmp_str_buf_clear(buffer);
7890 th = __kmp_threads[gtid];
7893 // If format is NULL or zero-length string, then we use
7894 // affinity-format-var ICV
7896 if (parse_ptr == NULL || *parse_ptr == '\0') {
7897 parse_ptr = __kmp_affinity_format;
7899 KMP_DEBUG_ASSERT(parse_ptr);
7901 while (*parse_ptr != '\0') {
7903 if (*parse_ptr == '%') {
7904 // Put field in the buffer
7905 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7906 __kmp_str_buf_catbuf(buffer, &field);
7909 // Put literal character in buffer
7910 __kmp_str_buf_cat(buffer, parse_ptr, 1);
7915 __kmp_str_buf_free(&field);
7919 // Displays the affinity string to stdout
// Capture the affinity string for gtid and write it to kmp_out followed
// by an end-of-line; the temporary string buffer is freed before return.
7920 void __kmp_aux_display_affinity(int gtid, const char *format) {
7922 __kmp_str_buf_init(&buf);
7923 __kmp_aux_capture_affinity(gtid, format, &buf);
7924 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7925 __kmp_str_buf_free(&buf);
7928 /* ------------------------------------------------------------------------ */
// kmp_set_blocktime backend: clamp the millisecond argument to
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] and apply it (plus the derived
// monitor-wakeup interval count and the "explicitly set" flag) to both the
// thread's current team slot and its serial team.
7930 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7931 int blocktime = arg; /* argument is in milliseconds */
7937 __kmp_save_internal_controls(thread);
7939 /* Normalize and set blocktime for the teams */
7940 if (blocktime < KMP_MIN_BLOCKTIME)
7941 blocktime = KMP_MIN_BLOCKTIME;
7942 else if (blocktime > KMP_MAX_BLOCKTIME)
7943 blocktime = KMP_MAX_BLOCKTIME;
7945 set__blocktime_team(thread->th.th_team, tid, blocktime);
7946 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
7949 /* Calculate and set blocktime intervals for the teams */
7950 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7952 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
7953 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
7956 /* Set whether blocktime has been set to "TRUE" */
7959 set__bt_set_team(thread->th.th_team, tid, bt_set);
7960 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
7962 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7963 "bt_intervals=%d, monitor_updates=%d\n",
7964 __kmp_gtid_from_tid(tid, thread->th.th_team),
7965 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7966 __kmp_monitor_wakeups));
7968 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7969 __kmp_gtid_from_tid(tid, thread->th.th_team),
7970 thread->th.th_team->t.t_id, tid, blocktime));
// kmp_set_defaults backend: ensure serial initialization, then re-run
// environment processing on the provided settings string (len currently
// unused in the visible body).
7974 void __kmp_aux_set_defaults(char const *str, int len) {
7975 if (!__kmp_init_serial) {
7976 __kmp_serial_initialize();
7978 __kmp_env_initialize(str);
7980 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
7983 } // __kmp_aux_set_defaults
7985 /* ------------------------------------------------------------------------ */
7986 /* internal fast reduction routines */
7988 PACKED_REDUCTION_METHOD_T
7989 __kmp_determine_reduction_method(
7990 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
7991 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7992 kmp_critical_name *lck) {
7994 // Default reduction method: critical construct ( lck != NULL, like in current
7996 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
7997 // can be selected by RTL
7998 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
7999 // can be selected by RTL
8000 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8001 // among generated by PAROPT.
8003 PACKED_REDUCTION_METHOD_T retval;
8007 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8008 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8010 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8011 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8012 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8014 retval = critical_reduce_block;
8016 // another choice of getting a team size (with 1 dynamic deference) is slower
8017 team_size = __kmp_get_team_num_threads(global_tid);
8018 if (team_size == 1) {
8020 retval = empty_reduce_block;
8024 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8026 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8028 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8029 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8031 int teamsize_cutoff = 4;
8033 #if KMP_MIC_SUPPORTED
8034 if (__kmp_mic_type != non_mic) {
8035 teamsize_cutoff = 8;
8038 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8039 if (tree_available) {
8040 if (team_size <= teamsize_cutoff) {
8041 if (atomic_available) {
8042 retval = atomic_reduce_block;
8045 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8047 } else if (atomic_available) {
8048 retval = atomic_reduce_block;
8051 #error "Unknown or unsupported OS"
8052 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8053 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8055 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8057 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8061 if (atomic_available) {
8062 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8063 retval = atomic_reduce_block;
8065 } // otherwise: use critical section
8069 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8070 if (atomic_available && (num_vars <= 3)) {
8071 retval = atomic_reduce_block;
8072 } else if (tree_available) {
8073 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8074 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8075 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8077 } // otherwise: use critical section
8080 #error "Unknown or unsupported OS"
8084 #error "Unknown or unsupported architecture"
8088 // KMP_FORCE_REDUCTION
8090 // If the team is serialized (team_size == 1), ignore the forced reduction
8091 // method and stay with the unsynchronized method (empty_reduce_block)
8092 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8095 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8097 int atomic_available, tree_available;
8099 switch ((forced_retval = __kmp_force_reduction_method)) {
8100 case critical_reduce_block:
8101 KMP_ASSERT(lck); // lck should be != 0
8104 case atomic_reduce_block:
8105 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8106 if (!atomic_available) {
8107 KMP_WARNING(RedMethodNotSupported, "atomic");
8108 forced_retval = critical_reduce_block;
8112 case tree_reduce_block:
8113 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8114 if (!tree_available) {
8115 KMP_WARNING(RedMethodNotSupported, "tree");
8116 forced_retval = critical_reduce_block;
8118 #if KMP_FAST_REDUCTION_BARRIER
8119 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8125 KMP_ASSERT(0); // "unsupported method specified"
8128 retval = forced_retval;
8131 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8133 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8134 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  // Expose the reduction method recorded for the calling thread (set by
  // __kmp_determine_reduction_method). The value is shifted right by 8 to
  // drop the low byte, which packs auxiliary data alongside the method id
  // (presumably the barrier type -- confirm against PACKED_REDUCTION_METHOD_T
  // packing in the header).
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8144 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8145 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8146 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  // Record the hard-paused state before tearing anything down so other code
  // observing __kmp_pause_status sees a consistent value.
  __kmp_pause_status = kmp_hard_paused;
  // Shut down runtime state for this thread; -1 appears to mean "gtid not
  // supplied by the caller" -- confirm against __kmp_internal_end_thread.
  __kmp_internal_end_thread(-1);
// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    // Clear the paused state, then wake every registered worker thread.
    __kmp_pause_status = kmp_not_paused;
    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { // skip gtid 0
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        // Flag wrapping this thread's fork/join-barrier "go" word; used both
        // to test for sleep and to resume the thread.
        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
        if (fl.is_sleeping())
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
8184 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8185 // TODO: add warning messages
8186 int __kmp_pause_resource(kmp_pause_status_t level) {
8187 if (level == kmp_not_paused) { // requesting resume
8188 if (__kmp_pause_status == kmp_not_paused) {
8189 // error message about runtime not being paused, so can't resume
8192 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8193 __kmp_pause_status == kmp_hard_paused);
8194 __kmp_pause_status = kmp_not_paused;
8197 } else if (level == kmp_soft_paused) { // requesting soft pause
8198 if (__kmp_pause_status != kmp_not_paused) {
8199 // error message about already being paused
8205 } else if (level == kmp_hard_paused) { // requesting hard pause
8206 if (__kmp_pause_status != kmp_not_paused) {
8207 // error message about already being paused
8214 // error message about invalid level