2 * kmp_runtime.cpp -- KPTS runtime support library
5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
32 #include "ompt-specific.h"
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
42 #include "tsan_annotations.h"
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46 KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
49 char const __kmp_version_omp_api[] =
50 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
53 char const __kmp_version_lock[] =
54 KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
59 /* ------------------------------------------------------------------------ */
62 kmp_info_t __kmp_monitor;
65 /* Forward declarations */
67 void __kmp_cleanup(void);
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72 kmp_internal_control_t *new_icvs,
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76 int update_master_only = 0);
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82 kmp_internal_control_t *new_icvs, ident_t *loc);
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
88 static int __kmp_expand_threads(int nNeed);
90 static int __kmp_unregister_root_other_thread(int gtid);
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
96 /* Calculate the identifier of the current thread */
97 /* fast (and somewhat portable) way to get unique identifier of executing
98 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99 int __kmp_get_global_thread_id() {
101 kmp_info_t **other_threads;
109 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
110 __kmp_nth, __kmp_all_nth));
112 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
113 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
114 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
115 __kmp_init_gtid for this to work. */
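  /* Three lookup strategies, fastest first: with __kmp_gtid_mode >= 3 the gtid
     is read from native thread-local data (KMP_TDATA_GTID); with >= 2 it comes
     from keyed TLS via __kmp_gtid_get_specific(); otherwise the internal
     algorithm below matches the current stack address against the registered
     threads' stack extents. */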
117 if (!TCR_4(__kmp_init_gtid))
120 #ifdef KMP_TDATA_GTID
121 if (TCR_4(__kmp_gtid_mode) >= 3) {
122 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
126 if (TCR_4(__kmp_gtid_mode) >= 2) {
127 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128 return __kmp_gtid_get_specific();
130 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
132 stack_addr = (char *)&stack_data;
133 other_threads = __kmp_threads;
135 /* ATT: The code below is a source of potential bugs due to unsynchronized
136 access to __kmp_threads array. For example:
137 1. Current thread loads other_threads[i] to thr and checks it, it is
139 2. Current thread is suspended by OS.
140 3. Another thread unregisters and finishes (debug versions of free()
141 may fill memory with something like 0xEF).
142 4. Current thread is resumed.
143 5. Current thread reads junk from *thr.
144 TODO: Fix it. --ln */
146 for (i = 0; i < __kmp_threads_capacity; i++) {
148 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
152 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
155 /* stack grows down -- search through all of the active threads */
157 if (stack_addr <= stack_base) {
158 size_t stack_diff = stack_base - stack_addr;
160 if (stack_diff <= stack_size) {
161 /* The only way we can be closer than the allocated */
162 /* stack size is if we are running on this thread. */
163 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
169 /* get specific to try and determine our gtid */
171 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172 "thread, using TLS\n"));
173 i = __kmp_gtid_get_specific();
175 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
177 /* if we haven't been assigned a gtid, then return code */
181 /* dynamically updated stack window for uber threads to avoid get_specific call */
183 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184 KMP_FATAL(StackOverflow, i);
187 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188 if (stack_addr > stack_base) {
189 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
194 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195 stack_base - stack_addr);
198 /* Reprint stack bounds for ubermaster since they have been refined */
199 if (__kmp_storage_map) {
200 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203 other_threads[i]->th.th_info.ds.ds_stacksize,
204 "th_%d stack (refinement)", i);
209 int __kmp_get_global_thread_id_reg() {
212 if (!__kmp_init_serial) {
215 #ifdef KMP_TDATA_GTID
216 if (TCR_4(__kmp_gtid_mode) >= 3) {
217 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
221 if (TCR_4(__kmp_gtid_mode) >= 2) {
222 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223 gtid = __kmp_gtid_get_specific();
226 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227 gtid = __kmp_get_global_thread_id();
230 /* we must be a new uber master sibling thread */
231 if (gtid == KMP_GTID_DNE) {
233 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234 "Registering a new gtid.\n"));
235 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236 if (!__kmp_init_serial) {
237 __kmp_do_serial_initialize();
238 gtid = __kmp_gtid_get_specific();
240 gtid = __kmp_register_root(FALSE);
242 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
246 KMP_DEBUG_ASSERT(gtid >= 0);
251 /* caller must hold forkjoin_lock */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
254 char *stack_beg = NULL;
255 char *stack_end = NULL;
258 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259 if (__kmp_storage_map) {
260 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
263 gtid = __kmp_gtid_from_thread(th);
265 if (gtid == KMP_GTID_MONITOR) {
266 __kmp_print_storage_map_gtid(
267 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268 "th_%s stack (%s)", "mon",
269 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
271 __kmp_print_storage_map_gtid(
272 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273 "th_%d stack (%s)", gtid,
274 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278 /* No point in checking ubermaster threads since they use refinement and cannot overlap */
280 gtid = __kmp_gtid_from_thread(th);
281 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
283 ("__kmp_check_stack_overlap: performing extensive checking\n"));
284 if (stack_beg == NULL) {
285 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
289 for (f = 0; f < __kmp_threads_capacity; f++) {
290 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
292 if (f_th && f_th != th) {
293 char *other_stack_end =
294 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295 char *other_stack_beg =
296 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
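        /* Two stacks overlap if either endpoint of this thread's stack falls
           strictly inside the other thread's [beg, end) range. */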
297 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
300 /* Print the other stack values before the abort */
301 if (__kmp_storage_map)
302 __kmp_print_storage_map_gtid(
303 -1, other_stack_beg, other_stack_end,
304 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
307 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
313 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
316 /* ------------------------------------------------------------------------ */
318 void __kmp_infinite_loop(void) {
319 static int done = FALSE;
326 #define MAX_MESSAGE 512
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329 char const *format, ...) {
330 char buffer[MAX_MESSAGE];
333 va_start(ap, format);
334 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335 p2, (unsigned long)size, format);
336 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337 __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
341 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342 if (__kmp_storage_map_verbose) {
343 node = __kmp_get_host_node(p1);
344 if (node < 0) /* doesn't work, so don't try this next time */
345 __kmp_storage_map_verbose = FALSE;
349 int localProc = __kmp_get_cpu_from_gtid(gtid);
351 const int page_size = KMP_GET_PAGE_SIZE();
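      /* Round both pointers down to page boundaries (p2 via its last byte) so
         the placement scan below walks whole pages. */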
353 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
356 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
359 __kmp_printf_no_lock(" GTID %d\n", gtid);
361 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
366 /* This loop collates adjacent pages with the same host node. */
368 (char *)p1 += page_size;
369 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
374 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
375 (char *)p1 + (page_size - 1),
376 __kmp_get_host_node(p1));
378 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
379 (char *)p2 + (page_size - 1),
380 __kmp_get_host_node(p2));
386 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
392 void __kmp_warn(char const *format, ...) {
393 char buffer[MAX_MESSAGE];
396 if (__kmp_generate_warnings == kmp_warnings_off) {
400 va_start(ap, format);
402 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404 __kmp_vprintf(kmp_err, buffer, ap);
405 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
410 void __kmp_abort_process() {
411 // Later threads may stall here, but that's ok because abort() will kill them.
412 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
414 if (__kmp_debug_buf) {
415 __kmp_dump_debug_buffer();
418 if (KMP_OS_WINDOWS) {
419 // Let other threads know of abnormal termination and prevent deadlock
420 // if abort happened during library initialization or shutdown
421 __kmp_global.g.g_abort = SIGABRT;
423 /* On Windows* OS by default abort() causes pop-up error box, which stalls
424 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425 boxes. _set_abort_behavior() works well, but this function is not
426 available in VS7 (this is not a problem for a DLL, but it is a problem for a
427 static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
428 help, at least in some versions of MS C RTL.
430 It seems the following sequence is the only way to simulate abort() and
431 avoid the pop-up error box. */
433 _exit(3); // Just in case, if signal ignored, exit anyway.
438 __kmp_infinite_loop();
439 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
441 } // __kmp_abort_process
443 void __kmp_abort_thread(void) {
444 // TODO: Eliminate g_abort global variable and this function.
445 // In case of abort just call abort(); it will kill all the threads.
446 __kmp_infinite_loop();
447 } // __kmp_abort_thread
449 /* Print out the storage map for the major kmp_info_t thread data structures
450 that are allocated together. */
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
456 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
459 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460 sizeof(kmp_local_t), "th_%d.th_local", gtid);
462 __kmp_print_storage_map_gtid(
463 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
466 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467 &thr->th.th_bar[bs_plain_barrier + 1],
468 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
471 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472 &thr->th.th_bar[bs_forkjoin_barrier + 1],
473 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
476 #if KMP_FAST_REDUCTION_BARRIER
477 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478 &thr->th.th_bar[bs_reduction_barrier + 1],
479 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
481 #endif // KMP_FAST_REDUCTION_BARRIER
484 /* Print out the storage map for the major kmp_team_t team data structures
485 that are allocated together. */
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488 int team_id, int num_thr) {
489 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
493 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494 &team->t.t_bar[bs_last_barrier],
495 sizeof(kmp_balign_team_t) * bs_last_barrier,
496 "%s_%d.t_bar", header, team_id);
498 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499 &team->t.t_bar[bs_plain_barrier + 1],
500 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
503 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504 &team->t.t_bar[bs_forkjoin_barrier + 1],
505 sizeof(kmp_balign_team_t),
506 "%s_%d.t_bar[forkjoin]", header, team_id);
508 #if KMP_FAST_REDUCTION_BARRIER
509 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510 &team->t.t_bar[bs_reduction_barrier + 1],
511 sizeof(kmp_balign_team_t),
512 "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
515 __kmp_print_storage_map_gtid(
516 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
519 __kmp_print_storage_map_gtid(
520 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
523 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524 &team->t.t_disp_buffer[num_disp_buff],
525 sizeof(dispatch_shared_info_t) * num_disp_buff,
526 "%s_%d.t_disp_buffer", header, team_id);
529 static void __kmp_init_allocator() { __kmp_init_memkind(); }
530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
532 /* ------------------------------------------------------------------------ */
537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538 // TODO: Change to __kmp_break_bootstrap_lock().
539 __kmp_init_bootstrap_lock(lck); // make the lock released
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
546 // PROCESS_DETACH is expected to be called by a thread that executes
547 // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
548 // calling ProcessExit or FreeLibrary). So, it might be safe to access the
549 // __kmp_threads[] without taking the forkjoin_lock. However, some threads
550 // can still be alive here, although they are about to be terminated. The
551 // threads in the array with ds_thread==0 are the most suspicious. Actually,
552 // it may not be safe to access __kmp_threads[].
554 // TODO: does it make sense to check __kmp_roots[] ?
556 // Let's check that there are no other live threads registered with the OMP lib.
560 for (i = 0; i < __kmp_threads_capacity; ++i) {
563 kmp_info_t *th = __kmp_threads[i];
566 int gtid = th->th.th_info.ds.ds_gtid;
567 if (gtid == gtid_req)
572 int alive = __kmp_is_thread_alive(th, &exit_val);
577 if (thread_count == 0)
581 // Assume that I'm alone. Now it might be safe to check and reset locks.
582 // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583 __kmp_reset_lock(&__kmp_forkjoin_lock);
585 __kmp_reset_lock(&__kmp_stdio_lock);
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
594 case DLL_PROCESS_ATTACH:
595 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
599 case DLL_PROCESS_DETACH:
600 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
602 if (lpReserved != NULL) {
603 // lpReserved is used for telling the difference:
604 // lpReserved == NULL when FreeLibrary() was called,
605 // lpReserved != NULL when the process terminates.
606 // When FreeLibrary() is called, worker threads remain alive. So they will
607 // release the forkjoin lock by themselves. When the process terminates,
608 // worker threads disappear, triggering the problem of an unreleased forkjoin
609 // lock as described below.
611 // A worker thread can take the forkjoin lock. The problem comes up if
612 // that worker thread becomes dead before it releases the forkjoin lock.
613 // The forkjoin lock remains taken, while the thread executing
614 // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615 // to take the forkjoin lock and will always fail, so that the application
616 // will never finish [normally]. This scenario is possible if
617 // __kmpc_end() has not been executed. It looks like this is not a corner
618 // case; common causes are:
619 // - the main function was compiled by an alternative compiler;
620 // - the main function was compiled by icl but without /Qopenmp
621 // (application with plugins);
622 // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP;
624 // - a live foreign thread prevented __kmpc_end from doing cleanup.
626 // This is a hack to work around the problem.
627 // TODO: !!! figure out something better.
628 __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
631 __kmp_internal_end_library(__kmp_gtid_get_specific());
635 case DLL_THREAD_ATTACH:
636 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
638 /* if we want to register new siblings all the time here call
639 * __kmp_get_gtid(); */
642 case DLL_THREAD_DETACH:
643 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
645 __kmp_internal_end_thread(__kmp_gtid_get_specific());
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
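/* Each thread spins until t_ordered.dt.t_value equals its own tid; the
   previous thread releases it from __kmp_parallel_dxo by storing
   (tid + 1) % t_nproc. */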
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657 int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659 kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
662 if (__kmp_env_consistency_check) {
663 if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
667 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
670 #ifdef BUILD_PARALLEL_ORDERED
671 if (!team->t.t_serialized) {
673 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
677 #endif /* BUILD_PARALLEL_ORDERED */
680 /* __kmp_parallel_dxo -- Signal the next task. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682 int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684 int tid = __kmp_tid_from_gtid(gtid);
685 kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
688 if (__kmp_env_consistency_check) {
689 if (__kmp_threads[gtid]->th.th_root->r.r_active)
690 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
692 #ifdef BUILD_PARALLEL_ORDERED
693 if (!team->t.t_serialized) {
694 KMP_MB(); /* Flush all pending memory write invalidates. */
696 /* use the tid of the next thread in this team */
697 /* TODO replace with general release procedure */
698 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
700 KMP_MB(); /* Flush all pending memory write invalidates. */
702 #endif /* BUILD_PARALLEL_ORDERED */
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit */
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
713 if (!TCR_4(__kmp_init_parallel))
714 __kmp_parallel_initialize();
715 __kmp_resume_if_soft_paused();
717 th = __kmp_threads[gtid];
718 team = th->th.th_team;
721 th->th.th_ident = id_ref;
723 if (team->t.t_serialized) {
726 kmp_int32 old_this = th->th.th_local.this_construct;
728 ++th->th.th_local.this_construct;
729 /* try to set team count to thread count--success means thread got the single block */
731 /* TODO: Should this be acquire or release? */
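    /* Only one thread can win the compare-and-store on t_construct and see a
       nonzero status; that thread executes the single region. */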
732 if (team->t.t_construct == old_this) {
733 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734 th->th.th_local.this_construct);
737 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739 team->t.t_active_level ==
740 1) { // Only report metadata by master of active team at level 1
741 __kmp_itt_metadata_single(id_ref);
743 #endif /* USE_ITT_BUILD */
746 if (__kmp_env_consistency_check) {
747 if (status && push_ws) {
748 __kmp_push_workshare(gtid, ct_psingle, id_ref);
750 __kmp_check_workshare(gtid, ct_psingle, id_ref);
755 __kmp_itt_single_start(gtid);
757 #endif /* USE_ITT_BUILD */
761 void __kmp_exit_single(int gtid) {
763 __kmp_itt_single_end(gtid);
764 #endif /* USE_ITT_BUILD */
765 if (__kmp_env_consistency_check)
766 __kmp_pop_workshare(gtid, ct_psingle, NULL);
769 /* determine if we can go parallel or must use a serialized parallel region and
770 * how many threads we can use
771 * set_nproc is the number of threads requested for the team
772 * returns 0 if we should serialize or only use one thread,
773 * otherwise the number of threads to use
774 * The forkjoin lock is held by the caller. */
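/* The requested count is trimmed in stages: first by the dynamic adjustment
   mode (load balance, thread limit, or random), then by
   KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT, then by the thread group's
   OMP_THREAD_LIMIT, and finally by how far the __kmp_threads array can be
   expanded. */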
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776 int master_tid, int set_nthreads,
780 KMP_DEBUG_ASSERT(__kmp_init_serial);
781 KMP_DEBUG_ASSERT(root && parent_team);
782 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
784 // If dyn-var is set, dynamically adjust the number of desired threads,
785 // according to the method specified by dynamic_mode.
786 new_nthreads = set_nthreads;
787 if (!get__dynamic_2(parent_team, master_tid)) {
790 #ifdef USE_LOAD_BALANCE
791 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793 if (new_nthreads == 1) {
794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795 "reservation to 1 thread\n",
799 if (new_nthreads < set_nthreads) {
800 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801 "reservation to %d threads\n",
802 master_tid, new_nthreads));
805 #endif /* USE_LOAD_BALANCE */
806 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807 new_nthreads = __kmp_avail_proc - __kmp_nth +
808 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809 if (new_nthreads <= 1) {
810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811 "reservation to 1 thread\n",
815 if (new_nthreads < set_nthreads) {
816 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817 "reservation to %d threads\n",
818 master_tid, new_nthreads));
820 new_nthreads = set_nthreads;
822 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823 if (set_nthreads > 2) {
824 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825 new_nthreads = (new_nthreads % set_nthreads) + 1;
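      // Map the random value into [1, set_nthreads] so at least the master
      // runs and the request is never exceeded.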
826 if (new_nthreads == 1) {
827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828 "reservation to 1 thread\n",
832 if (new_nthreads < set_nthreads) {
833 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834 "reservation to %d threads\n",
835 master_tid, new_nthreads));
842 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843 if (__kmp_nth + new_nthreads -
844 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
846 int tl_nthreads = __kmp_max_nth - __kmp_nth +
847 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848 if (tl_nthreads <= 0) {
852 // If dyn-var is false, emit a 1-time warning.
853 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854 __kmp_reserve_warn = 1;
855 __kmp_msg(kmp_ms_warning,
856 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
859 if (tl_nthreads == 1) {
860 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861 "reduced reservation to 1 thread\n",
865 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866 "reservation to %d threads\n",
867 master_tid, tl_nthreads));
868 new_nthreads = tl_nthreads;
871 // Respect OMP_THREAD_LIMIT
872 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874 if (cg_nthreads + new_nthreads -
875 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
877 int tl_nthreads = max_cg_threads - cg_nthreads +
878 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879 if (tl_nthreads <= 0) {
883 // If dyn-var is false, emit a 1-time warning.
884 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885 __kmp_reserve_warn = 1;
886 __kmp_msg(kmp_ms_warning,
887 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
890 if (tl_nthreads == 1) {
891 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892 "reduced reservation to 1 thread\n",
896 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897 "reservation to %d threads\n",
898 master_tid, tl_nthreads));
899 new_nthreads = tl_nthreads;
902 // Check if the threads array is large enough, or needs expanding.
903 // See comment in __kmp_register_root() about the adjustment if
904 // __kmp_threads[0] == NULL.
905 capacity = __kmp_threads_capacity;
906 if (TCR_PTR(__kmp_threads[0]) == NULL) {
909 if (__kmp_nth + new_nthreads -
910 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
912 // Expand the threads array.
913 int slotsRequired = __kmp_nth + new_nthreads -
914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
916 int slotsAdded = __kmp_expand_threads(slotsRequired);
917 if (slotsAdded < slotsRequired) {
918 // The threads array was not expanded enough.
919 new_nthreads -= (slotsRequired - slotsAdded);
920 KMP_ASSERT(new_nthreads >= 1);
922 // If dyn-var is false, emit a 1-time warning.
923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924 __kmp_reserve_warn = 1;
925 if (__kmp_tp_cached) {
926 __kmp_msg(kmp_ms_warning,
927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
931 __kmp_msg(kmp_ms_warning,
932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
940 if (new_nthreads == 1) {
942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943 "dead roots and rechecking; requested %d threads\n",
944 __kmp_get_gtid(), set_nthreads));
946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
948 __kmp_get_gtid(), new_nthreads, set_nthreads));
954 /* Allocate threads from the thread pool and assign them to the new team. We are
955 assured that there are enough threads available, because we checked on that
956    earlier while holding the forkjoin lock. */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958 kmp_info_t *master_th, int master_gtid) {
962 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
966 /* first, let's setup the master thread */
967 master_th->th.th_info.ds.ds_tid = 0;
968 master_th->th.th_team = team;
969 master_th->th.th_team_nproc = team->t.t_nproc;
970 master_th->th.th_team_master = master_th;
971 master_th->th.th_team_serialized = FALSE;
972 master_th->th.th_dispatch = &team->t.t_dispatch[0];
974 /* make sure we are not the optimized hot team */
975 #if KMP_NESTED_HOT_TEAMS
977 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978 if (hot_teams) { // hot teams array is not allocated if
979 // KMP_HOT_TEAMS_MAX_LEVEL=0
980 int level = team->t.t_active_level - 1; // index in array of hot teams
981 if (master_th->th.th_teams_microtask) { // are we inside the teams?
982 if (master_th->th.th_teams_size.nteams > 1) {
983 ++level; // level was not increased in teams construct for team_of_masters
986 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987 master_th->th.th_teams_level == team->t.t_level) {
988 ++level; // level was not increased in teams construct for
989 // team_of_workers before the parallel
990 } // team->t.t_level will be increased inside parallel
992 if (level < __kmp_hot_teams_max_level) {
993 if (hot_teams[level].hot_team) {
994 // hot team has already been allocated for given level
995 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996 use_hot_team = 1; // the team is ready to use
998 use_hot_team = 0; // AC: threads are not allocated yet
999 hot_teams[level].hot_team = team; // remember new hot team
1000 hot_teams[level].hot_team_nth = team->t.t_nproc;
1007 use_hot_team = team == root->r.r_hot_team;
1009 if (!use_hot_team) {
1011 /* install the master thread */
1012 team->t.t_threads[0] = master_th;
1013 __kmp_initialize_info(master_th, team, 0, master_gtid);
1015 /* now, install the worker threads */
1016 for (i = 1; i < team->t.t_nproc; i++) {
1018 /* fork or reallocate a new thread and install it in team */
1019 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020 team->t.t_threads[i] = thr;
1021 KMP_DEBUG_ASSERT(thr);
1022 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023 /* align team and thread arrived states */
1024 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029 team->t.t_bar[bs_plain_barrier].b_arrived));
1030 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031 thr->th.th_teams_level = master_th->th.th_teams_level;
1032 thr->th.th_teams_size = master_th->th.th_teams_size;
1033 { // Initialize threads' barrier data.
1035 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036 for (b = 0; b < bs_last_barrier; ++b) {
1037 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1040 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1046 #if KMP_AFFINITY_SUPPORTED
1047 __kmp_partition_places(team);
1051 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052 for (i = 0; i < team->t.t_nproc; i++) {
1053 kmp_info_t *thr = team->t.t_threads[i];
1054 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055 thr->th.th_prev_level != team->t.t_level) {
1056 team->t.t_display_affinity = 1;
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066 // Propagate any changes to the floating point control registers out to the team
1067 // We try to avoid unnecessary writes to the relevant cache line in the team
1068 // structure, so we don't make changes unless they are needed.
1069 inline static void propagateFPControl(kmp_team_t *team) {
1070 if (__kmp_inherit_fp_control) {
1071 kmp_int16 x87_fpu_control_word;
1074 // Get master values of FPU control flags (both X87 and vector)
1075 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076 __kmp_store_mxcsr(&mxcsr);
1077 mxcsr &= KMP_X86_MXCSR_MASK;
1079 // There is no point looking at t_fp_control_saved here.
1080 // If it is TRUE, we still have to update the values if they are different
1081 // from those we now have. If it is FALSE we didn't save anything yet, but
1082 // our objective is the same. We have to ensure that the values in the team
1083 // are the same as those we have.
1084 // So, this code achieves what we need whether or not t_fp_control_saved is
1085 // true. By checking whether the value needs updating we avoid unnecessary
1086 // writes that would put the cache-line into a written state, causing all
1087 // threads in the team to have to read it again.
1088 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090 // Although we don't use this value, other code in the runtime wants to know
1091 // whether it should restore them. So we must ensure it is correct.
1092 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1094 // Similarly here. Don't write to this cache-line in the team structure
1095 // unless we have to.
1096 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1100 // Do the opposite, setting the hardware registers to the updated values from the team.
1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104 // Only reset the fp control regs if they have been changed in the team
1105 // during the parallel region that we are exiting.
1106 kmp_int16 x87_fpu_control_word;
1108 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109 __kmp_store_mxcsr(&mxcsr);
1110 mxcsr &= KMP_X86_MXCSR_MASK;
1112 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113 __kmp_clear_x87_fpu_status_word();
1114 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1117 if (team->t.t_mxcsr != mxcsr) {
1118 __kmp_load_mxcsr(&team->t.t_mxcsr);
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128 int realloc); // forward declaration
1130 /* Run a parallel region that has been serialized, so runs only in a team of the
1131 single master thread. */
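/* The thread's cached th_serial_team is reused where possible; if that team is
   itself already serialized a fresh one is allocated, and deeper nested
   serialized regions simply bump t_serialized and push another dispatch
   buffer. */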
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133 kmp_info_t *this_thr;
1134 kmp_team_t *serial_team;
1136 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1138 /* Skip all this code for autopar serialized loops since it results in
1139 unacceptable overhead */
1140 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1143 if (!TCR_4(__kmp_init_parallel))
1144 __kmp_parallel_initialize();
1145 __kmp_resume_if_soft_paused();
1147 this_thr = __kmp_threads[global_tid];
1148 serial_team = this_thr->th.th_serial_team;
1150 /* utilize the serialized team held by this thread */
1151 KMP_DEBUG_ASSERT(serial_team);
1154 if (__kmp_tasking_mode != tskm_immediate_exec) {
1156 this_thr->th.th_task_team ==
1157 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1160 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161 "team %p, new task_team = NULL\n",
1162 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163 this_thr->th.th_task_team = NULL;
1166 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168 proc_bind = proc_bind_false;
1169 } else if (proc_bind == proc_bind_default) {
1170 // No proc_bind clause was specified, so use the current value
1171 // of proc-bind-var for this parallel region.
1172 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1174 // Reset for next parallel region
1175 this_thr->th.th_set_proc_bind = proc_bind_default;
1178 ompt_data_t ompt_parallel_data = ompt_data_none;
1179 ompt_data_t *implicit_task_data;
1180 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181 if (ompt_enabled.enabled &&
1182 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1184 ompt_task_info_t *parent_task_info;
1185 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1187 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188 if (ompt_enabled.ompt_callback_parallel_begin) {
1191 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192 &(parent_task_info->task_data), &(parent_task_info->frame),
1193 &ompt_parallel_data, team_size,
1194 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1197 #endif // OMPT_SUPPORT
1199 if (this_thr->th.th_team != serial_team) {
1200 // Nested level will be an index in the nested nthreads array
1201 int level = this_thr->th.th_team->t.t_level;
1203 if (serial_team->t.t_serialized) {
1204 /* this serial team was already used
1205 TODO increase performance by making these locks more specific */
1206 kmp_team_t *new_team;
1208 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1211 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1215 proc_bind, &this_thr->th.th_current_task->td_icvs,
1216 0 USE_NESTED_HOT_ARG(NULL));
1217 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218 KMP_ASSERT(new_team);
1220 /* setup new serialized team and install it */
1221 new_team->t.t_threads[0] = this_thr;
1222 new_team->t.t_parent = this_thr->th.th_team;
1223 serial_team = new_team;
1224 this_thr->th.th_serial_team = serial_team;
1228 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229 global_tid, serial_team));
1231 /* TODO the above breaks the requirement that if we run out of resources,
1232 then we can still guarantee that serialized teams are ok, since we may
1233 need to allocate a new one */
1237 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238 global_tid, serial_team));
1241 /* we have to initialize this serial team */
1242 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245 serial_team->t.t_ident = loc;
1246 serial_team->t.t_serialized = 1;
1247 serial_team->t.t_nproc = 1;
1248 serial_team->t.t_parent = this_thr->th.th_team;
1249 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250 this_thr->th.th_team = serial_team;
1251 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1253 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1254 this_thr->th.th_current_task));
1255 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256 this_thr->th.th_current_task->td_flags.executing = 0;
1258 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1260 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261 implicit task for each serialized task represented by
1262 team->t.t_serialized? */
1263 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264 &this_thr->th.th_current_task->td_parent->td_icvs);
1266 // Thread value exists in the nested nthreads array for the next nested level
1268 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269 this_thr->th.th_current_task->td_icvs.nproc =
1270 __kmp_nested_nth.nth[level + 1];
1273 if (__kmp_nested_proc_bind.used &&
1274 (level + 1 < __kmp_nested_proc_bind.used)) {
1275 this_thr->th.th_current_task->td_icvs.proc_bind =
1276 __kmp_nested_proc_bind.bind_types[level + 1];
1280 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1282 this_thr->th.th_info.ds.ds_tid = 0;
1284 /* set thread cache values */
1285 this_thr->th.th_team_nproc = 1;
1286 this_thr->th.th_team_master = this_thr;
1287 this_thr->th.th_team_serialized = 1;
1289 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1293 propagateFPControl(serial_team);
1295 /* check if we need to allocate dispatch buffers stack */
1296 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298 serial_team->t.t_dispatch->th_disp_buffer =
1299 (dispatch_private_info_t *)__kmp_allocate(
1300 sizeof(dispatch_private_info_t));
1302 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1307 /* this serialized team is already being used,
1308 * that's fine, just add another nested level */
1309 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312 ++serial_team->t.t_serialized;
1313 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1315 // Nested level will be an index in the nested nthreads array
1316 int level = this_thr->th.th_team->t.t_level;
1317 // Thread value exists in the nested nthreads array for the next nested level
1319 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320 this_thr->th.th_current_task->td_icvs.nproc =
1321 __kmp_nested_nth.nth[level + 1];
1323 serial_team->t.t_level++;
1324 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325 "of serial team %p to %d\n",
1326 global_tid, serial_team, serial_team->t.t_level));
1328 /* allocate/push dispatch buffers stack */
1329 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1331 dispatch_private_info_t *disp_buffer =
1332 (dispatch_private_info_t *)__kmp_allocate(
1333 sizeof(dispatch_private_info_t));
1334 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1337 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1341 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1343 // Perform the display affinity functionality for
1344 // serialized parallel regions
1345 if (__kmp_display_affinity) {
1346 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347 this_thr->th.th_prev_num_threads != 1) {
1348 // NULL means use the affinity-format-var ICV
1349 __kmp_aux_display_affinity(global_tid, NULL);
1350 this_thr->th.th_prev_level = serial_team->t.t_level;
1351 this_thr->th.th_prev_num_threads = 1;
1355 if (__kmp_env_consistency_check)
1356 __kmp_push_parallel(global_tid, NULL);
1358 serial_team->t.ompt_team_info.master_return_address = codeptr;
1359 if (ompt_enabled.enabled &&
1360 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1363 ompt_lw_taskteam_t lw_taskteam;
1364 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365 &ompt_parallel_data, codeptr);
1367 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368 // don't use lw_taskteam after linking. content was swapped
1370 /* OMPT implicit task begin */
1371 implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372 if (ompt_enabled.ompt_callback_implicit_task) {
1373 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376 OMPT_CUR_TASK_INFO(this_thr)
1377 ->thread_num = __kmp_tid_from_gtid(global_tid);
1381 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
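/* Two fast paths are handled inline below: a parallel region closely nested
   inside a teams construct reuses the hot parent team, and a region reduced to
   a single thread is run via __kmpc_serialized_parallel. */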
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390 enum fork_context_e call_context, // Intel, GNU, ...
1391 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1396 int master_this_cons;
1398 kmp_team_t *parent_team;
1399 kmp_info_t *master_th;
1403 int master_set_numthreads;
1407 #if KMP_NESTED_HOT_TEAMS
1408 kmp_hot_team_ptr_t **p_hot_teams;
1411 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1412 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1414 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1415 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1416 /* Some systems prefer the stack for the root thread(s) to start with */
1417 /* some gap from the parent stack to prevent false sharing. */
1418 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1419 /* These 2 lines below are so this does not get optimized out */
1420 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1421 __kmp_stkpadding += (short)((kmp_int64)dummy);
1424 /* initialize if needed */
1426 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1427 if (!TCR_4(__kmp_init_parallel))
1428 __kmp_parallel_initialize();
1429 __kmp_resume_if_soft_paused();
1431 /* setup current data */
1432 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1434 parent_team = master_th->th.th_team;
1435 master_tid = master_th->th.th_info.ds.ds_tid;
1436 master_this_cons = master_th->th.th_local.this_construct;
1437 root = master_th->th.th_root;
1438 master_active = root->r.r_active;
1439 master_set_numthreads = master_th->th.th_set_nproc;
1442 ompt_data_t ompt_parallel_data = ompt_data_none;
1443 ompt_data_t *parent_task_data;
1444 ompt_frame_t *ompt_frame;
1445 ompt_data_t *implicit_task_data;
1446 void *return_address = NULL;
1448 if (ompt_enabled.enabled) {
1449 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1451 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1455 // Nested level will be an index in the nested nthreads array
1456 level = parent_team->t.t_level;
1457 // used to launch non-serial teams even if nested is not allowed
1458 active_level = parent_team->t.t_active_level;
1459 // needed to check nesting inside the teams
1460 teams_level = master_th->th.th_teams_level;
1461 #if KMP_NESTED_HOT_TEAMS
1462 p_hot_teams = &master_th->th.th_hot_teams;
1463 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1464 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1465 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1466 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1467 // it is either actual or not needed (when active_level > 0)
1468 (*p_hot_teams)[0].hot_team_nth = 1;
1473 if (ompt_enabled.enabled) {
1474 if (ompt_enabled.ompt_callback_parallel_begin) {
1475 int team_size = master_set_numthreads
1476 ? master_set_numthreads
1477 : get__nproc_2(parent_team, master_tid);
1478 int flags = OMPT_INVOKER(call_context) |
1479 ((microtask == (microtask_t)__kmp_teams_master)
1480 ? ompt_parallel_league
1481 : ompt_parallel_team);
1482 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1483 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1486 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1490 master_th->th.th_ident = loc;
1492 if (master_th->th.th_teams_microtask && ap &&
1493 microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1494 // AC: This is the start of a parallel region nested inside a teams construct.
1495 // The team is actual (hot), all workers are ready at the fork barrier.
1496 // No lock is needed to initialize the team a bit and then release the workers.
1497 parent_team->t.t_ident = loc;
1498 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1499 parent_team->t.t_argc = argc;
1500 argv = (void **)parent_team->t.t_argv;
1501 for (i = argc - 1; i >= 0; --i)
1502 *argv++ = va_arg(kmp_va_deref(ap), void *);
1503 // Increment our nested depth level, but do not increase the serialization
1504 if (parent_team == master_th->th.th_serial_team) {
1505 // AC: we are in serialized parallel
1506 __kmpc_serialized_parallel(loc, gtid);
1507 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1511 void **exit_frame_p;
1513 ompt_lw_taskteam_t lw_taskteam;
1515 if (ompt_enabled.enabled) {
1516 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1517 &ompt_parallel_data, return_address);
1518 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1520 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1521 // don't use lw_taskteam after linking. content was swapped
1523 /* OMPT implicit task begin */
1524 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1525 if (ompt_enabled.ompt_callback_implicit_task) {
1526 OMPT_CUR_TASK_INFO(master_th)
1527 ->thread_num = __kmp_tid_from_gtid(gtid);
1528 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1529 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1530 implicit_task_data, 1,
1531 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1535 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1537 exit_frame_p = &dummy;
1540 // AC: need to decrement t_serialized for enquiry functions to work
1541 // correctly, will restore at join time
1542 parent_team->t.t_serialized--;
1545 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1546 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1547 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1556 if (ompt_enabled.enabled) {
1557 *exit_frame_p = NULL;
1558 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1559 if (ompt_enabled.ompt_callback_implicit_task) {
1560 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1561 ompt_scope_end, NULL, implicit_task_data, 1,
1562 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1564 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1565 __ompt_lw_taskteam_unlink(master_th);
1566 if (ompt_enabled.ompt_callback_parallel_end) {
1567 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1568 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1569 OMPT_INVOKER(call_context) | ompt_parallel_team,
1572 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1578 parent_team->t.t_pkfn = microtask;
1579 parent_team->t.t_invoke = invoker;
1580 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1581 parent_team->t.t_active_level++;
1582 parent_team->t.t_level++;
1583 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1586 if (ompt_enabled.enabled) {
1587 ompt_lw_taskteam_t lw_taskteam;
1588 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1589 &ompt_parallel_data, return_address);
1590 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1594 /* Change number of threads in the team if requested */
1595 if (master_set_numthreads) { // The parallel has num_threads clause
1596 if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1597 // AC: can only reduce the number of threads dynamically, can't increase
1598 kmp_info_t **other_threads = parent_team->t.t_threads;
1599 parent_team->t.t_nproc = master_set_numthreads;
1600 for (i = 0; i < master_set_numthreads; ++i) {
1601 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1603 // Keep extra threads hot in the team for possible next parallels
1605 master_th->th.th_set_nproc = 0;
1609 if (__kmp_debugging) { // Let debugger override number of threads.
1610 int nth = __kmp_omp_num_threads(loc);
1611 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1612 master_set_numthreads = nth;
1617 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1618 "master_th=%p, gtid=%d\n",
1619 root, parent_team, master_th, gtid));
1620 __kmp_internal_fork(loc, gtid, parent_team);
1621 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1622 "master_th=%p, gtid=%d\n",
1623 root, parent_team, master_th, gtid));
1625 /* Invoke microtask for MASTER thread */
1626 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1627 parent_team->t.t_id, parent_team->t.t_pkfn));
1629 if (!parent_team->t.t_invoke(gtid)) {
1630 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1632 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1633 parent_team->t.t_id, parent_team->t.t_pkfn));
1634 KMP_MB(); /* Flush all pending memory write invalidates. */
1636 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1639 } // Parallel closely nested in teams construct
1642 if (__kmp_tasking_mode != tskm_immediate_exec) {
1643 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1644 parent_team->t.t_task_team[master_th->th.th_task_state]);
1648 if (parent_team->t.t_active_level >=
1649 master_th->th.th_current_task->td_icvs.max_active_levels) {
1652 int enter_teams = ((ap == NULL && active_level == 0) ||
1653 (ap && teams_level > 0 && teams_level == level));
1655 master_set_numthreads
1656 ? master_set_numthreads
1659 master_tid); // TODO: get nproc directly from current task
1661 // Check if we need to take the forkjoin lock (no need for a serialized
1662 // parallel outside of a teams construct). This code was moved here from
1663 // __kmp_reserve_threads() to speed up nested serialized parallels.
1665 if ((get__max_active_levels(master_th) == 1 &&
1666 (root->r.r_in_parallel && !enter_teams)) ||
1667 (__kmp_library == library_serial)) {
1668 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1675 /* determine how many new threads we can use */
1676 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1677 /* AC: If we execute teams from a parallel region (on host), then teams
1678 should be created, but each can only have 1 thread if nesting is
1679 disabled. If teams are called from a serial region, then teams and their
1680 threads should be created regardless of the nesting setting. */
1681 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1682 nthreads, enter_teams);
1683 if (nthreads == 1) {
1684 // Free the lock for single-thread execution here; for multi-thread
1685 // execution it will be freed later, after the team of threads is created
1687 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1691 KMP_DEBUG_ASSERT(nthreads > 0);
1693 // If we temporarily changed the set number of threads then restore it now
1694 master_th->th.th_set_nproc = 0;
1696 /* create a serialized parallel region? */
1697 if (nthreads == 1) {
1698 /* josh todo: hypothetical question: what do we do for OS X*? */
1699 #if KMP_OS_LINUX && \
1700 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1703 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1704 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1705 KMP_ARCH_AARCH64) */
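      /* args holds pointers to the region's arguments so the serialized
         __kmp_invoke_microtask call below can receive them. */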
1708 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1710 __kmpc_serialized_parallel(loc, gtid);
1712 if (call_context == fork_context_intel) {
1713 /* TODO this sucks, use the compiler itself to pass args! :) */
1714 master_th->th.th_serial_team->t.t_ident = loc;
1716 // revert change made in __kmpc_serialized_parallel()
1717 master_th->th.th_serial_team->t.t_level--;
1718 // Get args from parent team for teams construct
1722 void **exit_frame_p;
1723 ompt_task_info_t *task_info;
1725 ompt_lw_taskteam_t lw_taskteam;
1727 if (ompt_enabled.enabled) {
1728 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1729 &ompt_parallel_data, return_address);
1731 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1732 // don't use lw_taskteam after linking. content was swapped
1734 task_info = OMPT_CUR_TASK_INFO(master_th);
1735 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1736 if (ompt_enabled.ompt_callback_implicit_task) {
1737 OMPT_CUR_TASK_INFO(master_th)
1738 ->thread_num = __kmp_tid_from_gtid(gtid);
1739 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1740 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1741 &(task_info->task_data), 1,
1742 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1743 ompt_task_implicit);
1747 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1749 exit_frame_p = &dummy;
1754 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1755 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1756 __kmp_invoke_microtask(microtask, gtid, 0, argc,
1757 parent_team->t.t_argv
1766 if (ompt_enabled.enabled) {
1767 *exit_frame_p = NULL;
1768 if (ompt_enabled.ompt_callback_implicit_task) {
1769 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1770 ompt_scope_end, NULL, &(task_info->task_data), 1,
1771 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1772 ompt_task_implicit);
1774 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1775 __ompt_lw_taskteam_unlink(master_th);
1776 if (ompt_enabled.ompt_callback_parallel_end) {
1777 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1778 &ompt_parallel_data, parent_task_data,
1779 OMPT_INVOKER(call_context) | ompt_parallel_team,
1782 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1785 } else if (microtask == (microtask_t)__kmp_teams_master) {
1786 KMP_DEBUG_ASSERT(master_th->th.th_team ==
1787 master_th->th.th_serial_team);
1788 team = master_th->th.th_team;
1789 // team->t.t_pkfn = microtask;
1790 team->t.t_invoke = invoker;
1791 __kmp_alloc_argv_entries(argc, team, TRUE);
1792 team->t.t_argc = argc;
1793 argv = (void **)team->t.t_argv;
1795 for (i = argc - 1; i >= 0; --i)
1796 *argv++ = va_arg(kmp_va_deref(ap), void *);
1798 for (i = 0; i < argc; ++i)
1799 // Get args from parent team for teams construct
1800 argv[i] = parent_team->t.t_argv[i];
1802 // AC: revert change made in __kmpc_serialized_parallel()
1803 // because initial code in teams should have level=0
1805 // AC: call special invoker for outer "parallel" of teams construct
1808 if (ompt_enabled.enabled) {
1809 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1810 if (ompt_enabled.ompt_callback_implicit_task) {
1811 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1812 ompt_scope_end, NULL, &(task_info->task_data), 0,
1813 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1815 if (ompt_enabled.ompt_callback_parallel_end) {
1816 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1817 &ompt_parallel_data, parent_task_data,
1818 OMPT_INVOKER(call_context) | ompt_parallel_league,
1821 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1826 for (i = argc - 1; i >= 0; --i)
1827 *argv++ = va_arg(kmp_va_deref(ap), void *);
1832 void **exit_frame_p;
1833 ompt_task_info_t *task_info;
1835 ompt_lw_taskteam_t lw_taskteam;
1837 if (ompt_enabled.enabled) {
1838 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1839 &ompt_parallel_data, return_address);
1840 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1841 // don't use lw_taskteam after linking. Content was swapped.
1842 task_info = OMPT_CUR_TASK_INFO(master_th);
1843 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1845 /* OMPT implicit task begin */
1846 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1847 if (ompt_enabled.ompt_callback_implicit_task) {
1848 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1849 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1850 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1851 ompt_task_implicit);
1852 OMPT_CUR_TASK_INFO(master_th)
1853 ->thread_num = __kmp_tid_from_gtid(gtid);
1857 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1859 exit_frame_p = &dummy;
1864 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1865 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1866 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1875 if (ompt_enabled.enabled) {
1876 *exit_frame_p = NULL;
1877 if (ompt_enabled.ompt_callback_implicit_task) {
1878 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1879 ompt_scope_end, NULL, &(task_info->task_data), 1,
1880 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1881 ompt_task_implicit);
1884 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1885 __ompt_lw_taskteam_unlink(master_th);
1886 if (ompt_enabled.ompt_callback_parallel_end) {
1887 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1888 &ompt_parallel_data, parent_task_data,
1889 OMPT_INVOKER(call_context) | ompt_parallel_team,
1892 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1896 } else if (call_context == fork_context_gnu) {
1898 ompt_lw_taskteam_t lwt;
1899 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1902 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1903 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1904 // don't use lw_taskteam after linking. Content was swapped.
1907 // we were called from GNU native code
1908 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1911 KMP_ASSERT2(call_context < fork_context_last,
1912 "__kmp_fork_call: unknown fork_context parameter");
1915 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1918 } // if (nthreads == 1)
1920 // GEH: only modify the executing flag when not serialized;
1921 // the serialized case is handled in __kmpc_serialized_parallel
1922 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1923 "curtask=%p, curtask_max_aclevel=%d\n",
1924 parent_team->t.t_active_level, master_th,
1925 master_th->th.th_current_task,
1926 master_th->th.th_current_task->td_icvs.max_active_levels));
1927 // TODO: GEH - cannot do this assertion because root thread not set up as executing
1929 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1930 master_th->th.th_current_task->td_flags.executing = 0;
1932 if (!master_th->th.th_teams_microtask || level > teams_level) {
1933 /* Increment our nested depth level */
1934 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1937 // See if we need to make a copy of the ICVs.
1938 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1939 if ((level + 1 < __kmp_nested_nth.used) &&
1940 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1941 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1943 nthreads_icv = 0; // don't update
1946 // Figure out the proc_bind_policy for the new team.
1947 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1948 kmp_proc_bind_t proc_bind_icv =
1949 proc_bind_default; // proc_bind_default means don't update
1950 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1951 proc_bind = proc_bind_false;
1953 if (proc_bind == proc_bind_default) {
1954 // No proc_bind clause specified; use current proc-bind-var for this
1956 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1958 /* else: The proc_bind policy was specified explicitly on parallel clause.
1959 This overrides proc-bind-var for this parallel region, but does not
1960 change proc-bind-var. */
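// For example, a proc_bind(spread) clause on the parallel directive applies only
// to this region; the proc-bind-var that the child threads inherit is taken from
// the nested bind list handled just below.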
1961 // Figure the value of proc-bind-var for the child threads.
1962 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1963 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1964 master_th->th.th_current_task->td_icvs.proc_bind)) {
1965 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1969 // Reset for next parallel region
1970 master_th->th.th_set_proc_bind = proc_bind_default;
1972 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1973 kmp_internal_control_t new_icvs;
1974 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1975 new_icvs.next = NULL;
1976 if (nthreads_icv > 0) {
1977 new_icvs.nproc = nthreads_icv;
1979 if (proc_bind_icv != proc_bind_default) {
1980 new_icvs.proc_bind = proc_bind_icv;
1983 /* allocate a new parallel team */
1984 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1985 team = __kmp_allocate_team(root, nthreads, nthreads,
1989 proc_bind, &new_icvs,
1990 argc USE_NESTED_HOT_ARG(master_th));
1992 /* allocate a new parallel team */
1993 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1994 team = __kmp_allocate_team(root, nthreads, nthreads,
1999 &master_th->th.th_current_task->td_icvs,
2000 argc USE_NESTED_HOT_ARG(master_th));
2003 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2005 /* setup the new team */
2006 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2007 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2008 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2009 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2010 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2012 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2015 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2016 // TODO: parent_team->t.t_level == INT_MAX ???
2017 if (!master_th->th.th_teams_microtask || level > teams_level) {
2018 int new_level = parent_team->t.t_level + 1;
2019 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2020 new_level = parent_team->t.t_active_level + 1;
2021 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2023 // AC: Do not increase parallel level at start of the teams construct
2024 int new_level = parent_team->t.t_level;
2025 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2026 new_level = parent_team->t.t_active_level;
2027 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2029 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2030 // set master's schedule as new run-time schedule
2031 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2033 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2034 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2036 // Update the floating point rounding in the team if required.
2037 propagateFPControl(team);
2039 if (__kmp_tasking_mode != tskm_immediate_exec) {
2040 // Set master's task team to team's task team. Unless this is the hot team, it should be NULL before the call.
2042 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2043 parent_team->t.t_task_team[master_th->th.th_task_state]);
2044 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2045 "%p, new task_team %p / team %p\n",
2046 __kmp_gtid_from_thread(master_th),
2047 master_th->th.th_task_team, parent_team,
2048 team->t.t_task_team[master_th->th.th_task_state], team));
2050 if (active_level || master_th->th.th_task_team) {
2051 // Save a copy of master's task_state on the memo stack
2052 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2053 if (master_th->th.th_task_state_top >=
2054 master_th->th.th_task_state_stack_sz) { // increase size
2055 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2056 kmp_uint8 *old_stack, *new_stack;
2058 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2059 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2060 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2062 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2063 ++i) { // zero-init rest of stack
2066 old_stack = master_th->th.th_task_state_memo_stack;
2067 master_th->th.th_task_state_memo_stack = new_stack;
2068 master_th->th.th_task_state_stack_sz = new_size;
2069 __kmp_free(old_stack);
2071 // Store master's task_state on stack
2073 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2074 master_th->th.th_task_state;
2075 master_th->th.th_task_state_top++;
2076 #if KMP_NESTED_HOT_TEAMS
2077 if (master_th->th.th_hot_teams &&
2078 active_level < __kmp_hot_teams_max_level &&
2079 team == master_th->th.th_hot_teams[active_level].hot_team) {
2080 // Restore master's nested state if nested hot team
2081 master_th->th.th_task_state =
2083 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2086 master_th->th.th_task_state = 0;
2087 #if KMP_NESTED_HOT_TEAMS
2091 #if !KMP_NESTED_HOT_TEAMS
2092 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2093 (team == root->r.r_hot_team));
2099 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2100 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2102 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2103 (team->t.t_master_tid == 0 &&
2104 (team->t.t_parent == root->r.r_root_team ||
2105 team->t.t_parent->t.t_serialized)));
2108 /* now, setup the arguments */
2109 argv = (void **)team->t.t_argv;
2111 for (i = argc - 1; i >= 0; --i) {
2112 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2113 KMP_CHECK_UPDATE(*argv, new_argv);
2117 for (i = 0; i < argc; ++i) {
2118 // Get args from parent team for teams construct
2119 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2123 /* now actually fork the threads */
2124 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2125 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2126 root->r.r_active = TRUE;
2128 __kmp_fork_team_threads(root, team, master_th, gtid);
2129 __kmp_setup_icv_copy(team, nthreads,
2130 &master_th->th.th_current_task->td_icvs, loc);
2133 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2136 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2139 if (team->t.t_active_level == 1 // only report frames at level 1
2140 && !master_th->th.th_teams_microtask) { // not in teams construct
2142 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2143 (__kmp_forkjoin_frames_mode == 3 ||
2144 __kmp_forkjoin_frames_mode == 1)) {
2145 kmp_uint64 tmp_time = 0;
2146 if (__itt_get_timestamp_ptr)
2147 tmp_time = __itt_get_timestamp();
2148 // Internal fork - report frame begin
2149 master_th->th.th_frame_time = tmp_time;
2150 if (__kmp_forkjoin_frames_mode == 3)
2151 team->t.t_region_time = tmp_time;
2153 // only one notification scheme (either "submit" or "forking/joined", not both)
2154 #endif /* USE_ITT_NOTIFY */
2155 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2156 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2157 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2158 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2161 #endif /* USE_ITT_BUILD */
2163 /* now go on and do the work */
2164 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2167 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2168 root, team, master_th, gtid));
2171 if (__itt_stack_caller_create_ptr) {
2172 team->t.t_stack_id =
2173 __kmp_itt_stack_caller_create(); // create new stack stitching id
2174 // before entering fork barrier
2176 #endif /* USE_ITT_BUILD */
2178 // AC: skip __kmp_internal_fork at teams construct, let only master threads execute it
2181 __kmp_internal_fork(loc, gtid, team);
2182 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2183 "master_th=%p, gtid=%d\n",
2184 root, team, master_th, gtid));
2187 if (call_context == fork_context_gnu) {
2188 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2192 /* Invoke microtask for MASTER thread */
2193 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2194 team->t.t_id, team->t.t_pkfn));
2195 } // END of timer KMP_fork_call block
2197 #if KMP_STATS_ENABLED
2198 // If beginning a teams construct, then change thread state
2199 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2201 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2205 if (!team->t.t_invoke(gtid)) {
2206 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2209 #if KMP_STATS_ENABLED
2210 // If this was the beginning of a teams construct, then reset the thread state
2212 KMP_SET_THREAD_STATE(previous_state);
2216 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2217 team->t.t_id, team->t.t_pkfn));
2218 KMP_MB(); /* Flush all pending memory write invalidates. */
2220 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2223 if (ompt_enabled.enabled) {
2224 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2232 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2234 // restore state outside the region
2235 thread->th.ompt_thread_info.state =
2236 ((team->t.t_serialized) ? ompt_state_work_serial
2237 : ompt_state_work_parallel);
2240 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2241 kmp_team_t *team, ompt_data_t *parallel_data,
2242 int flags, void *codeptr) {
2243 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2244 if (ompt_enabled.ompt_callback_parallel_end) {
2245 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2246 parallel_data, &(task_info->task_data), flags, codeptr);
2249 task_info->frame.enter_frame = ompt_data_none;
2250 __kmp_join_restore_state(thread, team);
2254 void __kmp_join_call(ident_t *loc, int gtid
2257 enum fork_context_e fork_context
2261 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2263 kmp_team_t *parent_team;
2264 kmp_info_t *master_th;
2268 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2270 /* setup current data */
2271 master_th = __kmp_threads[gtid];
2272 root = master_th->th.th_root;
2273 team = master_th->th.th_team;
2274 parent_team = team->t.t_parent;
2276 master_th->th.th_ident = loc;
2279 void *team_microtask = (void *)team->t.t_pkfn;
2280 if (ompt_enabled.enabled) {
2281 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2286 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2287 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2288 "th_task_team = %p\n",
2289 __kmp_gtid_from_thread(master_th), team,
2290 team->t.t_task_team[master_th->th.th_task_state],
2291 master_th->th.th_task_team));
2292 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2293 team->t.t_task_team[master_th->th.th_task_state]);
2297 if (team->t.t_serialized) {
2298 if (master_th->th.th_teams_microtask) {
2299 // We are in teams construct
2300 int level = team->t.t_level;
2301 int tlevel = master_th->th.th_teams_level;
2302 if (level == tlevel) {
2303 // AC: we haven't incremented it earlier at start of teams construct,
2304 // so do it here - at the end of teams construct
2306 } else if (level == tlevel + 1) {
2307 // AC: we are exiting parallel inside teams, need to increment
2308 // serialization in order to restore it in the next call to
2309 // __kmpc_end_serialized_parallel
2310 team->t.t_serialized++;
2313 __kmpc_end_serialized_parallel(loc, gtid);
2316 if (ompt_enabled.enabled) {
2317 __kmp_join_restore_state(master_th, parent_team);
2324 master_active = team->t.t_master_active;
2327 // AC: No barrier for internal teams at exit from teams construct.
2328 // But there is a barrier for the external team (league).
2329 __kmp_internal_join(loc, gtid, team);
2331 master_th->th.th_task_state =
2332 0; // AC: no tasking in teams (out of any parallel)
2338 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2339 void *codeptr = team->t.ompt_team_info.master_return_address;
2343 if (__itt_stack_caller_create_ptr) {
2344 __kmp_itt_stack_caller_destroy(
2345 (__itt_caller)team->t
2346 .t_stack_id); // destroy the stack stitching id after join barrier
2349 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2350 if (team->t.t_active_level == 1 &&
2351 !master_th->th.th_teams_microtask) { /* not in teams construct */
2352 master_th->th.th_ident = loc;
2353 // only one notification scheme (either "submit" or "forking/joined", not both)
2355 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2356 __kmp_forkjoin_frames_mode == 3)
2357 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2358 master_th->th.th_frame_time, 0, loc,
2359 master_th->th.th_team_nproc, 1);
2360 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2361 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2362 __kmp_itt_region_joined(gtid);
2363 } // active_level == 1
2364 #endif /* USE_ITT_BUILD */
2366 if (master_th->th.th_teams_microtask && !exit_teams &&
2367 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2368 team->t.t_level == master_th->th.th_teams_level + 1) {
2369 // AC: We need to leave the team structure intact at the end of parallel
2370 // inside the teams construct, so that the same (hot) team works at the next
2371 // parallel region; only adjust nesting levels.
2373 ompt_data_t ompt_parallel_data = ompt_data_none;
2374 if (ompt_enabled.enabled) {
2375 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2376 if (ompt_enabled.ompt_callback_implicit_task) {
2377 int ompt_team_size = team->t.t_nproc;
2378 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2379 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2380 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2382 task_info->frame.exit_frame = ompt_data_none;
2383 task_info->task_data = ompt_data_none;
2384 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2385 __ompt_lw_taskteam_unlink(master_th);
2388 /* Decrement our nested depth level */
2390 team->t.t_active_level--;
2391 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2393 // Restore number of threads in the team if needed. This code relies on
2394 // the proper adjustment of th_teams_size.nth after the fork in
2395 // __kmp_teams_master on each teams master in the case that
2396 // __kmp_reserve_threads reduced it.
2397 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2398 int old_num = master_th->th.th_team_nproc;
2399 int new_num = master_th->th.th_teams_size.nth;
2400 kmp_info_t **other_threads = team->t.t_threads;
2401 team->t.t_nproc = new_num;
2402 for (int i = 0; i < old_num; ++i) {
2403 other_threads[i]->th.th_team_nproc = new_num;
2405 // Adjust the state of the unused threads of the team
2406 for (int i = old_num; i < new_num; ++i) {
2407 // Re-initialize thread's barrier data.
2408 KMP_DEBUG_ASSERT(other_threads[i]);
2409 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2410 for (int b = 0; b < bs_last_barrier; ++b) {
2411 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2412 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2414 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2417 if (__kmp_tasking_mode != tskm_immediate_exec) {
2418 // Synchronize thread's task state
2419 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2425 if (ompt_enabled.enabled) {
2426 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2427 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2434 /* do cleanup and restore the parent team */
2435 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2436 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2438 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2440 /* jc: The following lock has instructions with REL and ACQ semantics,
2441 separating the parallel user code called in this parallel region
2442 from the serial user code called after this function returns. */
2443 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2445 if (!master_th->th.th_teams_microtask ||
2446 team->t.t_level > master_th->th.th_teams_level) {
2447 /* Decrement our nested depth level */
2448 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2450 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2453 if (ompt_enabled.enabled) {
2454 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2455 if (ompt_enabled.ompt_callback_implicit_task) {
2456 int flags = (team_microtask == (void *)__kmp_teams_master)
2458 : ompt_task_implicit;
2459 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2460 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2461 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2462 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2464 task_info->frame.exit_frame = ompt_data_none;
2465 task_info->task_data = ompt_data_none;
2469 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2471 __kmp_pop_current_task_from_thread(master_th);
2473 #if KMP_AFFINITY_SUPPORTED
2474 // Restore master thread's partition.
2475 master_th->th.th_first_place = team->t.t_first_place;
2476 master_th->th.th_last_place = team->t.t_last_place;
2477 #endif // KMP_AFFINITY_SUPPORTED
2478 master_th->th.th_def_allocator = team->t.t_def_allocator;
2480 updateHWFPControl(team);
2482 if (root->r.r_active != master_active)
2483 root->r.r_active = master_active;
2485 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2486 master_th)); // this will free worker threads
2488 /* This race was fun to find. Make sure the following is inside the critical
2489 region; otherwise assertions may fail occasionally since the old team may be
2490 reallocated and the hierarchy appears inconsistent. It is actually safe to
2491 run and won't cause any bugs, but it will cause those assertion failures. It's
2492 only one deref & assign, so we might as well put it in the critical region. */
2493 master_th->th.th_team = parent_team;
2494 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2495 master_th->th.th_team_master = parent_team->t.t_threads[0];
2496 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2498 /* restore serialized team, if need be */
2499 if (parent_team->t.t_serialized &&
2500 parent_team != master_th->th.th_serial_team &&
2501 parent_team != root->r.r_root_team) {
2502 __kmp_free_team(root,
2503 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2504 master_th->th.th_serial_team = parent_team;
2507 if (__kmp_tasking_mode != tskm_immediate_exec) {
2508 if (master_th->th.th_task_state_top >
2509 0) { // Restore task state from memo stack
2510 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2511 // Remember master's state if we re-use this nested hot team
2512 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2513 master_th->th.th_task_state;
2514 --master_th->th.th_task_state_top; // pop
2515 // Now restore state at this level
2516 master_th->th.th_task_state =
2518 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2520 // Copy the task team from the parent team to the master thread
2521 master_th->th.th_task_team =
2522 parent_team->t.t_task_team[master_th->th.th_task_state];
2524 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2525 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2529 // TODO: GEH - cannot do this assertion because root thread not set up as executing
2531 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2532 master_th->th.th_current_task->td_flags.executing = 1;
2534 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2538 OMPT_INVOKER(fork_context) |
2539 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2540 : ompt_parallel_team);
2541 if (ompt_enabled.enabled) {
2542 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2548 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2551 /* Check whether we should push an internal control record onto the
2552 serial team stack. If so, do it. */
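// A record is pushed only when the current team is the thread's serial team,
// the team is nested (t_serialized > 1), and the top of the control stack does
// not already correspond to the current serialized nesting level.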
2553 void __kmp_save_internal_controls(kmp_info_t *thread) {
2555 if (thread->th.th_team != thread->th.th_serial_team) {
2558 if (thread->th.th_team->t.t_serialized > 1) {
2561 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2564 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2565 thread->th.th_team->t.t_serialized) {
2569 if (push) { /* push a record on the serial team's stack */
2570 kmp_internal_control_t *control =
2571 (kmp_internal_control_t *)__kmp_allocate(
2572 sizeof(kmp_internal_control_t));
2574 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2576 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2578 control->next = thread->th.th_team->t.t_control_stack_top;
2579 thread->th.th_team->t.t_control_stack_top = control;
2584 /* Changes set_nproc */
2585 void __kmp_set_num_threads(int new_nth, int gtid) {
2589 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2590 KMP_DEBUG_ASSERT(__kmp_init_serial);
2594 else if (new_nth > __kmp_max_nth)
2595 new_nth = __kmp_max_nth;
2597 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2598 thread = __kmp_threads[gtid];
2599 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2600 return; // nothing to do
2602 __kmp_save_internal_controls(thread);
2604 set__nproc(thread, new_nth);
2606 // If this omp_set_num_threads() call will cause the hot team size to be
2607 // reduced (in the absence of a num_threads clause), then reduce it now,
2608 // rather than waiting for the next parallel region.
2609 root = thread->th.th_root;
2610 if (__kmp_init_parallel && (!root->r.r_active) &&
2611 (root->r.r_hot_team->t.t_nproc > new_nth)
2612 #if KMP_NESTED_HOT_TEAMS
2613 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2616 kmp_team_t *hot_team = root->r.r_hot_team;
2619 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2621 // Release the extra threads we don't need any more.
2622 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2623 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2624 if (__kmp_tasking_mode != tskm_immediate_exec) {
2625 // When decreasing team size, threads no longer in the team should unref the task team.
2627 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2629 __kmp_free_thread(hot_team->t.t_threads[f]);
2630 hot_team->t.t_threads[f] = NULL;
2632 hot_team->t.t_nproc = new_nth;
2633 #if KMP_NESTED_HOT_TEAMS
2634 if (thread->th.th_hot_teams) {
2635 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2636 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2640 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2642 // Update the t_nproc field in the threads that are still active.
2643 for (f = 0; f < new_nth; f++) {
2644 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2645 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2647 // Special flag to indicate the team size was changed by an omp_set_num_threads() call
2648 hot_team->t.t_size_changed = -1;
2652 /* Changes max_active_levels */
2653 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2656 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2658 gtid, max_active_levels));
2659 KMP_DEBUG_ASSERT(__kmp_init_serial);
2661 // validate max_active_levels
2662 if (max_active_levels < 0) {
2663 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2664 // We ignore this call if the user has specified a negative value.
2665 // The current setting won't be changed. The last valid setting will be
2666 // used. A warning will be issued (if warnings are allowed as controlled by
2667 // the KMP_WARNINGS env var).
2668 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2669 "max_active_levels for thread %d = (%d)\n",
2670 gtid, max_active_levels));
2673 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2674 // it's OK, the max_active_levels is within the valid range: [ 0;
2675 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2676 // We allow a zero value. (implementation defined behavior)
2678 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2679 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2680 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2681 // Current upper limit is MAX_INT. (implementation defined behavior)
2682 // If the input exceeds the upper limit, we correct the input to be the
2683 // upper limit. (implementation defined behavior)
2684 // Actually, the flow should never reach here while the limit is MAX_INT.
2686 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2687 "max_active_levels for thread %d = (%d)\n",
2688 gtid, max_active_levels));
2690 thread = __kmp_threads[gtid];
2692 __kmp_save_internal_controls(thread);
2694 set__max_active_levels(thread, max_active_levels);
2697 /* Gets max_active_levels */
2698 int __kmp_get_max_active_levels(int gtid) {
2701 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2702 KMP_DEBUG_ASSERT(__kmp_init_serial);
2704 thread = __kmp_threads[gtid];
2705 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2706 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2707 "curtask_maxaclevel=%d\n",
2708 gtid, thread->th.th_current_task,
2709 thread->th.th_current_task->td_icvs.max_active_levels));
2710 return thread->th.th_current_task->td_icvs.max_active_levels;
2713 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2714 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2716 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2717 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2719 kmp_sched_t orig_kind;
2720 // kmp_team_t *team;
2722 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2723 gtid, (int)kind, chunk));
2724 KMP_DEBUG_ASSERT(__kmp_init_serial);
2726 // Check if the kind parameter is valid, correct if needed.
2727 // Valid parameters should fit in one of two intervals - standard or extended:
2728 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2729 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2731 kind = __kmp_sched_without_mods(kind);
2733 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2734 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2735 // TODO: Hint needs attention in case we change the default schedule.
2736 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2737 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2739 kind = kmp_sched_default;
2740 chunk = 0; // ignore chunk value in case of bad kind
2743 thread = __kmp_threads[gtid];
2745 __kmp_save_internal_controls(thread);
2747 if (kind < kmp_sched_upper_std) {
2748 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2749 // differentiate static chunked vs. unchunked: chunk should be invalid to
2750 // indicate an unchunked schedule (which is the default)
2751 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2753 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2754 __kmp_sch_map[kind - kmp_sched_lower - 1];
2757 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2758 // kmp_sched_lower - 2 ];
2759 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2760 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2761 kmp_sched_lower - 2];
2763 __kmp_sched_apply_mods_intkind(
2764 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2765 if (kind == kmp_sched_auto || chunk < 1) {
2766 // ignore parameter chunk for schedule auto
2767 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2769 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2773 /* Gets def_sched_var ICV values */
2774 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2776 enum sched_type th_type;
2778 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2779 KMP_DEBUG_ASSERT(__kmp_init_serial);
2781 thread = __kmp_threads[gtid];
2783 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2784 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2785 case kmp_sch_static:
2786 case kmp_sch_static_greedy:
2787 case kmp_sch_static_balanced:
2788 *kind = kmp_sched_static;
2789 __kmp_sched_apply_mods_stdkind(kind, th_type);
2790 *chunk = 0; // chunk was not set, try to show this fact via zero value
2792 case kmp_sch_static_chunked:
2793 *kind = kmp_sched_static;
2795 case kmp_sch_dynamic_chunked:
2796 *kind = kmp_sched_dynamic;
2798 case kmp_sch_guided_chunked:
2799 case kmp_sch_guided_iterative_chunked:
2800 case kmp_sch_guided_analytical_chunked:
2801 *kind = kmp_sched_guided;
2804 *kind = kmp_sched_auto;
2806 case kmp_sch_trapezoidal:
2807 *kind = kmp_sched_trapezoidal;
2809 #if KMP_STATIC_STEAL_ENABLED
2810 case kmp_sch_static_steal:
2811 *kind = kmp_sched_static_steal;
2815 KMP_FATAL(UnknownSchedulingType, th_type);
2818 __kmp_sched_apply_mods_stdkind(kind, th_type);
2819 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2822 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2828 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2829 KMP_DEBUG_ASSERT(__kmp_init_serial);
2836 thr = __kmp_threads[gtid];
2837 team = thr->th.th_team;
2838 ii = team->t.t_level;
2842 if (thr->th.th_teams_microtask) {
2843 // AC: we are in teams region where multiple nested teams have same level
2844 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2846 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2847 KMP_DEBUG_ASSERT(ii >= tlevel);
2848 // AC: As we need to pass by the teams league, we need to artificially increase ii
2851 ii += 2; // three teams have same level
2853 ii++; // two teams have same level
2859 return __kmp_tid_from_gtid(gtid);
2861 dd = team->t.t_serialized;
2863 while (ii > level) {
2864 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2866 if ((team->t.t_serialized) && (!dd)) {
2867 team = team->t.t_parent;
2871 team = team->t.t_parent;
2872 dd = team->t.t_serialized;
2877 return (dd > 1) ? (0) : (team->t.t_master_tid);
2880 int __kmp_get_team_size(int gtid, int level) {
2886 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2887 KMP_DEBUG_ASSERT(__kmp_init_serial);
2894 thr = __kmp_threads[gtid];
2895 team = thr->th.th_team;
2896 ii = team->t.t_level;
2900 if (thr->th.th_teams_microtask) {
2901 // AC: we are in teams region where multiple nested teams have same level
2902 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2904 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2905 KMP_DEBUG_ASSERT(ii >= tlevel);
2906 // AC: As we need to pass by the teams league, we need to artificially increase ii
2909 ii += 2; // three teams have same level
2911 ii++; // two teams have same level
2916 while (ii > level) {
2917 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2919 if (team->t.t_serialized && (!dd)) {
2920 team = team->t.t_parent;
2924 team = team->t.t_parent;
2929 return team->t.t_nproc;
2932 kmp_r_sched_t __kmp_get_schedule_global() {
2933 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2934 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2935 // independently, so the updated schedule can be obtained here.
2937 kmp_r_sched_t r_sched;
2939 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2940 // __kmp_guided. __kmp_sched should keep its original value, so that the user can set
2941 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2942 // different roots (even in OMP 2.5)
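// For example (assuming KMP_SCHEDULE was parsed into __kmp_sched ==
// kmp_sch_guided_chunked and __kmp_chunk == 4), the returned schedule is the
// detailed guided kind (__kmp_guided) with chunk 4.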
2943 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2944 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2945 if (s == kmp_sch_static) {
2946 // replace STATIC with more detailed schedule (balanced or greedy)
2947 r_sched.r_sched_type = __kmp_static;
2948 } else if (s == kmp_sch_guided_chunked) {
2949 // replace GUIDED with more detailed schedule (iterative or analytical)
2950 r_sched.r_sched_type = __kmp_guided;
2951 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2952 r_sched.r_sched_type = __kmp_sched;
2954 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2956 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2957 // __kmp_chunk may be wrong here (if it was never set)
2958 r_sched.chunk = KMP_DEFAULT_CHUNK;
2960 r_sched.chunk = __kmp_chunk;
2966 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
2967 at least argc number of *t_argv entries for the requested team. */
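// Small argument lists (argc <= KMP_INLINE_ARGV_ENTRIES) reuse the inline space
// in the team structure; larger ones are page-allocated on the heap with some
// headroom (never fewer than KMP_MIN_MALLOC_ARGV_ENTRIES entries).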
2968 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2970 KMP_DEBUG_ASSERT(team);
2971 if (!realloc || argc > team->t.t_max_argc) {
2973 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2974 "current entries=%d\n",
2975 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
2976 /* if previously allocated heap space for args, free them */
2977 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
2978 __kmp_free((void *)team->t.t_argv);
2980 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
2981 /* use unused space in the cache line for arguments */
2982 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2983 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
2985 team->t.t_id, team->t.t_max_argc));
2986 team->t.t_argv = &team->t.t_inline_argv[0];
2987 if (__kmp_storage_map) {
2988 __kmp_print_storage_map_gtid(
2989 -1, &team->t.t_inline_argv[0],
2990 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2991 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
2995 /* allocate space for arguments in the heap */
2996 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
2997 ? KMP_MIN_MALLOC_ARGV_ENTRIES
2999 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3001 team->t.t_id, team->t.t_max_argc));
3003 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3004 if (__kmp_storage_map) {
3005 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3006 &team->t.t_argv[team->t.t_max_argc],
3007 sizeof(void *) * team->t.t_max_argc,
3008 "team_%d.t_argv", team->t.t_id);
3014 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3016 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
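// A team that can never have more than one thread needs only two dispatch
// buffers; otherwise use the configured __kmp_dispatch_num_buffers.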
3018 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3019 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3020 sizeof(dispatch_shared_info_t) * num_disp_buff);
3021 team->t.t_dispatch =
3022 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3023 team->t.t_implicit_task_taskdata =
3024 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3025 team->t.t_max_nproc = max_nth;
3027 /* setup dispatch buffers */
3028 for (i = 0; i < num_disp_buff; ++i) {
3029 team->t.t_disp_buffer[i].buffer_index = i;
3030 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3034 static void __kmp_free_team_arrays(kmp_team_t *team) {
3035 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3037 for (i = 0; i < team->t.t_max_nproc; ++i) {
3038 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3039 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3040 team->t.t_dispatch[i].th_disp_buffer = NULL;
3043 #if KMP_USE_HIER_SCHED
3044 __kmp_dispatch_free_hierarchies(team);
3046 __kmp_free(team->t.t_threads);
3047 __kmp_free(team->t.t_disp_buffer);
3048 __kmp_free(team->t.t_dispatch);
3049 __kmp_free(team->t.t_implicit_task_taskdata);
3050 team->t.t_threads = NULL;
3051 team->t.t_disp_buffer = NULL;
3052 team->t.t_dispatch = NULL;
3053 team->t.t_implicit_task_taskdata = 0;
3056 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3057 kmp_info_t **oldThreads = team->t.t_threads;
3059 __kmp_free(team->t.t_disp_buffer);
3060 __kmp_free(team->t.t_dispatch);
3061 __kmp_free(team->t.t_implicit_task_taskdata);
3062 __kmp_allocate_team_arrays(team, max_nth);
3064 KMP_MEMCPY(team->t.t_threads, oldThreads,
3065 team->t.t_nproc * sizeof(kmp_info_t *));
3067 __kmp_free(oldThreads);
3070 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3072 kmp_r_sched_t r_sched =
3073 __kmp_get_schedule_global(); // get current state of scheduling globals
3075 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3077 kmp_internal_control_t g_icvs = {
3078 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3079 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3080 // adjustment of threads (per thread)
3081 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3082 // whether blocktime is explicitly set
3083 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3085 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3088 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3089 // next parallel region (per thread)
3090 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3091 __kmp_cg_max_nth, // int thread_limit;
3092 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3093 // for max_active_levels
3094 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3095 // {sched,chunk} pair
3096 __kmp_nested_proc_bind.bind_types[0],
3097 __kmp_default_device,
3098 NULL // struct kmp_internal_control *next;
3104 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3106 kmp_internal_control_t gx_icvs;
3107 gx_icvs.serial_nesting_level =
3108 0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3109 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3110 gx_icvs.next = NULL;
3115 static void __kmp_initialize_root(kmp_root_t *root) {
3117 kmp_team_t *root_team;
3118 kmp_team_t *hot_team;
3119 int hot_team_max_nth;
3120 kmp_r_sched_t r_sched =
3121 __kmp_get_schedule_global(); // get current state of scheduling globals
3122 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3123 KMP_DEBUG_ASSERT(root);
3124 KMP_ASSERT(!root->r.r_begin);
3126 /* setup the root state structure */
3127 __kmp_init_lock(&root->r.r_begin_lock);
3128 root->r.r_begin = FALSE;
3129 root->r.r_active = FALSE;
3130 root->r.r_in_parallel = 0;
3131 root->r.r_blocktime = __kmp_dflt_blocktime;
3133 /* setup the root team for this task */
3134 /* allocate the root team structure */
3135 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3138 __kmp_allocate_team(root,
3142 ompt_data_none, // root parallel id
3144 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3146 USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3149 // Non-NULL value should be assigned to make the debugger display the root team.
3151 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3154 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3156 root->r.r_root_team = root_team;
3157 root_team->t.t_control_stack_top = NULL;
3159 /* initialize root team */
3160 root_team->t.t_threads[0] = NULL;
3161 root_team->t.t_nproc = 1;
3162 root_team->t.t_serialized = 1;
3163 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3164 root_team->t.t_sched.sched = r_sched.sched;
3167 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3168 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3170 /* setup the hot team for this task */
3171 /* allocate the hot team structure */
3172 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3175 __kmp_allocate_team(root,
3177 __kmp_dflt_team_nth_ub * 2, // max_nproc
3179 ompt_data_none, // root parallel id
3181 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183 USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3187 root->r.r_hot_team = hot_team;
3188 root_team->t.t_control_stack_top = NULL;
3190 /* first-time initialization */
3191 hot_team->t.t_parent = root_team;
3193 /* initialize hot team */
3194 hot_team_max_nth = hot_team->t.t_max_nproc;
3195 for (f = 0; f < hot_team_max_nth; ++f) {
3196 hot_team->t.t_threads[f] = NULL;
3198 hot_team->t.t_nproc = 1;
3199 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3200 hot_team->t.t_sched.sched = r_sched.sched;
3201 hot_team->t.t_size_changed = 0;
3206 typedef struct kmp_team_list_item {
3207 kmp_team_p const *entry;
3208 struct kmp_team_list_item *next;
3209 } kmp_team_list_item_t;
3210 typedef kmp_team_list_item_t *kmp_team_list_t;
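// Simple singly-linked list used by __kmp_print_structure() below to accumulate
// the set of teams reachable from the threads and roots tables.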
3212 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3213 kmp_team_list_t list, // List of teams.
3214 kmp_team_p const *team // Team to add.
3217 // List must terminate with item where both entry and next are NULL.
3218 // Team is added to the list only once.
3219 // List is sorted in ascending order by team id.
3220 // Team id is *not* a key.
3224 KMP_DEBUG_ASSERT(list != NULL);
3229 __kmp_print_structure_team_accum(list, team->t.t_parent);
3230 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3232 // Search list for the team.
3234 while (l->next != NULL && l->entry != team) {
3237 if (l->next != NULL) {
3238 return; // Team has been added before, exit.
3241 // Team is not found. Search list again for insertion point.
3243 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3249 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3250 sizeof(kmp_team_list_item_t));
3257 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3260 __kmp_printf("%s", title);
3262 __kmp_printf("%2x %p\n", team->t.t_id, team);
3264 __kmp_printf(" - (nil)\n");
3268 static void __kmp_print_structure_thread(char const *title,
3269 kmp_info_p const *thread) {
3270 __kmp_printf("%s", title);
3271 if (thread != NULL) {
3272 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3274 __kmp_printf(" - (nil)\n");
3278 void __kmp_print_structure(void) {
3280 kmp_team_list_t list;
3282 // Initialize list of teams.
3284 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3288 __kmp_printf("\n------------------------------\nGlobal Thread "
3289 "Table\n------------------------------\n");
3292 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3293 __kmp_printf("%2d", gtid);
3294 if (__kmp_threads != NULL) {
3295 __kmp_printf(" %p", __kmp_threads[gtid]);
3297 if (__kmp_root != NULL) {
3298 __kmp_printf(" %p", __kmp_root[gtid]);
3304 // Print out __kmp_threads array.
3305 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3307 if (__kmp_threads != NULL) {
3309 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3310 kmp_info_t const *thread = __kmp_threads[gtid];
3311 if (thread != NULL) {
3312 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3313 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3314 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3315 __kmp_print_structure_team(" Serial Team: ",
3316 thread->th.th_serial_team);
3317 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3318 __kmp_print_structure_thread(" Master: ",
3319 thread->th.th_team_master);
3320 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3321 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3322 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3323 __kmp_print_structure_thread(" Next in pool: ",
3324 thread->th.th_next_pool);
3326 __kmp_print_structure_team_accum(list, thread->th.th_team);
3327 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3331 __kmp_printf("Threads array is not allocated.\n");
3334 // Print out __kmp_root array.
3335 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3337 if (__kmp_root != NULL) {
3339 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3340 kmp_root_t const *root = __kmp_root[gtid];
3342 __kmp_printf("GTID %2d %p:\n", gtid, root);
3343 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3344 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3345 __kmp_print_structure_thread(" Uber Thread: ",
3346 root->r.r_uber_thread);
3347 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3348 __kmp_printf(" In Parallel: %2d\n",
3349 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3351 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3352 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3356 __kmp_printf("Ubers array is not allocated.\n");
3359 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3361 while (list->next != NULL) {
3362 kmp_team_p const *team = list->entry;
3364 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3365 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3366 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3367 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3368 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3369 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3370 for (i = 0; i < team->t.t_nproc; ++i) {
3371 __kmp_printf(" Thread %2d: ", i);
3372 __kmp_print_structure_thread("", team->t.t_threads[i]);
3374 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3379 // Print out __kmp_thread_pool and __kmp_team_pool.
3380 __kmp_printf("\n------------------------------\nPools\n----------------------"
3382 __kmp_print_structure_thread("Thread pool: ",
3383 CCAST(kmp_info_t *, __kmp_thread_pool));
3384 __kmp_print_structure_team("Team pool: ",
3385 CCAST(kmp_team_t *, __kmp_team_pool));
3389 while (list != NULL) {
3390 kmp_team_list_item_t *item = list;
3392 KMP_INTERNAL_FREE(item);
3398 //---------------------------------------------------------------------------
3399 // Stuff for per-thread fast random number generator
3401 static const unsigned __kmp_primes[] = {
3402 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3403 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3404 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3405 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3406 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3407 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3408 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3409 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3410 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3411 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3412 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3414 //---------------------------------------------------------------------------
3415 // __kmp_get_random: Get a random number using a linear congruential method.
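// The per-thread state is updated as x_{n+1} = a * x_n + 1 (mod 2^32); the value
// returned is the high 16 bits of the previous state, which have a much longer
// period than the low-order bits of an LCG.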
3416 unsigned short __kmp_get_random(kmp_info_t *thread) {
3417 unsigned x = thread->th.th_x;
3418 unsigned short r = x >> 16;
3420 thread->th.th_x = x * thread->th.th_a + 1;
3422 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3423 thread->th.th_info.ds.ds_tid, r));
3427 //--------------------------------------------------------
3428 // __kmp_init_random: Initialize a random number generator
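// Each thread is seeded with its thread id and picks its multiplier 'a' from
// __kmp_primes (indexed by the seed modulo the table size), so different threads
// step through different LCG sequences.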
3429 void __kmp_init_random(kmp_info_t *thread) {
3430 unsigned seed = thread->th.th_info.ds.ds_tid;
3433 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3434 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3436 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3440 /* reclaim array entries for root threads that are already dead, returns number of entries reclaimed */
3442 static int __kmp_reclaim_dead_roots(void) {
3445 for (i = 0; i < __kmp_threads_capacity; ++i) {
3446 if (KMP_UBER_GTID(i) &&
3447 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3449 ->r.r_active) { // AC: reclaim only roots died in non-active state
3450 r += __kmp_unregister_root_other_thread(i);
3457 /* This function attempts to create free entries in __kmp_threads and
3458 __kmp_root, and returns the number of free entries generated.
3460 For Windows* OS static library, the first mechanism used is to reclaim array
3461 entries for root threads that are already dead.
3463 On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3464 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3465 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3466 threadprivate cache array has been created. Synchronization with
3467 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3469 After any dead root reclamation, if the clipping value allows array expansion
3470 to result in the generation of a total of nNeed free slots, the function does
3471 that expansion. If not, nothing is done beyond the possible initial root thread reclamation.
3474 If any argument is negative, the behavior is undefined. */
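// Expansion allocates one block holding both the new __kmp_threads and
// __kmp_root arrays, copies the old contents, publishes the new pointers, and
// only then updates __kmp_threads_capacity, so a reader never observes a
// capacity larger than the arrays it can actually reach.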
3475 static int __kmp_expand_threads(int nNeed) {
3477 int minimumRequiredCapacity;
3479 kmp_info_t **newThreads;
3480 kmp_root_t **newRoot;
3482 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3483 // resizing __kmp_threads does not need additional protection if foreign
3484 // threads are present
3486 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3487 /* only for Windows static library */
3488 /* reclaim array entries for root threads that are already dead */
3489 added = __kmp_reclaim_dead_roots();
3500 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3501 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3502 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3503 // > __kmp_max_nth in one of two ways:
3505 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3506 // may not be reused by another thread, so we may need to increase
3507 // __kmp_threads_capacity to __kmp_max_nth + 1.
3509 // 2) New foreign root(s) are encountered. We always register new foreign
3510 // roots. This may cause a smaller # of threads to be allocated at
3511 // subsequent parallel regions, but the worker threads hang around (and
3512 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3514 // Anyway, that is the reason for moving the check to see if
3515 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3516 // instead of having it performed here. -BB
3518 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3520 /* compute expansion headroom to check if we can expand */
3521 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3522 /* possible expansion too small -- give up */
3525 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3527 newCapacity = __kmp_threads_capacity;
3529 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3530 : __kmp_sys_max_nth;
3531 } while (newCapacity < minimumRequiredCapacity);
3532 newThreads = (kmp_info_t **)__kmp_allocate(
3533 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3535 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3536 KMP_MEMCPY(newThreads, __kmp_threads,
3537 __kmp_threads_capacity * sizeof(kmp_info_t *));
3538 KMP_MEMCPY(newRoot, __kmp_root,
3539 __kmp_threads_capacity * sizeof(kmp_root_t *));
3541 kmp_info_t **temp_threads = __kmp_threads;
3542 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3543 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3544 __kmp_free(temp_threads);
3545 added += newCapacity - __kmp_threads_capacity;
3546 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3548 if (newCapacity > __kmp_tp_capacity) {
3549 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3550 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3551 __kmp_threadprivate_resize_cache(newCapacity);
3552 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3553 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3555 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
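// Worked example (illustrative numbers only): with __kmp_threads_capacity at
// 64, nNeed == 70 and __kmp_sys_max_nth == 512, minimumRequiredCapacity is
// 134, so the doubling loop above goes 64 -> 128 -> 256 and stops there; had
// __kmp_sys_max_nth been 200, the second step would have clipped the new
// capacity to 200 instead of doubling it to 256.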
3561 /* Register the current thread as a root thread and obtain our gtid. We must
3562 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3563 thread that calls from __kmp_do_serial_initialize() */
3564 int __kmp_register_root(int initial_thread) {
3565 kmp_info_t *root_thread;
3569 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3570 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3574 If the initial thread has not invoked the OpenMP RTL yet, and this thread
3575 is not the initial one, the "__kmp_all_nth >= __kmp_threads_capacity" check
3576 does not work as expected -- it may return false (meaning there is at least
3577 one empty slot in the __kmp_threads array), but it is possible that the only
3578 free slot is #0, which is reserved for the initial thread and so cannot be
3579 used for this one. The following code works around this bug.
3581 However, the right solution seems to be not reserving slot #0 for the
3582 initial thread, because:
3583 (1) there is no magic in slot #0,
3584 (2) we cannot detect the initial thread reliably (the first thread which
3585 does serial initialization may not be the real initial thread).
3587 capacity = __kmp_threads_capacity;
3588 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3592 /* see if there are too many threads */
3593 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3594 if (__kmp_tp_cached) {
3595 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3596 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3597 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3599 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3604 /* find an available thread slot */
3605 /* Don't reassign the zero slot since we need that to only be used by initial
3607 for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3611 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3612 KMP_ASSERT(gtid < __kmp_threads_capacity);
3614 /* update global accounting */
3616 TCW_4(__kmp_nth, __kmp_nth + 1);
3618 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3619 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3620 if (__kmp_adjust_gtid_mode) {
3621 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3622 if (TCR_4(__kmp_gtid_mode) != 2) {
3623 TCW_4(__kmp_gtid_mode, 2);
3626 if (TCR_4(__kmp_gtid_mode) != 1) {
3627 TCW_4(__kmp_gtid_mode, 1);
3632 #ifdef KMP_ADJUST_BLOCKTIME
3633 /* Adjust blocktime to zero if necessary */
3634 /* Middle initialization might not have occurred yet */
3635 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3636 if (__kmp_nth > __kmp_avail_proc) {
3637 __kmp_zero_bt = TRUE;
3640 #endif /* KMP_ADJUST_BLOCKTIME */
3642 /* setup this new hierarchy */
3643 if (!(root = __kmp_root[gtid])) {
3644 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3645 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3648 #if KMP_STATS_ENABLED
3649 // Initialize stats as soon as possible (right after gtid assignment).
3650 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3651 __kmp_stats_thread_ptr->startLife();
3652 KMP_SET_THREAD_STATE(SERIAL_REGION);
3653 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3655 __kmp_initialize_root(root);
3657 /* setup new root thread structure */
3658 if (root->r.r_uber_thread) {
3659 root_thread = root->r.r_uber_thread;
3661 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3662 if (__kmp_storage_map) {
3663 __kmp_print_thread_storage_map(root_thread, gtid);
3665 root_thread->th.th_info.ds.ds_gtid = gtid;
3667 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3669 root_thread->th.th_root = root;
3670 if (__kmp_env_consistency_check) {
3671 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3674 __kmp_initialize_fast_memory(root_thread);
3675 #endif /* USE_FAST_MEMORY */
3678 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3679 __kmp_initialize_bget(root_thread);
3681 __kmp_init_random(root_thread); // Initialize random number generator
3684 /* setup the serial team held in reserve by the root thread */
3685 if (!root_thread->th.th_serial_team) {
3686 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3687 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3688 root_thread->th.th_serial_team = __kmp_allocate_team(
3691 ompt_data_none, // root parallel id
3693 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3695 KMP_ASSERT(root_thread->th.th_serial_team);
3696 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3697 root_thread->th.th_serial_team));
3699 /* drop root_thread into place */
3700 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3702 root->r.r_root_team->t.t_threads[0] = root_thread;
3703 root->r.r_hot_team->t.t_threads[0] = root_thread;
3704 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3705 // AC: the team created in reserve, not for execution (it is unused for now).
3706 root_thread->th.th_serial_team->t.t_serialized = 0;
3707 root->r.r_uber_thread = root_thread;
3709 /* initialize the thread, get it ready to go */
3710 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3711 TCW_4(__kmp_init_gtid, TRUE);
3713 /* prepare the master thread for get_gtid() */
3714 __kmp_gtid_set_specific(gtid);
3717 __kmp_itt_thread_name(gtid);
3718 #endif /* USE_ITT_BUILD */
3720 #ifdef KMP_TDATA_GTID
3723 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3724 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3726 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3728 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3729 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3730 KMP_INIT_BARRIER_STATE));
3731 { // Initialize barrier data.
3733 for (b = 0; b < bs_last_barrier; ++b) {
3734 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3736 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3740 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3741 KMP_INIT_BARRIER_STATE);
3743 #if KMP_AFFINITY_SUPPORTED
3744 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3745 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3746 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3747 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3748 if (TCR_4(__kmp_init_middle)) {
3749 __kmp_affinity_set_init_mask(gtid, TRUE);
3751 #endif /* KMP_AFFINITY_SUPPORTED */
3752 root_thread->th.th_def_allocator = __kmp_def_allocator;
3753 root_thread->th.th_prev_level = 0;
3754 root_thread->th.th_prev_num_threads = 1;
3756 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3757 tmp->cg_root = root_thread;
3758 tmp->cg_thread_limit = __kmp_cg_max_nth;
3759 tmp->cg_nthreads = 1;
3760 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3761 " cg_nthreads init to 1\n",
3764 root_thread->th.th_cg_roots = tmp;
3766 __kmp_root_counter++;
3769 if (!initial_thread && ompt_enabled.enabled) {
3771 kmp_info_t *root_thread = ompt_get_thread();
3773 ompt_set_thread_state(root_thread, ompt_state_overhead);
3775 if (ompt_enabled.ompt_callback_thread_begin) {
3776 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3777 ompt_thread_initial, __ompt_get_thread_data_internal());
3779 ompt_data_t *task_data;
3780 ompt_data_t *parallel_data;
3781 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3782 if (ompt_enabled.ompt_callback_implicit_task) {
3783 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3784 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3787 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3792 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
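// Illustrative call pattern (sketch, not runtime code): a thread becoming a
// root effectively does
//   int gtid = __kmp_register_root(FALSE);      // __kmp_initz_lock held
//   ... execute serial and parallel regions as T#gtid ...
//   __kmp_unregister_root_current_thread(gtid); // frees the slot again
// The initial_thread == TRUE path is reserved for the call made from
// __kmp_do_serial_initialize().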
3797 #if KMP_NESTED_HOT_TEAMS
3798 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3799 const int max_level) {
3801 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3802 if (!hot_teams || !hot_teams[level].hot_team) {
3805 KMP_DEBUG_ASSERT(level < max_level);
3806 kmp_team_t *team = hot_teams[level].hot_team;
3807 nth = hot_teams[level].hot_team_nth;
3808 n = nth - 1; // master is not freed
3809 if (level < max_level - 1) {
3810 for (i = 0; i < nth; ++i) {
3811 kmp_info_t *th = team->t.t_threads[i];
3812 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3813 if (i > 0 && th->th.th_hot_teams) {
3814 __kmp_free(th->th.th_hot_teams);
3815 th->th.th_hot_teams = NULL;
3819 __kmp_free_team(root, team, NULL);
3824 // Resets a root thread and clears its root and hot teams.
3825 // Returns the number of __kmp_threads entries directly and indirectly freed.
3826 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3827 kmp_team_t *root_team = root->r.r_root_team;
3828 kmp_team_t *hot_team = root->r.r_hot_team;
3829 int n = hot_team->t.t_nproc;
3832 KMP_DEBUG_ASSERT(!root->r.r_active);
3834 root->r.r_root_team = NULL;
3835 root->r.r_hot_team = NULL;
3836 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3837 // before call to __kmp_free_team().
3838 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3839 #if KMP_NESTED_HOT_TEAMS
3840 if (__kmp_hot_teams_max_level >
3841 0) { // need to free nested hot teams and their threads if any
3842 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3843 kmp_info_t *th = hot_team->t.t_threads[i];
3844 if (__kmp_hot_teams_max_level > 1) {
3845 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3847 if (th->th.th_hot_teams) {
3848 __kmp_free(th->th.th_hot_teams);
3849 th->th.th_hot_teams = NULL;
3854 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3856 // Before we can reap the thread, we need to make certain that all other
3857 // threads in the teams that had this root as ancestor have stopped trying to
3859 if (__kmp_tasking_mode != tskm_immediate_exec) {
3860 __kmp_wait_to_unref_task_teams();
3864 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3866 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3868 (LPVOID) & (root->r.r_uber_thread->th),
3869 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3870 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3871 #endif /* KMP_OS_WINDOWS */
3874 ompt_data_t *task_data;
3875 ompt_data_t *parallel_data;
3876 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3877 if (ompt_enabled.ompt_callback_implicit_task) {
3878 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3879 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3881 if (ompt_enabled.ompt_callback_thread_end) {
3882 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3883 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3888 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3889 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3890 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3892 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3893 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3895 // need to free contention group structure
3896 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3897 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3898 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3899 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3900 root->r.r_uber_thread->th.th_cg_roots = NULL;
3902 __kmp_reap_thread(root->r.r_uber_thread, 1);
3904 // We cannot put the root thread into __kmp_thread_pool, so we have to reap it
3905 // instead of freeing.
3906 root->r.r_uber_thread = NULL;
3907 /* mark root as no longer in use */
3908 root->r.r_begin = FALSE;
3913 void __kmp_unregister_root_current_thread(int gtid) {
3914 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3915 /* this lock should be ok, since unregister_root_current_thread is never
3916 called during an abort, only during a normal close. furthermore, if you
3917 have the forkjoin lock, you should never try to get the initz lock */
3918 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3919 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3920 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3923 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3926 kmp_root_t *root = __kmp_root[gtid];
3928 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3929 KMP_ASSERT(KMP_UBER_GTID(gtid));
3930 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3931 KMP_ASSERT(root->r.r_active == FALSE);
3935 kmp_info_t *thread = __kmp_threads[gtid];
3936 kmp_team_t *team = thread->th.th_team;
3937 kmp_task_team_t *task_team = thread->th.th_task_team;
3939 // we need to wait for the proxy tasks before finishing the thread
3940 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3942 // the runtime is shutting down so we won't report any events
3943 thread->th.ompt_thread_info.state = ompt_state_undefined;
3945 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3948 __kmp_reset_root(gtid, root);
3950 /* free up this thread slot */
3951 __kmp_gtid_set_specific(KMP_GTID_DNE);
3952 #ifdef KMP_TDATA_GTID
3953 __kmp_gtid = KMP_GTID_DNE;
3958 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3960 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3964 /* __kmp_forkjoin_lock must be already held
3965 Unregisters a root thread that is not the current thread. Returns the number
3966 of __kmp_threads entries freed as a result. */
3967 static int __kmp_unregister_root_other_thread(int gtid) {
3968 kmp_root_t *root = __kmp_root[gtid];
3971 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
3972 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3973 KMP_ASSERT(KMP_UBER_GTID(gtid));
3974 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3975 KMP_ASSERT(root->r.r_active == FALSE);
3977 r = __kmp_reset_root(gtid, root);
3979 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
3985 void __kmp_task_info() {
3987 kmp_int32 gtid = __kmp_entry_gtid();
3988 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
3989 kmp_info_t *this_thr = __kmp_threads[gtid];
3990 kmp_team_t *steam = this_thr->th.th_serial_team;
3991 kmp_team_t *team = this_thr->th.th_team;
3994 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
3996 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
3997 team->t.t_implicit_task_taskdata[tid].td_parent);
4001 /* TODO optimize with one big memclr, take out what isn't needed, split
4002 responsibility to workers as much as possible, and delay initialization of
4003 features as much as possible */
4004 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4005 int tid, int gtid) {
4006 /* this_thr->th.th_info.ds.ds_gtid is set up in
4007 kmp_allocate_thread/create_worker.
4008 this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4009 kmp_info_t *master = team->t.t_threads[0];
4010 KMP_DEBUG_ASSERT(this_thr != NULL);
4011 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4012 KMP_DEBUG_ASSERT(team);
4013 KMP_DEBUG_ASSERT(team->t.t_threads);
4014 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4015 KMP_DEBUG_ASSERT(master);
4016 KMP_DEBUG_ASSERT(master->th.th_root);
4020 TCW_SYNC_PTR(this_thr->th.th_team, team);
4022 this_thr->th.th_info.ds.ds_tid = tid;
4023 this_thr->th.th_set_nproc = 0;
4024 if (__kmp_tasking_mode != tskm_immediate_exec)
4025 // When tasking is possible, threads are not safe to reap until they are
4026 // done tasking; this will be set when tasking code is exited in wait
4027 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4028 else // no tasking --> always safe to reap
4029 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4030 this_thr->th.th_set_proc_bind = proc_bind_default;
4031 #if KMP_AFFINITY_SUPPORTED
4032 this_thr->th.th_new_place = this_thr->th.th_current_place;
4034 this_thr->th.th_root = master->th.th_root;
4036 /* setup the thread's cache of the team structure */
4037 this_thr->th.th_team_nproc = team->t.t_nproc;
4038 this_thr->th.th_team_master = master;
4039 this_thr->th.th_team_serialized = team->t.t_serialized;
4040 TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4042 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4044 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4045 tid, gtid, this_thr, this_thr->th.th_current_task));
4047 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4050 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4051 tid, gtid, this_thr, this_thr->th.th_current_task));
4052 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4053 // __kmp_initialize_team()?
4055 /* TODO no worksharing in speculative threads */
4056 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4058 this_thr->th.th_local.this_construct = 0;
4060 if (!this_thr->th.th_pri_common) {
4061 this_thr->th.th_pri_common =
4062 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4063 if (__kmp_storage_map) {
4064 __kmp_print_storage_map_gtid(
4065 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4066 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4068 this_thr->th.th_pri_head = NULL;
4071 if (this_thr != master && // Master's CG root is initialized elsewhere
4072 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4073 // Make new thread's CG root same as master's
4074 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4075 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4077 // worker changes CG, need to check if old CG should be freed
4078 int i = tmp->cg_nthreads--;
4079 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4080 " on node %p of thread %p to %d\n",
4081 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4083 __kmp_free(tmp); // last thread left CG --> free it
4086 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4087 // Increment new thread's CG root's counter to add the new thread
4088 this_thr->th.th_cg_roots->cg_nthreads++;
4089 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4090 " node %p of thread %p to %d\n",
4091 this_thr, this_thr->th.th_cg_roots,
4092 this_thr->th.th_cg_roots->cg_root,
4093 this_thr->th.th_cg_roots->cg_nthreads));
4094 this_thr->th.th_current_task->td_icvs.thread_limit =
4095 this_thr->th.th_cg_roots->cg_thread_limit;
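// Walk-through of the hand-off above (illustrative): if the old CG root node
// drops to cg_nthreads == 0 after the decrement, this thread was its last
// member and the node is freed; the thread then joins the master's CG,
// increments that node's cg_nthreads, and inherits its cg_thread_limit as the
// thread-limit ICV.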
4098 /* Initialize dynamic dispatch */
4100 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4101 // Use team max_nproc since this will never change for the team.
4103 sizeof(dispatch_private_info_t) *
4104 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
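  // Intent of the sizing above (sketch): a serialized team (t_max_nproc == 1)
  // only ever needs a single dispatch_private_info_t, while a real team
  // rotates through __kmp_dispatch_num_buffers of them so that a thread
  // moving on to the next dynamically scheduled loop does not reuse a buffer
  // other threads may still be reading.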
4105 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4106 team->t.t_max_nproc));
4107 KMP_ASSERT(dispatch);
4108 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4109 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4111 dispatch->th_disp_index = 0;
4112 dispatch->th_doacross_buf_idx = 0;
4113 if (!dispatch->th_disp_buffer) {
4114 dispatch->th_disp_buffer =
4115 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4117 if (__kmp_storage_map) {
4118 __kmp_print_storage_map_gtid(
4119 gtid, &dispatch->th_disp_buffer[0],
4120 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4122 : __kmp_dispatch_num_buffers],
4123 disp_size, "th_%d.th_dispatch.th_disp_buffer "
4124 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4125 gtid, team->t.t_id, gtid);
4128 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4131 dispatch->th_dispatch_pr_current = 0;
4132 dispatch->th_dispatch_sh_current = 0;
4134 dispatch->th_deo_fcn = 0; /* ORDERED */
4135 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4138 this_thr->th.th_next_pool = NULL;
4140 if (!this_thr->th.th_task_state_memo_stack) {
4142 this_thr->th.th_task_state_memo_stack =
4143 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4144 this_thr->th.th_task_state_top = 0;
4145 this_thr->th.th_task_state_stack_sz = 4;
4146 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4147 ++i) // zero init the stack
4148 this_thr->th.th_task_state_memo_stack[i] = 0;
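  // Illustrative note: th_task_state_memo_stack keeps one task-state entry
  // per nesting level; __kmp_allocate_team (below) reads the master's entry
  // for a given level to seed th_task_state of workers added to a nested hot
  // team.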
4151 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4152 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4157 /* Allocate a new thread for the requesting team. This is only called from
4158 within a fork/join critical section. We first try to get an available
4159 thread from the thread pool; if none is available, we fork a new one,
4160 assuming we are able to create one. This should be assured, as the
4161 caller should have checked on this first. */
4162 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4164 kmp_team_t *serial_team;
4165 kmp_info_t *new_thr;
4168 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4169 KMP_DEBUG_ASSERT(root && team);
4170 #if !KMP_NESTED_HOT_TEAMS
4171 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4175 /* first, try to get one from the thread pool */
4176 if (__kmp_thread_pool) {
4177 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4178 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4179 if (new_thr == __kmp_thread_pool_insert_pt) {
4180 __kmp_thread_pool_insert_pt = NULL;
4182 TCW_4(new_thr->th.th_in_pool, FALSE);
4183 __kmp_suspend_initialize_thread(new_thr);
4184 __kmp_lock_suspend_mx(new_thr);
4185 if (new_thr->th.th_active_in_pool == TRUE) {
4186 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4187 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4188 new_thr->th.th_active_in_pool = FALSE;
4190 __kmp_unlock_suspend_mx(new_thr);
4192 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4193 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4194 KMP_ASSERT(!new_thr->th.th_team);
4195 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4197 /* setup the thread structure */
4198 __kmp_initialize_info(new_thr, team, new_tid,
4199 new_thr->th.th_info.ds.ds_gtid);
4200 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4202 TCW_4(__kmp_nth, __kmp_nth + 1);
4204 new_thr->th.th_task_state = 0;
4205 new_thr->th.th_task_state_top = 0;
4206 new_thr->th.th_task_state_stack_sz = 4;
4208 #ifdef KMP_ADJUST_BLOCKTIME
4209 /* Adjust blocktime back to zero if necessary */
4210 /* Middle initialization might not have occurred yet */
4211 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4212 if (__kmp_nth > __kmp_avail_proc) {
4213 __kmp_zero_bt = TRUE;
4216 #endif /* KMP_ADJUST_BLOCKTIME */
4219 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4220 // KMP_BARRIER_PARENT_FLAG.
4222 kmp_balign_t *balign = new_thr->th.th_bar;
4223 for (b = 0; b < bs_last_barrier; ++b)
4224 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4227 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4228 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4234 /* no, we'll fork a new one */
4235 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4236 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4239 // If this is the first worker thread the RTL is creating, then also
4240 // launch the monitor thread. We try to do this as early as possible.
4241 if (!TCR_4(__kmp_init_monitor)) {
4242 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4243 if (!TCR_4(__kmp_init_monitor)) {
4244 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4245 TCW_4(__kmp_init_monitor, 1);
4246 __kmp_create_monitor(&__kmp_monitor);
4247 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4249 // AC: wait until monitor has started. This is a fix for CQ232808.
4250 // The reason is that if the library is loaded/unloaded in a loop with
4251 // small (parallel) work in between, then there is a high probability that
4252 // the monitor thread starts after the library has shut down. At shutdown
4253 // it is too late to cope with the problem, because when the master is in
4254 // DllMain (process detach) the monitor has no chance to start (it is
4255 // blocked), and the master has no means to inform the monitor that the
4256 // library has gone, because all the memory which the monitor can access
4257 // is going to be released/reset.
4258 while (TCR_4(__kmp_init_monitor) < 2) {
4261 KF_TRACE(10, ("after monitor thread has started\n"));
4264 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4269 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4270 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4273 /* allocate space for it. */
4274 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4276 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4278 if (__kmp_storage_map) {
4279 __kmp_print_thread_storage_map(new_thr, new_gtid);
4282 // add the reserve serialized team, initialized from the team's master thread
4284 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4285 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4286 new_thr->th.th_serial_team = serial_team =
4287 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4289 ompt_data_none, // root parallel id
4291 proc_bind_default, &r_icvs,
4292 0 USE_NESTED_HOT_ARG(NULL));
4294 KMP_ASSERT(serial_team);
4295 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4296 // execution (it is unused for now).
4297 serial_team->t.t_threads[0] = new_thr;
4299 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4302 /* setup the thread structures */
4303 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4306 __kmp_initialize_fast_memory(new_thr);
4307 #endif /* USE_FAST_MEMORY */
4310 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4311 __kmp_initialize_bget(new_thr);
4314 __kmp_init_random(new_thr); // Initialize random number generator
4316 /* Initialize these only once when thread is grabbed for a team allocation */
4318 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4319 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4322 kmp_balign_t *balign = new_thr->th.th_bar;
4323 for (b = 0; b < bs_last_barrier; ++b) {
4324 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4325 balign[b].bb.team = NULL;
4326 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4327 balign[b].bb.use_oncore_barrier = 0;
4330 new_thr->th.th_spin_here = FALSE;
4331 new_thr->th.th_next_waiting = 0;
4333 new_thr->th.th_blocking = false;
4336 #if KMP_AFFINITY_SUPPORTED
4337 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4338 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4339 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4340 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4342 new_thr->th.th_def_allocator = __kmp_def_allocator;
4343 new_thr->th.th_prev_level = 0;
4344 new_thr->th.th_prev_num_threads = 1;
4346 TCW_4(new_thr->th.th_in_pool, FALSE);
4347 new_thr->th.th_active_in_pool = FALSE;
4348 TCW_4(new_thr->th.th_active, TRUE);
4350 /* adjust the global counters */
4354 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4355 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4356 if (__kmp_adjust_gtid_mode) {
4357 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4358 if (TCR_4(__kmp_gtid_mode) != 2) {
4359 TCW_4(__kmp_gtid_mode, 2);
4362 if (TCR_4(__kmp_gtid_mode) != 1) {
4363 TCW_4(__kmp_gtid_mode, 1);
4368 #ifdef KMP_ADJUST_BLOCKTIME
4369 /* Adjust blocktime back to zero if necessary */
4370 /* Middle initialization might not have occurred yet */
4371 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4372 if (__kmp_nth > __kmp_avail_proc) {
4373 __kmp_zero_bt = TRUE;
4376 #endif /* KMP_ADJUST_BLOCKTIME */
4378 /* actually fork it and create the new worker thread */
4380 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4381 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4383 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4385 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4391 /* Reinitialize team for reuse.
4392 The hot team code calls this routine at every fork barrier, so the EPCC
4393 barrier tests are extremely sensitive to changes in it, esp. writes to the
4394 team struct, which cause a cache invalidation in all threads.
4395 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4396 static void __kmp_reinitialize_team(kmp_team_t *team,
4397 kmp_internal_control_t *new_icvs,
4399 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4400 team->t.t_threads[0], team));
4401 KMP_DEBUG_ASSERT(team && new_icvs);
4402 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4403 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4405 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4406 // Copy ICVs to the master thread's implicit taskdata
4407 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4408 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4410 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4411 team->t.t_threads[0], team));
4414 /* Initialize the team data structure.
4415 This assumes the t_threads and t_max_nproc are already set.
4416 Also, we don't touch the arguments */
4417 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4418 kmp_internal_control_t *new_icvs,
4420 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4423 KMP_DEBUG_ASSERT(team);
4424 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4425 KMP_DEBUG_ASSERT(team->t.t_threads);
4428 team->t.t_master_tid = 0; /* not needed */
4429 /* team->t.t_master_bar; not needed */
4430 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4431 team->t.t_nproc = new_nproc;
4433 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4434 team->t.t_next_pool = NULL;
4435 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4438 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4439 team->t.t_invoke = NULL; /* not needed */
4441 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4442 team->t.t_sched.sched = new_icvs->sched.sched;
4444 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4445 team->t.t_fp_control_saved = FALSE; /* not needed */
4446 team->t.t_x87_fpu_control_word = 0; /* not needed */
4447 team->t.t_mxcsr = 0; /* not needed */
4448 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4450 team->t.t_construct = 0;
4452 team->t.t_ordered.dt.t_value = 0;
4453 team->t.t_master_active = FALSE;
4456 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4459 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4462 team->t.t_control_stack_top = NULL;
4464 __kmp_reinitialize_team(team, new_icvs, loc);
4467 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4470 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4471 /* Sets full mask for thread and returns old mask, no changes to structures. */
4473 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4474 if (KMP_AFFINITY_CAPABLE()) {
4476 if (old_mask != NULL) {
4477 status = __kmp_get_system_affinity(old_mask, TRUE);
4480 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4484 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
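// Typical usage (sketch mirroring the hot-team growth path further below):
// save the caller's mask, widen to the full mask while workers are forked,
// then restore it:
//   kmp_affin_mask_t *old_mask;
//   KMP_CPU_ALLOC(old_mask);
//   __kmp_set_thread_affinity_mask_full_tmp(old_mask);
//   ... create the new workers, which inherit the full mask ...
//   __kmp_set_system_affinity(old_mask, TRUE);
//   KMP_CPU_FREE(old_mask);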
4489 #if KMP_AFFINITY_SUPPORTED
4491 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4492 // It calculates the worker + master thread's partition based upon the parent
4493 // thread's partition, and binds each worker to a thread in their partition.
4494 // The master thread's partition should already include its current binding.
4495 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4496 // Copy the master thread's place partition to the team struct
4497 kmp_info_t *master_th = team->t.t_threads[0];
4498 KMP_DEBUG_ASSERT(master_th != NULL);
4499 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4500 int first_place = master_th->th.th_first_place;
4501 int last_place = master_th->th.th_last_place;
4502 int masters_place = master_th->th.th_current_place;
4503 team->t.t_first_place = first_place;
4504 team->t.t_last_place = last_place;
4506 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4507 "bound to place %d partition = [%d,%d]\n",
4508 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4509 team->t.t_id, masters_place, first_place, last_place));
4511 switch (proc_bind) {
4513 case proc_bind_default:
4514 // serial teams might have the proc_bind policy set to proc_bind_default. It
4515 // doesn't matter, as we don't rebind master thread for any proc_bind policy
4516 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4519 case proc_bind_master: {
4521 int n_th = team->t.t_nproc;
4522 for (f = 1; f < n_th; f++) {
4523 kmp_info_t *th = team->t.t_threads[f];
4524 KMP_DEBUG_ASSERT(th != NULL);
4525 th->th.th_first_place = first_place;
4526 th->th.th_last_place = last_place;
4527 th->th.th_new_place = masters_place;
4528 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4529 team->t.t_display_affinity != 1) {
4530 team->t.t_display_affinity = 1;
4533 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4534 "partition = [%d,%d]\n",
4535 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4536 f, masters_place, first_place, last_place));
4540 case proc_bind_close: {
4542 int n_th = team->t.t_nproc;
4544 if (first_place <= last_place) {
4545 n_places = last_place - first_place + 1;
4547 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4549 if (n_th <= n_places) {
4550 int place = masters_place;
4551 for (f = 1; f < n_th; f++) {
4552 kmp_info_t *th = team->t.t_threads[f];
4553 KMP_DEBUG_ASSERT(th != NULL);
4555 if (place == last_place) {
4556 place = first_place;
4557 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4562 th->th.th_first_place = first_place;
4563 th->th.th_last_place = last_place;
4564 th->th.th_new_place = place;
4565 if (__kmp_display_affinity && place != th->th.th_current_place &&
4566 team->t.t_display_affinity != 1) {
4567 team->t.t_display_affinity = 1;
4570 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4571 "partition = [%d,%d]\n",
4572 __kmp_gtid_from_thread(team->t.t_threads[f]),
4573 team->t.t_id, f, place, first_place, last_place));
4576 int S, rem, gap, s_count;
4577 S = n_th / n_places;
4579 rem = n_th - (S * n_places);
4580 gap = rem > 0 ? n_places / rem : n_places;
4581 int place = masters_place;
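      // Worked example (illustrative): n_th = 7 threads over n_places = 3
      // places gives S = 2, rem = 1, gap = 3; two places receive S = 2
      // threads each and one place absorbs the extra (S + 1) thread when the
      // gap counter lines up.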
4583 for (f = 0; f < n_th; f++) {
4584 kmp_info_t *th = team->t.t_threads[f];
4585 KMP_DEBUG_ASSERT(th != NULL);
4587 th->th.th_first_place = first_place;
4588 th->th.th_last_place = last_place;
4589 th->th.th_new_place = place;
4590 if (__kmp_display_affinity && place != th->th.th_current_place &&
4591 team->t.t_display_affinity != 1) {
4592 team->t.t_display_affinity = 1;
4596 if ((s_count == S) && rem && (gap_ct == gap)) {
4597 // do nothing, add an extra thread to place on next iteration
4598 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4599 // we added an extra thread to this place; move to next place
4600 if (place == last_place) {
4601 place = first_place;
4602 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4610 } else if (s_count == S) { // place full; don't add extra
4611 if (place == last_place) {
4612 place = first_place;
4613 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4623 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4624 "partition = [%d,%d]\n",
4625 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4626 th->th.th_new_place, first_place, last_place));
4628 KMP_DEBUG_ASSERT(place == masters_place);
4632 case proc_bind_spread: {
4634 int n_th = team->t.t_nproc;
4637 if (first_place <= last_place) {
4638 n_places = last_place - first_place + 1;
4640 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4642 if (n_th <= n_places) {
4645 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4646 int S = n_places / n_th;
4647 int s_count, rem, gap, gap_ct;
4649 place = masters_place;
4650 rem = n_places - n_th * S;
4651 gap = rem ? n_th / rem : 1;
4654 if (update_master_only == 1)
4656 for (f = 0; f < thidx; f++) {
4657 kmp_info_t *th = team->t.t_threads[f];
4658 KMP_DEBUG_ASSERT(th != NULL);
4660 th->th.th_first_place = place;
4661 th->th.th_new_place = place;
4662 if (__kmp_display_affinity && place != th->th.th_current_place &&
4663 team->t.t_display_affinity != 1) {
4664 team->t.t_display_affinity = 1;
4667 while (s_count < S) {
4668 if (place == last_place) {
4669 place = first_place;
4670 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4677 if (rem && (gap_ct == gap)) {
4678 if (place == last_place) {
4679 place = first_place;
4680 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4688 th->th.th_last_place = place;
4691 if (place == last_place) {
4692 place = first_place;
4693 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4700 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4701 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4702 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4703 f, th->th.th_new_place, th->th.th_first_place,
4704 th->th.th_last_place, __kmp_affinity_num_masks));
4707 /* Having a uniform space of available computation places, we can create
4708 T partitions of roughly P/T places each and put threads into the first
4709 place of each partition. */
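          // e.g. (illustrative): n_places = 10, n_th = 4, masters_place = 0
          // gives spacing = 11/4 = 2.75 and partitions [0,1], [2,4], [5,7],
          // [8,9]; each thread is then bound to the first place of its
          // partition, as described above.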
4710 double current = static_cast<double>(masters_place);
4712 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4717 if (update_master_only == 1)
4719 for (f = 0; f < thidx; f++) {
4720 first = static_cast<int>(current);
4721 last = static_cast<int>(current + spacing) - 1;
4722 KMP_DEBUG_ASSERT(last >= first);
4723 if (first >= n_places) {
4724 if (masters_place) {
4727 if (first == (masters_place + 1)) {
4728 KMP_DEBUG_ASSERT(f == n_th);
4731 if (last == masters_place) {
4732 KMP_DEBUG_ASSERT(f == (n_th - 1));
4736 KMP_DEBUG_ASSERT(f == n_th);
4741 if (last >= n_places) {
4742 last = (n_places - 1);
4747 KMP_DEBUG_ASSERT(0 <= first);
4748 KMP_DEBUG_ASSERT(n_places > first);
4749 KMP_DEBUG_ASSERT(0 <= last);
4750 KMP_DEBUG_ASSERT(n_places > last);
4751 KMP_DEBUG_ASSERT(last_place >= first_place);
4752 th = team->t.t_threads[f];
4753 KMP_DEBUG_ASSERT(th);
4754 th->th.th_first_place = first;
4755 th->th.th_new_place = place;
4756 th->th.th_last_place = last;
4757 if (__kmp_display_affinity && place != th->th.th_current_place &&
4758 team->t.t_display_affinity != 1) {
4759 team->t.t_display_affinity = 1;
4762 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4763 "partition = [%d,%d], spacing = %.4f\n",
4764 __kmp_gtid_from_thread(team->t.t_threads[f]),
4765 team->t.t_id, f, th->th.th_new_place,
4766 th->th.th_first_place, th->th.th_last_place, spacing));
4770 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4772 int S, rem, gap, s_count;
4773 S = n_th / n_places;
4775 rem = n_th - (S * n_places);
4776 gap = rem > 0 ? n_places / rem : n_places;
4777 int place = masters_place;
4780 if (update_master_only == 1)
4782 for (f = 0; f < thidx; f++) {
4783 kmp_info_t *th = team->t.t_threads[f];
4784 KMP_DEBUG_ASSERT(th != NULL);
4786 th->th.th_first_place = place;
4787 th->th.th_last_place = place;
4788 th->th.th_new_place = place;
4789 if (__kmp_display_affinity && place != th->th.th_current_place &&
4790 team->t.t_display_affinity != 1) {
4791 team->t.t_display_affinity = 1;
4795 if ((s_count == S) && rem && (gap_ct == gap)) {
4796 // do nothing, add an extra thread to place on next iteration
4797 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4798 // we added an extra thread to this place; move on to next place
4799 if (place == last_place) {
4800 place = first_place;
4801 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4809 } else if (s_count == S) { // place is full; don't add extra thread
4810 if (place == last_place) {
4811 place = first_place;
4812 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4821 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4822 "partition = [%d,%d]\n",
4823 __kmp_gtid_from_thread(team->t.t_threads[f]),
4824 team->t.t_id, f, th->th.th_new_place,
4825 th->th.th_first_place, th->th.th_last_place));
4827 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4835 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4838 #endif // KMP_AFFINITY_SUPPORTED
4840 /* Allocate a new team data structure to use. Take one off the free pool if
4843 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4845 ompt_data_t ompt_parallel_data,
4847 kmp_proc_bind_t new_proc_bind,
4848 kmp_internal_control_t *new_icvs,
4849 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4850 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4853 int use_hot_team = !root->r.r_active;
4856 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4857 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4858 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4861 #if KMP_NESTED_HOT_TEAMS
4862 kmp_hot_team_ptr_t *hot_teams;
4864 team = master->th.th_team;
4865 level = team->t.t_active_level;
4866 if (master->th.th_teams_microtask) { // in teams construct?
4867 if (master->th.th_teams_size.nteams > 1 &&
4870 (microtask_t)__kmp_teams_master || // inner fork of the teams
4871 master->th.th_teams_level <
4872 team->t.t_level)) { // or nested parallel inside the teams
4873 ++level; // not increment if #teams==1, or for outer fork of the teams;
4874 // increment otherwise
4877 hot_teams = master->th.th_hot_teams;
4878 if (level < __kmp_hot_teams_max_level && hot_teams &&
4880 .hot_team) { // hot team has already been allocated for given level
4887 // Optimization to use a "hot" team
4888 if (use_hot_team && new_nproc > 1) {
4889 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4890 #if KMP_NESTED_HOT_TEAMS
4891 team = hot_teams[level].hot_team;
4893 team = root->r.r_hot_team;
4896 if (__kmp_tasking_mode != tskm_immediate_exec) {
4897 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4898 "task_team[1] = %p before reinit\n",
4899 team->t.t_task_team[0], team->t.t_task_team[1]));
4903 // Has the number of threads changed?
4904 /* Let's assume the most common case is that the number of threads is
4905 unchanged, and put that case first. */
4906 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4907 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4908 // This case can mean that omp_set_num_threads() was called and the hot
4909 // team size was already reduced, so we check the special flag
4910 if (team->t.t_size_changed == -1) {
4911 team->t.t_size_changed = 1;
4913 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4916 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4917 kmp_r_sched_t new_sched = new_icvs->sched;
4918 // set master's schedule as new run-time schedule
4919 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4921 __kmp_reinitialize_team(team, new_icvs,
4922 root->r.r_uber_thread->th.th_ident);
4924 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4925 team->t.t_threads[0], team));
4926 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4928 #if KMP_AFFINITY_SUPPORTED
4929 if ((team->t.t_size_changed == 0) &&
4930 (team->t.t_proc_bind == new_proc_bind)) {
4931 if (new_proc_bind == proc_bind_spread) {
4932 __kmp_partition_places(
4933 team, 1); // add flag to update only master for spread
4935 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
4936 "proc_bind = %d, partition = [%d,%d]\n",
4937 team->t.t_id, new_proc_bind, team->t.t_first_place,
4938 team->t.t_last_place));
4940 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4941 __kmp_partition_places(team);
4944 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4945 #endif /* KMP_AFFINITY_SUPPORTED */
4946 } else if (team->t.t_nproc > new_nproc) {
4948 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
4951 team->t.t_size_changed = 1;
4952 #if KMP_NESTED_HOT_TEAMS
4953 if (__kmp_hot_teams_mode == 0) {
4954 // AC: the saved number of threads should correspond to the team's value in
4955 // this mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
4956 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4957 hot_teams[level].hot_team_nth = new_nproc;
4958 #endif // KMP_NESTED_HOT_TEAMS
4959 /* release the extra threads we don't need any more */
4960 for (f = new_nproc; f < team->t.t_nproc; f++) {
4961 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4962 if (__kmp_tasking_mode != tskm_immediate_exec) {
4963 // When decreasing team size, threads no longer in the team should
4965 team->t.t_threads[f]->th.th_task_team = NULL;
4967 __kmp_free_thread(team->t.t_threads[f]);
4968 team->t.t_threads[f] = NULL;
4970 #if KMP_NESTED_HOT_TEAMS
4971 } // (__kmp_hot_teams_mode == 0)
4973 // When keeping extra threads in team, switch threads to wait on own
4975 for (f = new_nproc; f < team->t.t_nproc; ++f) {
4976 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4977 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4978 for (int b = 0; b < bs_last_barrier; ++b) {
4979 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4980 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4982 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4986 #endif // KMP_NESTED_HOT_TEAMS
4987 team->t.t_nproc = new_nproc;
4988 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4989 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
4990 __kmp_reinitialize_team(team, new_icvs,
4991 root->r.r_uber_thread->th.th_ident);
4993 // Update remaining threads
4994 for (f = 0; f < new_nproc; ++f) {
4995 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4998 // restore the current task state of the master thread: should be the
5000 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5001 team->t.t_threads[0], team));
5003 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5006 for (f = 0; f < team->t.t_nproc; f++) {
5007 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5008 team->t.t_threads[f]->th.th_team_nproc ==
5013 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5014 #if KMP_AFFINITY_SUPPORTED
5015 __kmp_partition_places(team);
5017 } else { // team->t.t_nproc < new_nproc
5018 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5019 kmp_affin_mask_t *old_mask;
5020 if (KMP_AFFINITY_CAPABLE()) {
5021 KMP_CPU_ALLOC(old_mask);
5026 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5029 team->t.t_size_changed = 1;
5031 #if KMP_NESTED_HOT_TEAMS
5032 int avail_threads = hot_teams[level].hot_team_nth;
5033 if (new_nproc < avail_threads)
5034 avail_threads = new_nproc;
5035 kmp_info_t **other_threads = team->t.t_threads;
5036 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5037 // Adjust barrier data of reserved threads (if any) of the team
5038 // Other data will be set in __kmp_initialize_info() below.
5040 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5041 for (b = 0; b < bs_last_barrier; ++b) {
5042 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5043 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5045 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5049 if (hot_teams[level].hot_team_nth >= new_nproc) {
5050 // we have all needed threads in reserve, no need to allocate any
5051 // this is only possible in mode 1; we cannot have reserved threads in mode 0
5052 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5053 team->t.t_nproc = new_nproc; // just get reserved threads involved
5055 // we may have some threads in reserve, but not enough
5058 .hot_team_nth; // get reserved threads involved if any
5059 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5060 #endif // KMP_NESTED_HOT_TEAMS
5061 if (team->t.t_max_nproc < new_nproc) {
5062 /* reallocate larger arrays */
5063 __kmp_reallocate_team_arrays(team, new_nproc);
5064 __kmp_reinitialize_team(team, new_icvs, NULL);
5067 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5068 /* Temporarily set full mask for master thread before creation of
5069 workers. The reason is that workers inherit their affinity from the master,
5070 so if a lot of workers are created on a single core quickly, they
5071 don't get a chance to set their own affinity for a long time. */
5072 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5075 /* allocate new threads for the hot team */
5076 for (f = team->t.t_nproc; f < new_nproc; f++) {
5077 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5078 KMP_DEBUG_ASSERT(new_worker);
5079 team->t.t_threads[f] = new_worker;
5082 ("__kmp_allocate_team: team %d init T#%d arrived: "
5083 "join=%llu, plain=%llu\n",
5084 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5085 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5086 team->t.t_bar[bs_plain_barrier].b_arrived));
5088 { // Initialize barrier data for new threads.
5090 kmp_balign_t *balign = new_worker->th.th_bar;
5091 for (b = 0; b < bs_last_barrier; ++b) {
5092 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5093 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5094 KMP_BARRIER_PARENT_FLAG);
5096 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5102 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5103 if (KMP_AFFINITY_CAPABLE()) {
5104 /* Restore initial master thread's affinity mask */
5105 __kmp_set_system_affinity(old_mask, TRUE);
5106 KMP_CPU_FREE(old_mask);
5109 #if KMP_NESTED_HOT_TEAMS
5110 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5111 #endif // KMP_NESTED_HOT_TEAMS
5112 /* make sure everyone is synchronized */
5113 int old_nproc = team->t.t_nproc; // save old value and use to update only
5114 // new threads below
5115 __kmp_initialize_team(team, new_nproc, new_icvs,
5116 root->r.r_uber_thread->th.th_ident);
5118 /* reinitialize the threads */
5119 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5120 for (f = 0; f < team->t.t_nproc; ++f)
5121 __kmp_initialize_info(team->t.t_threads[f], team, f,
5122 __kmp_gtid_from_tid(f, team));
5124 if (level) { // set th_task_state for new threads in nested hot team
5125 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5126 // only need to set the th_task_state for the new threads. th_task_state
5127 // for master thread will not be accurate until after this in
5128 // __kmp_fork_call(), so we look to the master's memo_stack to get the
5130 for (f = old_nproc; f < team->t.t_nproc; ++f)
5131 team->t.t_threads[f]->th.th_task_state =
5132 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5133 } else { // set th_task_state for new threads in non-nested hot team
5135 team->t.t_threads[0]->th.th_task_state; // copy master's state
5136 for (f = old_nproc; f < team->t.t_nproc; ++f)
5137 team->t.t_threads[f]->th.th_task_state = old_state;
5141 for (f = 0; f < team->t.t_nproc; ++f) {
5142 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5143 team->t.t_threads[f]->th.th_team_nproc ==
5148 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5149 #if KMP_AFFINITY_SUPPORTED
5150 __kmp_partition_places(team);
5152 } // Check changes in number of threads
5154 kmp_info_t *master = team->t.t_threads[0];
5155 if (master->th.th_teams_microtask) {
5156 for (f = 1; f < new_nproc; ++f) {
5157 // propagate teams construct specific info to workers
5158 kmp_info_t *thr = team->t.t_threads[f];
5159 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5160 thr->th.th_teams_level = master->th.th_teams_level;
5161 thr->th.th_teams_size = master->th.th_teams_size;
5164 #if KMP_NESTED_HOT_TEAMS
5166 // Sync barrier state for nested hot teams, not needed for outermost hot
5168 for (f = 1; f < new_nproc; ++f) {
5169 kmp_info_t *thr = team->t.t_threads[f];
5171 kmp_balign_t *balign = thr->th.th_bar;
5172 for (b = 0; b < bs_last_barrier; ++b) {
5173 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5174 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5176 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5181 #endif // KMP_NESTED_HOT_TEAMS
5183 /* reallocate space for arguments if necessary */
5184 __kmp_alloc_argv_entries(argc, team, TRUE);
5185 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5186 // The hot team re-uses the previous task team,
5187 // if untouched during the previous release->gather phase.
5189 KF_TRACE(10, (" hot_team = %p\n", team));
5192 if (__kmp_tasking_mode != tskm_immediate_exec) {
5193 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5194 "task_team[1] = %p after reinit\n",
5195 team->t.t_task_team[0], team->t.t_task_team[1]));
5200 __ompt_team_assign_id(team, ompt_parallel_data);
5208 /* next, let's try to take one from the team pool */
5210 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5211 /* TODO: consider resizing undersized teams instead of reaping them, now
5212 that we have a resizing mechanism */
5213 if (team->t.t_max_nproc >= max_nproc) {
5214 /* take this team from the team pool */
5215 __kmp_team_pool = team->t.t_next_pool;
5217 /* setup the team for fresh use */
5218 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5220 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5221 "task_team[1] %p to NULL\n",
5222 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5223 team->t.t_task_team[0] = NULL;
5224 team->t.t_task_team[1] = NULL;
5226 /* reallocate space for arguments if necessary */
5227 __kmp_alloc_argv_entries(argc, team, TRUE);
5228 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5231 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5232 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5233 { // Initialize barrier data.
5235 for (b = 0; b < bs_last_barrier; ++b) {
5236 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5238 team->t.t_bar[b].b_master_arrived = 0;
5239 team->t.t_bar[b].b_team_arrived = 0;
5244 team->t.t_proc_bind = new_proc_bind;
5246 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5250 __ompt_team_assign_id(team, ompt_parallel_data);
5258 /* reap team if it is too small, then loop back and check the next one */
5259 // not sure if this is wise, but it will be redone during the hot-teams
5261 /* TODO: Use technique to find the right size hot-team, don't reap them */
5262 team = __kmp_reap_team(team);
5263 __kmp_team_pool = team;
5266 /* nothing available in the pool, no matter, make a new team! */
5268 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5271 team->t.t_max_nproc = max_nproc;
5272 /* NOTE well, for some reason allocating one big buffer and dividing it up
5273 seems to really hurt performance a lot on the P4, so let's not use this */
5274 __kmp_allocate_team_arrays(team, max_nproc);
5276 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5277 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5279 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5281 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5282 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5283 // memory, no need to duplicate
5284 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5285 // memory, no need to duplicate
5287 if (__kmp_storage_map) {
5288 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5291 /* allocate space for arguments */
5292 __kmp_alloc_argv_entries(argc, team, FALSE);
5293 team->t.t_argc = argc;
5296 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5297 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5298 { // Initialize barrier data.
5300 for (b = 0; b < bs_last_barrier; ++b) {
5301 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5303 team->t.t_bar[b].b_master_arrived = 0;
5304 team->t.t_bar[b].b_team_arrived = 0;
5309 team->t.t_proc_bind = new_proc_bind;
5312 __ompt_team_assign_id(team, ompt_parallel_data);
5313 team->t.ompt_serialized_team_info = NULL;
5318 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5324 /* TODO implement hot-teams at all levels */
5325 /* TODO implement lazy thread release on demand (disband request) */
5327 /* free the team. return it to the team pool. release all the threads
5328 * associated with it */
5329 void __kmp_free_team(kmp_root_t *root,
5330 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5332 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5336 KMP_DEBUG_ASSERT(root);
5337 KMP_DEBUG_ASSERT(team);
5338 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5339 KMP_DEBUG_ASSERT(team->t.t_threads);
5341 int use_hot_team = team == root->r.r_hot_team;
5342 #if KMP_NESTED_HOT_TEAMS
5344 kmp_hot_team_ptr_t *hot_teams;
5346 level = team->t.t_active_level - 1;
5347 if (master->th.th_teams_microtask) { // in teams construct?
5348 if (master->th.th_teams_size.nteams > 1) {
5349 ++level; // level was not increased in teams construct for team_of_masters
5352 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5353 master->th.th_teams_level == team->t.t_level) {
5354 ++level; // level was not increased in teams construct for
5355 // team_of_workers before the parallel
5356 } // team->t.t_level will be increased inside parallel
5358 hot_teams = master->th.th_hot_teams;
5359 if (level < __kmp_hot_teams_max_level) {
5360 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5364 #endif // KMP_NESTED_HOT_TEAMS
5366 /* team is done working */
5367 TCW_SYNC_PTR(team->t.t_pkfn,
5368 NULL); // Important for Debugging Support Library.
5370 team->t.t_copyin_counter = 0; // init counter for possible reuse
5372 // Do not reset pointer to parent team to NULL for hot teams.
5374 /* if we are non-hot team, release our threads */
5375 if (!use_hot_team) {
5376 if (__kmp_tasking_mode != tskm_immediate_exec) {
5377 // Wait for threads to reach reapable state
5378 for (f = 1; f < team->t.t_nproc; ++f) {
5379 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5380 kmp_info_t *th = team->t.t_threads[f];
5381 volatile kmp_uint32 *state = &th->th.th_reap_state;
5382 while (*state != KMP_SAFE_TO_REAP) {
5384 // On Windows a thread can be killed at any time, check this
5386 if (!__kmp_is_thread_alive(th, &ecode)) {
5387 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5391 // first check if thread is sleeping
5392 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5393 if (fl.is_sleeping())
5394 fl.resume(__kmp_gtid_from_thread(th));
5399 // Delete task teams
5401 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5402 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5403 if (task_team != NULL) {
5404 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5405 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5406 team->t.t_threads[f]->th.th_task_team = NULL;
5410 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5411 __kmp_get_gtid(), task_team, team->t.t_id));
5412 #if KMP_NESTED_HOT_TEAMS
5413 __kmp_free_task_team(master, task_team);
5415 team->t.t_task_team[tt_idx] = NULL;
5420 // Reset pointer to parent team only for non-hot teams.
5421 team->t.t_parent = NULL;
5422 team->t.t_level = 0;
5423 team->t.t_active_level = 0;
5425 /* free the worker threads */
5426 for (f = 1; f < team->t.t_nproc; ++f) {
5427 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5428 __kmp_free_thread(team->t.t_threads[f]);
5429 team->t.t_threads[f] = NULL;
5432 /* put the team back in the team pool */
5433 /* TODO limit size of team pool, call reap_team if pool too large */
5434 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5435 __kmp_team_pool = (volatile kmp_team_t *)team;
5436 } else { // Check if team was created for the masters in a teams construct
5437 // See if first worker is a CG root
5438 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5439 team->t.t_threads[1]->th.th_cg_roots);
5440 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5441 // Clean up the CG root nodes on workers so that this team can be re-used
5442 for (f = 1; f < team->t.t_nproc; ++f) {
5443 kmp_info_t *thr = team->t.t_threads[f];
5444 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5445 thr->th.th_cg_roots->cg_root == thr);
5446 // Pop current CG root off list
5447 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5448 thr->th.th_cg_roots = tmp->up;
5449 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5450 " up to node %p. cg_nthreads was %d\n",
5451 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5452 int i = tmp->cg_nthreads--;
5454 __kmp_free(tmp); // free CG if we are the last thread in it
5456 // Restore current task's thread_limit from CG root
5457 if (thr->th.th_cg_roots)
5458 thr->th.th_current_task->td_icvs.thread_limit =
5459 thr->th.th_cg_roots->cg_thread_limit;
5467 /* reap the team. destroy it, reclaim all its resources and free its memory */
5468 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5469 kmp_team_t *next_pool = team->t.t_next_pool;
5471 KMP_DEBUG_ASSERT(team);
5472 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5473 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5474 KMP_DEBUG_ASSERT(team->t.t_threads);
5475 KMP_DEBUG_ASSERT(team->t.t_argv);
5477 /* TODO clean the threads that are a part of this? */
5480 __kmp_free_team_arrays(team);
5481 if (team->t.t_argv != &team->t.t_inline_argv[0])
5482 __kmp_free((void *)team->t.t_argv);
5489 // Free the thread. Don't reap it, just place it on the pool of available threads.
5492 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5493 // binding for the affinity mechanism to be useful.
5495 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5496 // However, we want to avoid a potential performance problem by always
5497 // scanning through the list to find the correct point at which to insert
5498 // the thread (potential N**2 behavior). To do this we keep track of the
5499 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5500 // With single-level parallelism, threads will always be added to the tail
5501 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5502 // parallelism, all bets are off and we may need to scan through the entire free list.
5505 // This change also has a potentially large performance benefit, for some
5506 // applications. Previously, as threads were freed from the hot team, they
5507 // would be placed back on the free list in inverse order. If the hot team
5508 // grew back to its original size, then the freed threads would be placed
5509 // back on the hot team in reverse order. This could cause bad cache
5510 // locality problems on programs where the size of the hot team regularly grew and shrank.
5513 // Now, for single-level parallelism, the OMP tid is always == gtid.
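// A minimal sketch of the insertion-point optimization described above, using
// hypothetical gtids (not taken from a real run): if the pool currently holds
// gtids 2 -> 3 -> 5 and __kmp_thread_pool_insert_pt points at the entry for
// gtid 5, then freeing gtid 7 starts the scan at 5 instead of at the list
// head, so with single-level parallelism (threads freed in gtid order) each
// insert is effectively O(1). Only nested parallelism, which can free threads
// out of gtid order, forces the rescan from the head mentioned above.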
5514 void __kmp_free_thread(kmp_info_t *this_th) {
5518 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5519 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5521 KMP_DEBUG_ASSERT(this_th);
5523 // When moving a thread to the pool, switch it to wait on its own b_go flag,
5524 // and give it an uninitialized (NULL) team.
5526 kmp_balign_t *balign = this_th->th.th_bar;
5527 for (b = 0; b < bs_last_barrier; ++b) {
5528 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5529 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5530 balign[b].bb.team = NULL;
5531 balign[b].bb.leaf_kids = 0;
5533 this_th->th.th_task_state = 0;
5534 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5536 /* put thread back on the free pool */
5537 TCW_PTR(this_th->th.th_team, NULL);
5538 TCW_PTR(this_th->th.th_root, NULL);
5539 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5541 while (this_th->th.th_cg_roots) {
5542 this_th->th.th_cg_roots->cg_nthreads--;
5543 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5544 " %p of thread %p to %d\n",
5545 this_th, this_th->th.th_cg_roots,
5546 this_th->th.th_cg_roots->cg_root,
5547 this_th->th.th_cg_roots->cg_nthreads));
5548 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5549 if (tmp->cg_root == this_th) { // Thread is a cg_root
5550 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5552 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5553 this_th->th.th_cg_roots = tmp->up;
5555 } else { // Worker thread
5556 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5559 this_th->th.th_cg_roots = NULL;
5564 /* If the implicit task assigned to this thread can be used by other threads,
5565 * multiple threads can share the data and try to free the task at
5566 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5567 * with higher probability when the hot team is disabled, but it can occur even
5568 * when the hot team is enabled */
5569 __kmp_free_implicit_task(this_th);
5570 this_th->th.th_current_task = NULL;
5572 // If the __kmp_thread_pool_insert_pt is already past the new insert
5573 // point, then we need to re-scan the entire list.
5574 gtid = this_th->th.th_info.ds.ds_gtid;
5575 if (__kmp_thread_pool_insert_pt != NULL) {
5576 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5577 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5578 __kmp_thread_pool_insert_pt = NULL;
5582 // Scan down the list to find the place to insert the thread.
5583 // scan is the address of a link in the list, possibly the address of
5584 // __kmp_thread_pool itself.
5586 // In the absence of nested parallelism, the for loop will have 0 iterations.
5587 if (__kmp_thread_pool_insert_pt != NULL) {
5588 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5590 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5592 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5593 scan = &((*scan)->th.th_next_pool))
5596 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt to its address.
5598 TCW_PTR(this_th->th.th_next_pool, *scan);
5599 __kmp_thread_pool_insert_pt = *scan = this_th;
5600 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5601 (this_th->th.th_info.ds.ds_gtid <
5602 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5603 TCW_4(this_th->th.th_in_pool, TRUE);
5604 __kmp_suspend_initialize_thread(this_th);
5605 __kmp_lock_suspend_mx(this_th);
5606 if (this_th->th.th_active == TRUE) {
5607 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5608 this_th->th.th_active_in_pool = TRUE;
5612 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5615 __kmp_unlock_suspend_mx(this_th);
5617 TCW_4(__kmp_nth, __kmp_nth - 1);
5619 #ifdef KMP_ADJUST_BLOCKTIME
5620 /* Adjust blocktime back to user setting or default if necessary */
5621 /* Middle initialization might never have occurred */
5622 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5623 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5624 if (__kmp_nth <= __kmp_avail_proc) {
5625 __kmp_zero_bt = FALSE;
5628 #endif /* KMP_ADJUST_BLOCKTIME */
5633 /* ------------------------------------------------------------------------ */
5635 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5636 int gtid = this_thr->th.th_info.ds.ds_gtid;
5637 /* void *stack_data;*/
5638 kmp_team_t **volatile pteam;
5641 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5643 if (__kmp_env_consistency_check) {
5644 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5648 ompt_data_t *thread_data;
5649 if (ompt_enabled.enabled) {
5650 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5651 *thread_data = ompt_data_none;
5653 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5654 this_thr->th.ompt_thread_info.wait_id = 0;
5655 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5656 this_thr->th.ompt_thread_info.parallel_flags = 0;
5657 if (ompt_enabled.ompt_callback_thread_begin) {
5658 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5659 ompt_thread_worker, thread_data);
5661 this_thr->th.ompt_thread_info.state = ompt_state_idle;
5665 /* This is the place where threads wait for work */
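/* Summary of the loop below: a worker sleeps or spins in the fork barrier
   until a master hands it a team, runs that team's microtask via t_invoke,
   meets the team again in the join barrier, and then goes back to waiting.
   The loop exits only when __kmp_global.g.g_done is set during shutdown. */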
5666 while (!TCR_4(__kmp_global.g.g_done)) {
5667 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5670 /* wait for work to do */
5671 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5673 /* No tid yet since not part of a team */
5674 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5677 if (ompt_enabled.enabled) {
5678 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5682 pteam = &this_thr->th.th_team;
5684 /* have we been allocated? */
5685 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5686 /* we were just woken up, so run our new task */
5687 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5690 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5691 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5692 (*pteam)->t.t_pkfn));
5694 updateHWFPControl(*pteam);
5697 if (ompt_enabled.enabled) {
5698 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5702 rc = (*pteam)->t.t_invoke(gtid);
5706 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5707 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5708 (*pteam)->t.t_pkfn));
5711 if (ompt_enabled.enabled) {
5712 /* no frame set while outside task */
5713 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5715 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5718 /* join barrier after parallel region */
5719 __kmp_join_barrier(gtid);
5722 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5725 if (ompt_enabled.ompt_callback_thread_end) {
5726 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5730 this_thr->th.th_task_team = NULL;
5731 /* run the destructors for the threadprivate data for this thread */
5732 __kmp_common_destroy_gtid(gtid);
5734 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5739 /* ------------------------------------------------------------------------ */
5741 void __kmp_internal_end_dest(void *specific_gtid) {
5742 #if KMP_COMPILER_ICC
5743 #pragma warning(push)
5744 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose significant bits
5747 // Make sure no significant bits are lost
5748 int gtid = (kmp_intptr_t)specific_gtid - 1;
5749 #if KMP_COMPILER_ICC
5750 #pragma warning(pop)
5753 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5754 /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5755 * this is because 0 is reserved for the nothing-stored case */
5757 /* josh: One reason for setting the gtid specific data even when it is being
5758 destroyed by pthread is to allow gtid lookup through thread specific data
5759 (__kmp_gtid_get_specific). Some of the code, especially stat code,
5760 that gets executed in the call to __kmp_internal_end_thread, actually
5761 gets the gtid through the thread specific data. Setting it here seems
5762 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread to run smoothly.
5764 todo: get rid of this after we remove the dependence on
5765 __kmp_gtid_get_specific */
5766 if (gtid >= 0 && KMP_UBER_GTID(gtid))
5767 __kmp_gtid_set_specific(gtid);
5768 #ifdef KMP_TDATA_GTID
5771 __kmp_internal_end_thread(gtid);
5774 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5776 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5777 __kmp_internal_end_atexit();
5782 /* [Windows] josh: when the atexit handler is called, there may still be more
5783 than one thread alive */
5784 void __kmp_internal_end_atexit(void) {
5785 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5787 josh: ideally, we want to completely shutdown the library in this atexit
5788 handler, but stat code that depends on thread specific data for gtid fails
5789 because that data becomes unavailable at some point during the shutdown, so
5790 we call __kmp_internal_end_thread instead. We should eventually remove the
5791 dependency on __kmp_get_specific_gtid in the stat code and use
5792 __kmp_internal_end_library to cleanly shutdown the library.
5794 // TODO: Can some of this comment about GVS be removed?
5795 I suspect that the offending stat code is executed when the calling thread
5796 tries to clean up a dead root thread's data structures, resulting in GVS
5797 code trying to close the GVS structures for that thread, but since the stat
5798 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5799 the calling thread is cleaning up itself instead of another thread, it gets
5800 confused. This happens because allowing a thread to unregister and cleanup
5801 another thread is a recent modification for addressing an issue.
5802 Based on the current design (20050722), a thread may end up
5803 trying to unregister another thread only if thread death does not trigger
5804 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5805 thread specific data destructor function to detect thread death. For
5806 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5807 is nothing. Thus, the workaround is applicable only for the Windows static library.
5809 __kmp_internal_end_library(-1);
5811 __kmp_close_console();
5815 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5816 // It is assumed __kmp_forkjoin_lock is acquired.
5820 KMP_DEBUG_ASSERT(thread != NULL);
5822 gtid = thread->th.th_info.ds.ds_gtid;
5825 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5826 /* Assume the threads are at the fork barrier here */
5828 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5830 /* Need release fence here to prevent seg faults for tree forkjoin barrier */
5832 ANNOTATE_HAPPENS_BEFORE(thread);
5833 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5834 __kmp_release_64(&flag);
5837 // Terminate OS thread.
5838 __kmp_reap_worker(thread);
5840 // The thread was killed asynchronously. If it was actively
5841 // spinning in the thread pool, decrement the global count.
5843 // There is a small timing hole here - if the worker thread was just waking
5844 // up after sleeping in the pool, had reset its th_active_in_pool flag but
5845 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5846 // the global counter might not get updated.
5848 // Currently, this can only happen as the library is unloaded,
5849 // so there are no harmful side effects.
5850 if (thread->th.th_active_in_pool) {
5851 thread->th.th_active_in_pool = FALSE;
5852 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5853 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5857 __kmp_free_implicit_task(thread);
5859 // Free the fast memory for tasking
5861 __kmp_free_fast_memory(thread);
5862 #endif /* USE_FAST_MEMORY */
5864 __kmp_suspend_uninitialize_thread(thread);
5866 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5867 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5870 // __kmp_nth was decremented when the thread was added to the pool.
5872 #ifdef KMP_ADJUST_BLOCKTIME
5873 /* Adjust blocktime back to user setting or default if necessary */
5874 /* Middle initialization might never have occurred */
5875 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5876 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5877 if (__kmp_nth <= __kmp_avail_proc) {
5878 __kmp_zero_bt = FALSE;
5881 #endif /* KMP_ADJUST_BLOCKTIME */
5883 /* free the memory being used */
5884 if (__kmp_env_consistency_check) {
5885 if (thread->th.th_cons) {
5886 __kmp_free_cons_stack(thread->th.th_cons);
5887 thread->th.th_cons = NULL;
5891 if (thread->th.th_pri_common != NULL) {
5892 __kmp_free(thread->th.th_pri_common);
5893 thread->th.th_pri_common = NULL;
5896 if (thread->th.th_task_state_memo_stack != NULL) {
5897 __kmp_free(thread->th.th_task_state_memo_stack);
5898 thread->th.th_task_state_memo_stack = NULL;
5902 if (thread->th.th_local.bget_data != NULL) {
5903 __kmp_finalize_bget(thread);
5907 #if KMP_AFFINITY_SUPPORTED
5908 if (thread->th.th_affin_mask != NULL) {
5909 KMP_CPU_FREE(thread->th.th_affin_mask);
5910 thread->th.th_affin_mask = NULL;
5912 #endif /* KMP_AFFINITY_SUPPORTED */
5914 #if KMP_USE_HIER_SCHED
5915 if (thread->th.th_hier_bar_data != NULL) {
5916 __kmp_free(thread->th.th_hier_bar_data);
5917 thread->th.th_hier_bar_data = NULL;
5921 __kmp_reap_team(thread->th.th_serial_team);
5922 thread->th.th_serial_team = NULL;
5927 } // __kmp_reap_thread
5929 static void __kmp_internal_end(void) {
5932 /* First, unregister the library */
5933 __kmp_unregister_library();
5936 /* In Win static library, we can't tell when a root actually dies, so we
5937 reclaim the data structures for any root threads that have died but not
5938 unregistered themselves, in order to shut down cleanly.
5939 In Win dynamic library we also can't tell when a thread dies. */
5940 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5944 for (i = 0; i < __kmp_threads_capacity; i++)
5946 if (__kmp_root[i]->r.r_active)
5948 KMP_MB(); /* Flush all pending memory write invalidates. */
5949 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5951 if (i < __kmp_threads_capacity) {
5953 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5954 KMP_MB(); /* Flush all pending memory write invalidates. */
5956 // Need to check that monitor was initialized before reaping it. If we are
5957 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
5958 // __kmp_monitor will appear to contain valid data, but it is only valid in
5959 // the parent process, not the child.
5960 // New behavior (201008): instead of keying off of the flag
5961 // __kmp_init_parallel, the monitor thread creation is keyed off
5962 // of the new flag __kmp_init_monitor.
5963 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
5964 if (TCR_4(__kmp_init_monitor)) {
5965 __kmp_reap_monitor(&__kmp_monitor);
5966 TCW_4(__kmp_init_monitor, 0);
5968 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
5969 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
5970 #endif // KMP_USE_MONITOR
5972 /* TODO move this to cleanup code */
5974 /* make sure that everything has properly ended */
5975 for (i = 0; i < __kmp_threads_capacity; i++) {
5976 if (__kmp_root[i]) {
5977 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
5978 // there can be uber threads alive here
5979 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
5986 // Reap the worker threads.
5987 // This is valid for now, but be careful if threads are reaped sooner.
5988 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
5989 // Get the next thread from the pool.
5990 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
5991 __kmp_thread_pool = thread->th.th_next_pool;
5993 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5994 thread->th.th_next_pool = NULL;
5995 thread->th.th_in_pool = FALSE;
5996 __kmp_reap_thread(thread, 0);
5998 __kmp_thread_pool_insert_pt = NULL;
6001 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6002 // Get the next team from the pool.
6003 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6004 __kmp_team_pool = team->t.t_next_pool;
6006 team->t.t_next_pool = NULL;
6007 __kmp_reap_team(team);
6010 __kmp_reap_task_teams();
6013 // Threads that are not reaped should not access any resources since they
6014 // are going to be deallocated soon, so the shutdown sequence should wait
6015 // until all threads either exit the final spin-waiting loop or begin
6016 // sleeping after the given blocktime.
6017 for (i = 0; i < __kmp_threads_capacity; i++) {
6018 kmp_info_t *thr = __kmp_threads[i];
6019 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6024 for (i = 0; i < __kmp_threads_capacity; ++i) {
6025 // TBD: Add some checking...
6026 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6029 /* Make sure all threadprivate destructors get run by joining with all
6030 worker threads before resetting this flag */
6031 TCW_SYNC_4(__kmp_init_common, FALSE);
6033 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6037 // See note above: One of the possible fixes for CQ138434 / CQ140126
6039 // FIXME: push both code fragments down and CSE them?
6040 // push them into __kmp_cleanup() ?
6041 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6042 if (TCR_4(__kmp_init_monitor)) {
6043 __kmp_reap_monitor(&__kmp_monitor);
6044 TCW_4(__kmp_init_monitor, 0);
6046 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6047 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6049 } /* else !__kmp_global.t_active */
6050 TCW_4(__kmp_init_gtid, FALSE);
6051 KMP_MB(); /* Flush all pending memory write invalidates. */
6059 void __kmp_internal_end_library(int gtid_req) {
6060 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6061 /* this shouldn't be a race condition because __kmp_internal_end() is the
6062 only place to clear __kmp_serial_init */
6063 /* we'll check this later too, after we get the lock */
6064 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6065 // redundant, because the next check will work in any case.
6066 if (__kmp_global.g.g_abort) {
6067 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6071 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6072 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6076 KMP_MB(); /* Flush all pending memory write invalidates. */
6078 /* find out who we are and what we should do */
6080 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6082 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6083 if (gtid == KMP_GTID_SHUTDOWN) {
6084 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6085 "already shutdown\n"));
6087 } else if (gtid == KMP_GTID_MONITOR) {
6088 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6089 "registered, or system shutdown\n"));
6091 } else if (gtid == KMP_GTID_DNE) {
6092 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6094 /* we don't know who we are, but we may still shutdown the library */
6095 } else if (KMP_UBER_GTID(gtid)) {
6096 /* unregister ourselves as an uber thread. gtid is no longer valid */
6097 if (__kmp_root[gtid]->r.r_active) {
6098 __kmp_global.g.g_abort = -1;
6099 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6101 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6107 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6108 __kmp_unregister_root_current_thread(gtid);
6111 /* worker threads may call this function through the atexit handler, if they call exit() */
6113 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6114 TODO: do a thorough shutdown instead */
6115 #ifdef DUMP_DEBUG_ON_EXIT
6116 if (__kmp_debug_buf)
6117 __kmp_dump_debug_buffer();
6122 /* synchronize the termination process */
6123 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6125 /* have we already finished */
6126 if (__kmp_global.g.g_abort) {
6127 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6129 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6132 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6133 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6137 /* We need this lock to enforce mutex between this reading of
6138 __kmp_threads_capacity and the writing by __kmp_register_root.
6139 Alternatively, we can use a counter of roots that is atomically updated by
6140 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6141 __kmp_internal_end_*. */
6142 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6144 /* now we can safely conduct the actual termination */
6145 __kmp_internal_end();
6147 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6148 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6150 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6152 #ifdef DUMP_DEBUG_ON_EXIT
6153 if (__kmp_debug_buf)
6154 __kmp_dump_debug_buffer();
6158 __kmp_close_console();
6161 __kmp_fini_allocator();
6163 } // __kmp_internal_end_library
6165 void __kmp_internal_end_thread(int gtid_req) {
6168 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6169 /* this shouldn't be a race condition because __kmp_internal_end() is the
6170 * only place to clear __kmp_serial_init */
6171 /* we'll check this later too, after we get the lock */
6172 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6173 // redundant, because the next check will work in any case.
6174 if (__kmp_global.g.g_abort) {
6175 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6179 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6180 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6184 KMP_MB(); /* Flush all pending memory write invalidates. */
6186 /* find out who we are and what we should do */
6188 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6190 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6191 if (gtid == KMP_GTID_SHUTDOWN) {
6192 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6193 "already shutdown\n"));
6195 } else if (gtid == KMP_GTID_MONITOR) {
6196 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6197 "registered, or system shutdown\n"));
6199 } else if (gtid == KMP_GTID_DNE) {
6200 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6203 /* we don't know who we are */
6204 } else if (KMP_UBER_GTID(gtid)) {
6205 /* unregister ourselves as an uber thread. gtid is no longer valid */
6206 if (__kmp_root[gtid]->r.r_active) {
6207 __kmp_global.g.g_abort = -1;
6208 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6210 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6214 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6216 __kmp_unregister_root_current_thread(gtid);
6219 /* just a worker thread, let's leave */
6220 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6223 __kmp_threads[gtid]->th.th_task_team = NULL;
6227 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6233 if (__kmp_pause_status != kmp_hard_paused)
6234 // AC: let's not shut down the dynamic library at the exit of the uber thread,
6235 // because it is better to shut down later in the library destructor.
6237 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6241 /* synchronize the termination process */
6242 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6244 /* have we already finished */
6245 if (__kmp_global.g.g_abort) {
6246 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6248 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6251 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6252 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6256 /* We need this lock to enforce mutex between this reading of
6257 __kmp_threads_capacity and the writing by __kmp_register_root.
6258 Alternatively, we can use a counter of roots that is atomically updated by
6259 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6260 __kmp_internal_end_*. */
6262 /* should we finish the run-time? are all siblings done? */
6263 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6265 for (i = 0; i < __kmp_threads_capacity; ++i) {
6266 if (KMP_UBER_GTID(i)) {
6269 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6270 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6271 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6276 /* now we can safely conduct the actual termination */
6278 __kmp_internal_end();
6280 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6281 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6283 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6285 #ifdef DUMP_DEBUG_ON_EXIT
6286 if (__kmp_debug_buf)
6287 __kmp_dump_debug_buffer();
6289 } // __kmp_internal_end_thread
6291 // -----------------------------------------------------------------------------
6292 // Library registration stuff.
6294 static long __kmp_registration_flag = 0;
6295 // Random value used to indicate library initialization.
6296 static char *__kmp_registration_str = NULL;
6297 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6299 static inline char *__kmp_reg_status_name() {
6300 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6301 each thread. If registration and unregistration go in different threads
6302 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6303 env var cannot be found, because the name will contain a different pid. */
6304 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6305 } // __kmp_reg_status_name
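// Illustration only (hypothetical pid, flag value, and file name): for a
// process with pid 12345 the name produced above is
//   __KMP_REGISTERED_LIB_12345
// and __kmp_register_library_startup() below stores a value of the form
//   "<&__kmp_registration_flag>-<flag value in hex>-<library file>"
// e.g. "0x7f12a4c01230-cafe5678-libomp.so". A second copy of the runtime
// parses this value to decide whether the copy that registered it is still
// alive (address mapped and still holding the flag value) or already dead.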
6307 void __kmp_register_library_startup(void) {
6309 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6315 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6316 __kmp_initialize_system_tick();
6318 __kmp_read_system_time(&time.dtime);
6319 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6320 __kmp_registration_str =
6321 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6322 __kmp_registration_flag, KMP_LIBRARY_FILE);
6324 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6325 __kmp_registration_str));
6329 char *value = NULL; // Actual value of the environment variable.
6331 // Set environment variable, but do not overwrite if it already exists.
6332 __kmp_env_set(name, __kmp_registration_str, 0);
6333 // Check that the variable was actually written.
6334 value = __kmp_env_get(name);
6335 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6337 done = 1; // Ok, environment variable set successfully, exit the loop.
6341 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6342 // Check whether it is alive or dead.
6343 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6345 char *flag_addr_str = NULL;
6346 char *flag_val_str = NULL;
6347 char const *file_name = NULL;
6348 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6349 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6352 long *flag_addr = 0;
6354 KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6355 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6356 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6357 // First, check whether the environment-encoded address is mapped into the address space.
6359 // If so, dereference it to see if it still has the right value.
6360 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6363 // If not, then we know the other copy of the library is no longer running.
6370 case 0: // Cannot parse environment variable -- neighbor status unknown.
6371 // Assume it is the incompatible format of a future version of the
6372 // library. Assume the other library is alive.
6373 // WARN( ... ); // TODO: Issue a warning.
6374 file_name = "unknown library";
6376 // Attention! Falling through to the next case. That's intentional.
6377 case 1: { // Neighbor is alive.
6378 // Check whether it is allowed.
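// (A user can explicitly permit the duplicate by setting KMP_DUPLICATE_LIB_OK
// to a "true" value in the environment; otherwise the fatal error below is
// issued.)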
6379 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6380 if (!__kmp_str_match_true(duplicate_ok)) {
6381 // That's not allowed. Issue fatal error.
6382 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6383 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6385 KMP_INTERNAL_FREE(duplicate_ok);
6386 __kmp_duplicate_library_ok = 1;
6387 done = 1; // Exit the loop.
6389 case 2: { // Neighbor is dead.
6390 // Clear the variable and try to register library again.
6391 __kmp_env_unset(name);
6393 default: { KMP_DEBUG_ASSERT(0); } break;
6396 KMP_INTERNAL_FREE((void *)value);
6398 KMP_INTERNAL_FREE((void *)name);
6400 } // func __kmp_register_library_startup
6402 void __kmp_unregister_library(void) {
6404 char *name = __kmp_reg_status_name();
6405 char *value = __kmp_env_get(name);
6407 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6408 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6409 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6410 // Ok, this is our variable. Delete it.
6411 __kmp_env_unset(name);
6414 KMP_INTERNAL_FREE(__kmp_registration_str);
6415 KMP_INTERNAL_FREE(value);
6416 KMP_INTERNAL_FREE(name);
6418 __kmp_registration_flag = 0;
6419 __kmp_registration_str = NULL;
6421 } // __kmp_unregister_library
6423 // End of Library registration stuff.
6424 // -----------------------------------------------------------------------------
6426 #if KMP_MIC_SUPPORTED
6428 static void __kmp_check_mic_type() {
6429 kmp_cpuid_t cpuid_state = {0};
6430 kmp_cpuid_t *cs_p = &cpuid_state;
6431 __kmp_x86_cpuid(1, 0, cs_p);
6432 // We don't support mic1 at the moment
6433 if ((cs_p->eax & 0xff0) == 0xB10) {
6434 __kmp_mic_type = mic2;
6435 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6436 __kmp_mic_type = mic3;
6438 __kmp_mic_type = non_mic;
6442 #endif /* KMP_MIC_SUPPORTED */
6444 static void __kmp_do_serial_initialize(void) {
6448 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6450 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6451 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6452 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6453 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6454 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6460 __kmp_validate_locks();
6462 /* Initialize internal memory allocator */
6463 __kmp_init_allocator();
6465 /* Register the library startup via an environment variable and check to see
6466 whether another copy of the library is already registered. */
6468 __kmp_register_library_startup();
6470 /* TODO reinitialization of library */
6471 if (TCR_4(__kmp_global.g.g_done)) {
6472 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6475 __kmp_global.g.g_abort = 0;
6476 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6478 /* initialize the locks */
6479 #if KMP_USE_ADAPTIVE_LOCKS
6480 #if KMP_DEBUG_ADAPTIVE_LOCKS
6481 __kmp_init_speculative_stats();
6484 #if KMP_STATS_ENABLED
6487 __kmp_init_lock(&__kmp_global_lock);
6488 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6489 __kmp_init_lock(&__kmp_debug_lock);
6490 __kmp_init_atomic_lock(&__kmp_atomic_lock);
6491 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6492 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6493 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6494 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6495 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6496 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6497 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6498 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6499 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6500 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6501 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6502 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6503 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6504 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6506 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6508 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6510 /* conduct initialization and initial setup of configuration */
6512 __kmp_runtime_initialize();
6514 #if KMP_MIC_SUPPORTED
6515 __kmp_check_mic_type();
6518 // Some global variable initialization moved here from kmp_env_initialize()
6522 __kmp_abort_delay = 0;
6524 // From __kmp_init_dflt_team_nth()
6525 /* assume the entire machine will be used */
6526 __kmp_dflt_team_nth_ub = __kmp_xproc;
6527 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6528 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6530 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6531 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6533 __kmp_max_nth = __kmp_sys_max_nth;
6534 __kmp_cg_max_nth = __kmp_sys_max_nth;
6535 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6536 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6537 __kmp_teams_max_nth = __kmp_sys_max_nth;
6540 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6542 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6544 __kmp_monitor_wakeups =
6545 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6546 __kmp_bt_intervals =
6547 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6549 // From "KMP_LIBRARY" part of __kmp_env_initialize()
6550 __kmp_library = library_throughput;
6551 // From KMP_SCHEDULE initialization
6552 __kmp_static = kmp_sch_static_balanced;
6553 // AC: do not use analytical here, because it is non-monotonous
6554 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6555 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6556 // need to repeat assignment
6557 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6558 // bit control and barrier method control parts
6559 #if KMP_FAST_REDUCTION_BARRIER
6560 #define kmp_reduction_barrier_gather_bb ((int)1)
6561 #define kmp_reduction_barrier_release_bb ((int)1)
6562 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6563 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6564 #endif // KMP_FAST_REDUCTION_BARRIER
6565 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6566 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6567 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6568 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6569 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6570 #if KMP_FAST_REDUCTION_BARRIER
6571 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6572 // lin_64 ): hyper,1
6573 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6574 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6575 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6576 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6578 #endif // KMP_FAST_REDUCTION_BARRIER
6580 #if KMP_FAST_REDUCTION_BARRIER
6581 #undef kmp_reduction_barrier_release_pat
6582 #undef kmp_reduction_barrier_gather_pat
6583 #undef kmp_reduction_barrier_release_bb
6584 #undef kmp_reduction_barrier_gather_bb
6585 #endif // KMP_FAST_REDUCTION_BARRIER
6586 #if KMP_MIC_SUPPORTED
6587 if (__kmp_mic_type == mic2) { // KNC
6588 // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6589 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6590 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6591 1; // forkjoin release
6592 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6593 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6595 #if KMP_FAST_REDUCTION_BARRIER
6596 if (__kmp_mic_type == mic2) { // KNC
6597 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6598 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6600 #endif // KMP_FAST_REDUCTION_BARRIER
6601 #endif // KMP_MIC_SUPPORTED
6603 // From KMP_CHECKS initialization
6605 __kmp_env_checks = TRUE; /* development versions have the extra checks */
6607 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6610 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6611 __kmp_foreign_tp = TRUE;
6613 __kmp_global.g.g_dynamic = FALSE;
6614 __kmp_global.g.g_dynamic_mode = dynamic_default;
6616 __kmp_env_initialize(NULL);
6618 // Print all messages in message catalog for testing purposes.
6620 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6621 if (__kmp_str_match_true(val)) {
6622 kmp_str_buf_t buffer;
6623 __kmp_str_buf_init(&buffer);
6624 __kmp_i18n_dump_catalog(&buffer);
6625 __kmp_printf("%s", buffer.str);
6626 __kmp_str_buf_free(&buffer);
6628 __kmp_env_free(&val);
6631 __kmp_threads_capacity =
6632 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6633 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6634 __kmp_tp_capacity = __kmp_default_tp_capacity(
6635 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6637 // If the library is shut down properly, both pools must be NULL. Just in
6638 // case, set them to NULL -- some memory may leak, but subsequent code will
6639 // work even if pools are not freed.
6640 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6641 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6642 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6643 __kmp_thread_pool = NULL;
6644 __kmp_thread_pool_insert_pt = NULL;
6645 __kmp_team_pool = NULL;
6647 /* Allocate all of the variable sized records */
6648 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6650 /* Since allocation is cache-aligned, just add extra padding at the end */
6652 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6654 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6655 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6656 sizeof(kmp_info_t *) * __kmp_threads_capacity);
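// Layout of the single allocation above (one cache-aligned block):
//   [ kmp_info_t* x __kmp_threads_capacity | kmp_root_t* x __kmp_threads_capacity | padding ]
// __kmp_threads points at the start of the block and __kmp_root at the first
// byte past the kmp_info_t* array, so both directories share one allocation;
// the extra padding at the end only covers alignment.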
6658 /* init thread counts */
6659 KMP_DEBUG_ASSERT(__kmp_all_nth ==
6660 0); // Asserts fail if the library is reinitializing and
6661 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6665 /* setup the uber master thread and hierarchy */
6666 gtid = __kmp_register_root(TRUE);
6667 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6668 KMP_ASSERT(KMP_UBER_GTID(gtid));
6669 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6671 KMP_MB(); /* Flush all pending memory write invalidates. */
6673 __kmp_common_initialize();
6676 /* invoke the child fork handler */
6677 __kmp_register_atfork();
6680 #if !KMP_DYNAMIC_LIB
6682 /* Invoke the exit handler when the program finishes, only for static
6683 library. For dynamic library, we already have _fini and DllMain. */
6684 int rc = atexit(__kmp_internal_end_atexit);
6686 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6692 #if KMP_HANDLE_SIGNALS
6694 /* NOTE: make sure that this is called before the user installs their own
6695 signal handlers so that the user handlers are called first. this way they
6696 can return false, not call our handler, avoid terminating the library, and
6697 continue execution where they left off. */
6698 __kmp_install_signals(FALSE);
6699 #endif /* KMP_OS_UNIX */
6701 __kmp_install_signals(TRUE);
6702 #endif /* KMP_OS_WINDOWS */
6705 /* we have finished the serial initialization */
6706 __kmp_init_counter++;
6708 __kmp_init_serial = TRUE;
6710 if (__kmp_settings) {
6714 if (__kmp_display_env || __kmp_display_env_verbose) {
6715 __kmp_env_print_2();
6724 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6727 void __kmp_serial_initialize(void) {
6728 if (__kmp_init_serial) {
6731 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6732 if (__kmp_init_serial) {
6733 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6736 __kmp_do_serial_initialize();
6737 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6740 static void __kmp_do_middle_initialize(void) {
6742 int prev_dflt_team_nth;
6744 if (!__kmp_init_serial) {
6745 __kmp_do_serial_initialize();
6748 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6750 // Save the previous value for the __kmp_dflt_team_nth so that
6751 // we can avoid some reinitialization if it hasn't changed.
6752 prev_dflt_team_nth = __kmp_dflt_team_nth;
6754 #if KMP_AFFINITY_SUPPORTED
6755 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6756 // number of cores on the machine.
6757 __kmp_affinity_initialize();
6759 // Run through the __kmp_threads array and set the affinity mask
6760 // for each root thread that is currently registered with the RTL.
6761 for (i = 0; i < __kmp_threads_capacity; i++) {
6762 if (TCR_PTR(__kmp_threads[i]) != NULL) {
6763 __kmp_affinity_set_init_mask(i, TRUE);
6766 #endif /* KMP_AFFINITY_SUPPORTED */
6768 KMP_ASSERT(__kmp_xproc > 0);
6769 if (__kmp_avail_proc == 0) {
6770 __kmp_avail_proc = __kmp_xproc;
6773 // If there were empty places in the num_threads list (OMP_NUM_THREADS=,,2,3), correct them now.
6776 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6777 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6782 if (__kmp_dflt_team_nth == 0) {
6783 #ifdef KMP_DFLT_NTH_CORES
6784 // Default #threads = #cores
6785 __kmp_dflt_team_nth = __kmp_ncores;
6786 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6787 "__kmp_ncores (%d)\n",
6788 __kmp_dflt_team_nth));
6790 // Default #threads = #available OS procs
6791 __kmp_dflt_team_nth = __kmp_avail_proc;
6792 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6793 "__kmp_avail_proc(%d)\n",
6794 __kmp_dflt_team_nth));
6795 #endif /* KMP_DFLT_NTH_CORES */
6798 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6799 __kmp_dflt_team_nth = KMP_MIN_NTH;
6801 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6802 __kmp_dflt_team_nth = __kmp_sys_max_nth;
6805 // There's no harm in continuing if the following check fails,
6806 // but it indicates an error in the previous logic.
6807 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6809 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6810 // Run through the __kmp_threads array and set the num threads icv for each
6811 // root thread that is currently registered with the RTL (which has not
6812 // already explicitly set its nthreads-var with a call to
6813 // omp_set_num_threads()).
6814 for (i = 0; i < __kmp_threads_capacity; i++) {
6815 kmp_info_t *thread = __kmp_threads[i];
6818 if (thread->th.th_current_task->td_icvs.nproc != 0)
6821 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6826 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6827 __kmp_dflt_team_nth));
6829 #ifdef KMP_ADJUST_BLOCKTIME
6830 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6831 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6832 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6833 if (__kmp_nth > __kmp_avail_proc) {
6834 __kmp_zero_bt = TRUE;
6837 #endif /* KMP_ADJUST_BLOCKTIME */
6839 /* we have finished middle initialization */
6840 TCW_SYNC_4(__kmp_init_middle, TRUE);
6842 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6845 void __kmp_middle_initialize(void) {
6846 if (__kmp_init_middle) {
6849 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6850 if (__kmp_init_middle) {
6851 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6854 __kmp_do_middle_initialize();
6855 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6858 void __kmp_parallel_initialize(void) {
6859 int gtid = __kmp_entry_gtid(); // this might be a new root
6861 /* synchronize parallel initialization (for sibling) */
6862 if (TCR_4(__kmp_init_parallel))
6864 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6865 if (TCR_4(__kmp_init_parallel)) {
6866 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6870 /* TODO reinitialization after we have already shut down */
6871 if (TCR_4(__kmp_global.g.g_done)) {
6874 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6875 __kmp_infinite_loop();
6878 /* jc: The lock __kmp_initz_lock is already held, so calling
6879 __kmp_serial_initialize would cause a deadlock. So we call
6880 __kmp_do_serial_initialize directly. */
6881 if (!__kmp_init_middle) {
6882 __kmp_do_middle_initialize();
6884 __kmp_resume_if_hard_paused();
6886 /* begin initialization */
6887 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6888 KMP_ASSERT(KMP_UBER_GTID(gtid));
6890 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6891 // Save the FP control regs.
6892 // Worker threads will set theirs to these values at thread startup.
6893 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6894 __kmp_store_mxcsr(&__kmp_init_mxcsr);
6895 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6896 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6899 #if KMP_HANDLE_SIGNALS
6900 /* must be after __kmp_serial_initialize */
6901 __kmp_install_signals(TRUE);
6905 __kmp_suspend_initialize();
6907 #if defined(USE_LOAD_BALANCE)
6908 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6909 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6912 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6913 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6917 if (__kmp_version) {
6918 __kmp_print_version_2();
6921 /* we have finished parallel initialization */
6922 TCW_SYNC_4(__kmp_init_parallel, TRUE);
6925 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6927 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6930 /* ------------------------------------------------------------------------ */
6932 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6934 kmp_disp_t *dispatch;
6938 /* none of the threads have encountered any constructs, yet. */
6939 this_thr->th.th_local.this_construct = 0;
6940 #if KMP_CACHE_MANAGE
6941 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
6942 #endif /* KMP_CACHE_MANAGE */
6943 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6944 KMP_DEBUG_ASSERT(dispatch);
6945 KMP_DEBUG_ASSERT(team->t.t_dispatch);
6946 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
6947 // this_thr->th.th_info.ds.ds_tid ] );
6949 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6950 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
6951 if (__kmp_env_consistency_check)
6952 __kmp_push_parallel(gtid, team->t.t_ident);
6954 KMP_MB(); /* Flush all pending memory write invalidates. */
6957 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6959 if (__kmp_env_consistency_check)
6960 __kmp_pop_parallel(gtid, team->t.t_ident);
6962 __kmp_finish_implicit_task(this_thr);
6965 int __kmp_invoke_task_func(int gtid) {
6967 int tid = __kmp_tid_from_gtid(gtid);
6968 kmp_info_t *this_thr = __kmp_threads[gtid];
6969 kmp_team_t *team = this_thr->th.th_team;
6971 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
6973 if (__itt_stack_caller_create_ptr) {
6974 __kmp_itt_stack_callee_enter(
6976 team->t.t_stack_id); // inform ittnotify about entering user's code
6978 #endif /* USE_ITT_BUILD */
6979 #if INCLUDE_SSC_MARKS
6980 SSC_MARK_INVOKING();
6985 void **exit_frame_p;
6986 ompt_data_t *my_task_data;
6987 ompt_data_t *my_parallel_data;
6990 if (ompt_enabled.enabled) {
6992 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
6994 exit_frame_p = &dummy;
6998 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
6999 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7000 if (ompt_enabled.ompt_callback_implicit_task) {
7001 ompt_team_size = team->t.t_nproc;
7002 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7003 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7004 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7005 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7009 #if KMP_STATS_ENABLED
7010 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7011 if (previous_state == stats_state_e::TEAMS_REGION) {
7012 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7014 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7016 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7019 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7020 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7027 *exit_frame_p = NULL;
7028 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7031 #if KMP_STATS_ENABLED
7032 if (previous_state == stats_state_e::TEAMS_REGION) {
7033 KMP_SET_THREAD_STATE(previous_state);
7035 KMP_POP_PARTITIONED_TIMER();
7039 if (__itt_stack_caller_create_ptr) {
7040 __kmp_itt_stack_callee_leave(
7042 team->t.t_stack_id); // inform ittnotify about leaving user's code
7044 #endif /* USE_ITT_BUILD */
7045 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7050 void __kmp_teams_master(int gtid) {
7051 // This routine is called by all master threads in teams construct
7052 kmp_info_t *thr = __kmp_threads[gtid];
7053 kmp_team_t *team = thr->th.th_team;
7054 ident_t *loc = team->t.t_ident;
7055 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7056 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7057 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7058 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7059 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7061 // This thread is a new CG root. Set up the proper variables.
7062 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7063 tmp->cg_root = thr; // Make thr the CG root
7064 // Init to thread limit that was stored when league masters were forked
7065 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7066 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7067 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7068 " cg_nthreads to 1\n",
7070 tmp->up = thr->th.th_cg_roots;
7071 thr->th.th_cg_roots = tmp;
// Launch the league of teams now, but do not let the workers execute yet
// (they hang on the fork barrier until the next parallel region)
7075 #if INCLUDE_SSC_MARKS
7078 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7079 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7080 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7081 #if INCLUDE_SSC_MARKS
7084 // If the team size was reduced from the limit, set it to the new size
7085 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7086 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7087 // AC: last parameter "1" eliminates join barrier which won't work because
7088 // worker threads are in a fork barrier waiting for more parallel regions
7089 __kmp_join_call(loc, gtid
7098 int __kmp_invoke_teams_master(int gtid) {
7099 kmp_info_t *this_thr = __kmp_threads[gtid];
7100 kmp_team_t *team = this_thr->th.th_team;
7102 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7103 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7104 (void *)__kmp_teams_master);
7106 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7108 int tid = __kmp_tid_from_gtid(gtid);
7109 ompt_data_t *task_data =
7110 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7111 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7112 if (ompt_enabled.ompt_callback_implicit_task) {
7113 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7114 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7116 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7119 __kmp_teams_master(gtid);
7121 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7123 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
/* This sets the requested number of threads for the next parallel region
encountered by this team. Since this should be enclosed in the fork/join
critical section, it should avoid race conditions with asymmetrical nested
parallelism. */
7132 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7133 kmp_info_t *thr = __kmp_threads[gtid];
7135 if (num_threads > 0)
7136 thr->th.th_set_nproc = num_threads;
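// Illustrative sketch (not part of the runtime): a num_threads clause is what
// normally reaches the routine above, via the __kmpc_push_num_threads entry
// point emitted by the compiler, e.g.
//
//   #pragma omp parallel num_threads(4) // -> th_set_nproc = 4 for the next fork
//   { /* ... */ }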
7139 /* this sets the requested number of teams for the teams region and/or
7140 the number of threads for the next parallel region encountered */
7141 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7143 kmp_info_t *thr = __kmp_threads[gtid];
7144 KMP_DEBUG_ASSERT(num_teams >= 0);
7145 KMP_DEBUG_ASSERT(num_threads >= 0);
7148 num_teams = 1; // default number of teams is 1.
7149 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7150 if (!__kmp_reserve_warn) {
7151 __kmp_reserve_warn = 1;
7152 __kmp_msg(kmp_ms_warning,
7153 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7154 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7156 num_teams = __kmp_teams_max_nth;
// Set the number of teams (number of threads in the outer "parallel" of the
// teams construct)
7160 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7162 // Remember the number of threads for inner parallel regions
7163 if (!TCR_4(__kmp_init_middle))
7164 __kmp_middle_initialize(); // get internal globals calculated
7165 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7166 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7167 if (num_threads == 0) {
7168 num_threads = __kmp_avail_proc / num_teams;
// adjust num_threads without a warning, as it is not a user setting
7170 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7171 // no thread_limit clause specified - do not change thread-limit-var ICV
7172 if (num_threads > __kmp_dflt_team_nth) {
7173 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7175 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7176 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
} // prevent the team size from exceeding thread-limit-var
7178 if (num_teams * num_threads > __kmp_teams_max_nth) {
7179 num_threads = __kmp_teams_max_nth / num_teams;
7182 // This thread will be the master of the league masters
7183 // Store new thread limit; old limit is saved in th_cg_roots list
7184 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7185 // num_threads = min(num_threads, nthreads-var)
7186 if (num_threads > __kmp_dflt_team_nth) {
7187 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7189 if (num_teams * num_threads > __kmp_teams_max_nth) {
7190 int new_threads = __kmp_teams_max_nth / num_teams;
7191 if (!__kmp_reserve_warn) { // user asked for too many threads
7192 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7193 __kmp_msg(kmp_ms_warning,
7194 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7195 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7197 num_threads = new_threads;
7200 thr->th.th_teams_size.nth = num_threads;
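// Illustrative worked example (hypothetical values): assume __kmp_avail_proc
// == 64, __kmp_dflt_team_nth == 64 and __kmp_teams_max_nth == 256. Then
//
//   #pragma omp teams num_teams(8)                  // no thread_limit clause
//     // num_threads = 64 / 8 = 8 threads per team; 8 * 8 <= 256, no clamping
//
//   #pragma omp teams num_teams(8) thread_limit(40)
//     // 8 * 40 = 320 > 256, so a warning is issued and the per-team limit
//     // becomes 256 / 8 = 32 threads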
7203 // Set the proc_bind var to use in the following parallel region.
7204 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7205 kmp_info_t *thr = __kmp_threads[gtid];
7206 thr->th.th_set_proc_bind = proc_bind;
7209 /* Launch the worker threads into the microtask. */
7211 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7212 kmp_info_t *this_thr = __kmp_threads[gtid];
7216 #endif /* KMP_DEBUG */
7218 KMP_DEBUG_ASSERT(team);
7219 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7220 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7221 KMP_MB(); /* Flush all pending memory write invalidates. */
7223 team->t.t_construct = 0; /* no single directives seen yet */
7224 team->t.t_ordered.dt.t_value =
7225 0; /* thread 0 enters the ordered section first */
7227 /* Reset the identifiers on the dispatch buffer */
7228 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7229 if (team->t.t_max_nproc > 1) {
7231 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7232 team->t.t_disp_buffer[i].buffer_index = i;
7233 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7236 team->t.t_disp_buffer[0].buffer_index = 0;
7237 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7240 KMP_MB(); /* Flush all pending memory write invalidates. */
7241 KMP_ASSERT(this_thr->th.th_team == team);
7244 for (f = 0; f < team->t.t_nproc; f++) {
7245 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7246 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7248 #endif /* KMP_DEBUG */
7250 /* release the worker threads so they may begin working */
7251 __kmp_fork_barrier(gtid, 0);
7254 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7255 kmp_info_t *this_thr = __kmp_threads[gtid];
7257 KMP_DEBUG_ASSERT(team);
7258 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7259 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7260 KMP_MB(); /* Flush all pending memory write invalidates. */
7262 /* Join barrier after fork */
7265 if (__kmp_threads[gtid] &&
7266 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7267 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7268 __kmp_threads[gtid]);
7269 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7270 "team->t.t_nproc=%d\n",
7271 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7273 __kmp_print_structure();
7275 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7276 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7277 #endif /* KMP_DEBUG */
7279 __kmp_join_barrier(gtid); /* wait for everyone */
7281 if (ompt_enabled.enabled &&
7282 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7283 int ds_tid = this_thr->th.th_info.ds.ds_tid;
7284 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7285 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7287 void *codeptr = NULL;
7288 if (KMP_MASTER_TID(ds_tid) &&
7289 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7290 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7291 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7293 if (ompt_enabled.ompt_callback_sync_region_wait) {
7294 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7295 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7298 if (ompt_enabled.ompt_callback_sync_region) {
7299 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7300 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7304 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7305 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7306 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7311 KMP_MB(); /* Flush all pending memory write invalidates. */
7312 KMP_ASSERT(this_thr->th.th_team == team);
7315 /* ------------------------------------------------------------------------ */
7317 #ifdef USE_LOAD_BALANCE
// Return the number of worker threads actively spinning in the hot team if we
// are at the outermost level of parallelism. Otherwise, return 0.
7321 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7324 kmp_team_t *hot_team;
7326 if (root->r.r_active) {
7329 hot_team = root->r.r_hot_team;
7330 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7331 return hot_team->t.t_nproc - 1; // Don't count master thread
7334 // Skip the master thread - it is accounted for elsewhere.
7336 for (i = 1; i < hot_team->t.t_nproc; i++) {
7337 if (hot_team->t.t_threads[i]->th.th_active) {
7344 // Perform an automatic adjustment to the number of
7345 // threads used by the next parallel region.
7346 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7349 int hot_team_active;
7350 int team_curr_active;
7353 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7355 KMP_DEBUG_ASSERT(root);
7356 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7357 ->th.th_current_task->td_icvs.dynamic == TRUE);
7358 KMP_DEBUG_ASSERT(set_nproc > 1);
7360 if (set_nproc == 1) {
7361 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
// Threads that are active in the thread pool, active in the hot team for this
// particular root (if we are at the outer parallel level), and the currently
// executing thread (to become the master) are available to add to the new
// team, but are currently contributing to the system load, and must be
// accounted for.
7370 pool_active = __kmp_thread_pool_active_nth;
7371 hot_team_active = __kmp_active_hot_team_nproc(root);
7372 team_curr_active = pool_active + hot_team_active + 1;
7374 // Check the system load.
7375 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7376 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7377 "hot team active = %d\n",
7378 system_active, pool_active, hot_team_active));
7380 if (system_active < 0) {
7381 // There was an error reading the necessary info from /proc, so use the
7382 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7383 // = dynamic_thread_limit, we shouldn't wind up getting back here.
7384 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7385 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7387 // Make this call behave like the thread limit algorithm.
7388 retval = __kmp_avail_proc - __kmp_nth +
7389 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7390 if (retval > set_nproc) {
7393 if (retval < KMP_MIN_NTH) {
7394 retval = KMP_MIN_NTH;
7397 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
// There is a slight delay in the load balance algorithm in detecting new
// running procs. The real system load at this instant should be at least as
// large as the number of active OpenMP threads that are available to add to
// the team.
7405 if (system_active < team_curr_active) {
7406 system_active = team_curr_active;
7408 retval = __kmp_avail_proc - system_active + team_curr_active;
7409 if (retval > set_nproc) {
7412 if (retval < KMP_MIN_NTH) {
7413 retval = KMP_MIN_NTH;
7416 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7418 } // __kmp_load_balance_nproc()
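// Illustrative worked example (hypothetical numbers): with __kmp_avail_proc ==
// 16, a measured system load of 10 running threads (system_active), and 3
// threads already committed to this fork (team_curr_active = 2 pool/hot-team
// threads + the forking thread), the dynamic path above proposes
//   retval = 16 - 10 + 3 = 9
// threads, which is then clamped from above by set_nproc and from below by
// KMP_MIN_NTH.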
7420 #endif /* USE_LOAD_BALANCE */
7422 /* ------------------------------------------------------------------------ */
7424 /* NOTE: this is called with the __kmp_init_lock held */
7425 void __kmp_cleanup(void) {
7428 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7430 if (TCR_4(__kmp_init_parallel)) {
7431 #if KMP_HANDLE_SIGNALS
7432 __kmp_remove_signals();
7434 TCW_4(__kmp_init_parallel, FALSE);
7437 if (TCR_4(__kmp_init_middle)) {
7438 #if KMP_AFFINITY_SUPPORTED
7439 __kmp_affinity_uninitialize();
7440 #endif /* KMP_AFFINITY_SUPPORTED */
7441 __kmp_cleanup_hierarchy();
7442 TCW_4(__kmp_init_middle, FALSE);
7445 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7447 if (__kmp_init_serial) {
7448 __kmp_runtime_destroy();
7449 __kmp_init_serial = FALSE;
7452 __kmp_cleanup_threadprivate_caches();
7454 for (f = 0; f < __kmp_threads_capacity; f++) {
7455 if (__kmp_root[f] != NULL) {
7456 __kmp_free(__kmp_root[f]);
7457 __kmp_root[f] = NULL;
7460 __kmp_free(__kmp_threads);
// __kmp_threads and __kmp_root were allocated at once, as a single block, so
// there is no need to free __kmp_root separately.
7463 __kmp_threads = NULL;
7465 __kmp_threads_capacity = 0;
7467 #if KMP_USE_DYNAMIC_LOCK
7468 __kmp_cleanup_indirect_user_locks();
7470 __kmp_cleanup_user_locks();
7473 #if KMP_AFFINITY_SUPPORTED
7474 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7475 __kmp_cpuinfo_file = NULL;
7476 #endif /* KMP_AFFINITY_SUPPORTED */
7478 #if KMP_USE_ADAPTIVE_LOCKS
7479 #if KMP_DEBUG_ADAPTIVE_LOCKS
7480 __kmp_print_speculative_stats();
7483 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7484 __kmp_nested_nth.nth = NULL;
7485 __kmp_nested_nth.size = 0;
7486 __kmp_nested_nth.used = 0;
7487 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7488 __kmp_nested_proc_bind.bind_types = NULL;
7489 __kmp_nested_proc_bind.size = 0;
7490 __kmp_nested_proc_bind.used = 0;
7491 if (__kmp_affinity_format) {
7492 KMP_INTERNAL_FREE(__kmp_affinity_format);
7493 __kmp_affinity_format = NULL;
7496 __kmp_i18n_catclose();
7498 #if KMP_USE_HIER_SCHED
7499 __kmp_hier_scheds.deallocate();
7502 #if KMP_STATS_ENABLED
7506 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7509 /* ------------------------------------------------------------------------ */
7511 int __kmp_ignore_mppbeg(void) {
7514 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7515 if (__kmp_str_match_false(env))
// By default __kmpc_begin() is a no-op.
7522 int __kmp_ignore_mppend(void) {
7525 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7526 if (__kmp_str_match_false(env))
// By default __kmpc_end() is a no-op.
7533 void __kmp_internal_begin(void) {
7537 /* this is a very important step as it will register new sibling threads
7538 and assign these new uber threads a new gtid */
7539 gtid = __kmp_entry_gtid();
7540 root = __kmp_threads[gtid]->th.th_root;
7541 KMP_ASSERT(KMP_UBER_GTID(gtid));
7543 if (root->r.r_begin)
7545 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7546 if (root->r.r_begin) {
7547 __kmp_release_lock(&root->r.r_begin_lock, gtid);
7551 root->r.r_begin = TRUE;
7553 __kmp_release_lock(&root->r.r_begin_lock, gtid);
7556 /* ------------------------------------------------------------------------ */
7558 void __kmp_user_set_library(enum library_type arg) {
7563 /* first, make sure we are initialized so we can get our gtid */
7565 gtid = __kmp_entry_gtid();
7566 thread = __kmp_threads[gtid];
7568 root = thread->th.th_root;
7570 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
if (root->r.r_in_parallel) { /* Must be called in the serial section of a top-level parallel region */
7574 KMP_WARNING(SetLibraryIncorrectCall);
7579 case library_serial:
7580 thread->th.th_set_nproc = 0;
7581 set__nproc(thread, 1);
7583 case library_turnaround:
7584 thread->th.th_set_nproc = 0;
7585 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7586 : __kmp_dflt_team_nth_ub);
7588 case library_throughput:
7589 thread->th.th_set_nproc = 0;
7590 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7591 : __kmp_dflt_team_nth_ub);
7594 KMP_FATAL(UnknownLibraryType, arg);
7597 __kmp_aux_set_library(arg);
7600 void __kmp_aux_set_stacksize(size_t arg) {
7601 if (!__kmp_init_serial)
7602 __kmp_serial_initialize();
7605 if (arg & (0x1000 - 1)) {
7606 arg &= ~(0x1000 - 1);
7607 if (arg + 0x1000) /* check for overflow if we round up */
7611 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7613 /* only change the default stacksize before the first parallel region */
7614 if (!TCR_4(__kmp_init_parallel)) {
7615 size_t value = arg; /* argument is in bytes */
7617 if (value < __kmp_sys_min_stksize)
7618 value = __kmp_sys_min_stksize;
7619 else if (value > KMP_MAX_STKSIZE)
7620 value = KMP_MAX_STKSIZE;
7622 __kmp_stksize = value;
7624 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7627 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7630 /* set the behaviour of the runtime library */
7631 /* TODO this can cause some odd behaviour with sibling parallelism... */
7632 void __kmp_aux_set_library(enum library_type arg) {
7633 __kmp_library = arg;
7635 switch (__kmp_library) {
7636 case library_serial: {
7637 KMP_INFORM(LibraryIsSerial);
7639 case library_turnaround:
7640 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7641 __kmp_use_yield = 2; // only yield when oversubscribed
7643 case library_throughput:
7644 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7645 __kmp_dflt_blocktime = 200;
7648 KMP_FATAL(UnknownLibraryType, arg);
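// Illustrative sketch (not part of the runtime): the library mode is normally
// selected with the KMP_LIBRARY environment variable or, assuming the kmp_*
// extensions declared in omp.h are available, from user code:
//
//   #include <omp.h>
//   int main(void) {
//     kmp_set_library_throughput(); // same effect as KMP_LIBRARY=throughput
//   #pragma omp parallel
//     { /* ... */ }
//     return 0;
//   }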
7652 /* Getting team information common for all team API */
7653 // Returns NULL if not in teams construct
7654 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7655 kmp_info_t *thr = __kmp_entry_thread();
7656 teams_serialized = 0;
7657 if (thr->th.th_teams_microtask) {
7658 kmp_team_t *team = thr->th.th_team;
7659 int tlevel = thr->th.th_teams_level; // the level of the teams construct
7660 int ii = team->t.t_level;
7661 teams_serialized = team->t.t_serialized;
7662 int level = tlevel + 1;
7663 KMP_DEBUG_ASSERT(ii >= tlevel);
7664 while (ii > level) {
7665 for (teams_serialized = team->t.t_serialized;
7666 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7668 if (team->t.t_serialized && (!teams_serialized)) {
7669 team = team->t.t_parent;
7673 team = team->t.t_parent;
7682 int __kmp_aux_get_team_num() {
7684 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7686 if (serialized > 1) {
7687 return 0; // teams region is serialized ( 1 team of 1 thread ).
7689 return team->t.t_master_tid;
7695 int __kmp_aux_get_num_teams() {
7697 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7699 if (serialized > 1) {
7702 return team->t.t_parent->t.t_nproc;
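// Illustrative sketch (not part of the runtime): the two helpers above back
// the user-level queries inside a teams construct, e.g.
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//   #pragma omp teams num_teams(4)
//     printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
//     return 0;
//   }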
7708 /* ------------------------------------------------------------------------ */
7711 * Affinity Format Parser
7713 * Field is in form of: %[[[0].]size]type
7714 * % and type are required (%% means print a literal '%')
7715 * type is either single char or long name surrounded by {},
7716 * e.g., N or {num_threads}
7717 * 0 => leading zeros
7718 * . => right justified when size is specified
7719 * by default output is left justified
7720 * size is the *minimum* field length
7721 * All other characters are printed as is
7723 * Available field types:
* t {team_num} - omp_get_team_num()
* T {num_teams} - omp_get_num_teams()
* L {nesting_level} - omp_get_level()
* n {thread_num} - omp_get_thread_num()
* N {num_threads} - omp_get_num_threads()
* a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
* H {host} - name of host machine
* P {process_id} - process id (integer)
* i {native_thread_id} - native thread identifier (integer)
* A {thread_affinity} - comma separated list of integers or integer ranges
* (values of affinity mask)
7734 * Implementation-specific field types can be added
7735 * If a type is unknown, print "undefined"
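// Illustrative sketch (not part of the runtime): a format built from the field
// types above can be installed and printed through the OpenMP 5.0 affinity
// display API; the output shown is hypothetical.
//
//   #include <omp.h>
//   int main(void) {
//     omp_set_affinity_format("tid=%0.3n host=%H mask=%A");
//   #pragma omp parallel num_threads(2)
//     omp_display_affinity(NULL); // e.g. "tid=001 host=node17 mask=0-3"
//     return 0;
//   }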
7738 // Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword set.
7741 typedef struct kmp_affinity_format_field_t {
7742 char short_name; // from spec e.g., L -> thread level
7743 const char *long_name; // from spec thread_level -> thread level
7744 char field_format; // data type for snprintf (typically 'd' or 's'
7745 // for integer or string)
7746 } kmp_affinity_format_field_t;
7748 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7749 #if KMP_AFFINITY_SUPPORTED
7750 {'A', "thread_affinity", 's'},
7752 {'t', "team_num", 'd'},
7753 {'T', "num_teams", 'd'},
7754 {'L', "nesting_level", 'd'},
7755 {'n', "thread_num", 'd'},
7756 {'N', "num_threads", 'd'},
7757 {'a', "ancestor_tnum", 'd'},
7759 {'P', "process_id", 'd'},
7760 {'i', "native_thread_id", 'd'}};
7762 // Return the number of characters it takes to hold field
7763 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7765 kmp_str_buf_t *field_buffer) {
7766 int rc, format_index, field_value;
7767 const char *width_left, *width_right;
7768 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7769 static const int FORMAT_SIZE = 20;
7770 char format[FORMAT_SIZE] = {0};
7771 char absolute_short_name = 0;
7773 KMP_DEBUG_ASSERT(gtid >= 0);
7774 KMP_DEBUG_ASSERT(th);
7775 KMP_DEBUG_ASSERT(**ptr == '%');
7776 KMP_DEBUG_ASSERT(field_buffer);
7778 __kmp_str_buf_clear(field_buffer);
7780 // Skip the initial %
7783 // Check for %% first
7785 __kmp_str_buf_cat(field_buffer, "%", 1);
7786 (*ptr)++; // skip over the second %
7790 // Parse field modifiers if they are present
7794 (*ptr)++; // skip over 0
7796 right_justify = false;
7798 right_justify = true;
7799 (*ptr)++; // skip over .
7801 // Parse width of field: [width_left, width_right)
7802 width_left = width_right = NULL;
7803 if (**ptr >= '0' && **ptr <= '9') {
7809 // Create the format for KMP_SNPRINTF based on flags parsed above
7811 format[format_index++] = '%';
7813 format[format_index++] = '-';
7815 format[format_index++] = '0';
7816 if (width_left && width_right) {
7818 // Only allow 8 digit number widths.
7819 // This also prevents overflowing format variable
7820 while (i < 8 && width_left < width_right) {
7821 format[format_index++] = *width_left;
7827 // Parse a name (long or short)
7828 // Canonicalize the name into absolute_short_name
7829 found_valid_name = false;
7830 parse_long_name = (**ptr == '{');
7831 if (parse_long_name)
7832 (*ptr)++; // skip initial left brace
7833 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7834 sizeof(__kmp_affinity_format_table[0]);
7836 char short_name = __kmp_affinity_format_table[i].short_name;
7837 const char *long_name = __kmp_affinity_format_table[i].long_name;
7838 char field_format = __kmp_affinity_format_table[i].field_format;
7839 if (parse_long_name) {
7840 int length = KMP_STRLEN(long_name);
7841 if (strncmp(*ptr, long_name, length) == 0) {
7842 found_valid_name = true;
7843 (*ptr) += length; // skip the long name
7845 } else if (**ptr == short_name) {
7846 found_valid_name = true;
7847 (*ptr)++; // skip the short name
7849 if (found_valid_name) {
7850 format[format_index++] = field_format;
7851 format[format_index++] = '\0';
7852 absolute_short_name = short_name;
7856 if (parse_long_name) {
7858 absolute_short_name = 0;
7860 (*ptr)++; // skip over the right brace
7864 // Attempt to fill the buffer with the requested
7865 // value using snprintf within __kmp_str_buf_print()
7866 switch (absolute_short_name) {
7868 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7871 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7874 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7877 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7880 static const int BUFFER_SIZE = 256;
7881 char buf[BUFFER_SIZE];
7882 __kmp_expand_host_name(buf, BUFFER_SIZE);
7883 rc = __kmp_str_buf_print(field_buffer, format, buf);
7886 rc = __kmp_str_buf_print(field_buffer, format, getpid());
7889 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7892 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7896 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7897 rc = __kmp_str_buf_print(field_buffer, format, field_value);
7899 #if KMP_AFFINITY_SUPPORTED
7902 __kmp_str_buf_init(&buf);
7903 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7904 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7905 __kmp_str_buf_free(&buf);
// According to the spec, if an implementation does not have info for the
// field type, then "undefined" is printed
7911 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7913 if (parse_long_name) {
7922 KMP_ASSERT(format_index <= FORMAT_SIZE);
7927 * Return number of characters needed to hold the affinity string
7928 * (not including null byte character)
* The resultant string is printed to buffer, which the caller can then
* handle afterwards
7932 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7933 kmp_str_buf_t *buffer) {
7934 const char *parse_ptr;
7936 const kmp_info_t *th;
7937 kmp_str_buf_t field;
7939 KMP_DEBUG_ASSERT(buffer);
7940 KMP_DEBUG_ASSERT(gtid >= 0);
7942 __kmp_str_buf_init(&field);
7943 __kmp_str_buf_clear(buffer);
7945 th = __kmp_threads[gtid];
7948 // If format is NULL or zero-length string, then we use
7949 // affinity-format-var ICV
7951 if (parse_ptr == NULL || *parse_ptr == '\0') {
7952 parse_ptr = __kmp_affinity_format;
7954 KMP_DEBUG_ASSERT(parse_ptr);
7956 while (*parse_ptr != '\0') {
7958 if (*parse_ptr == '%') {
7959 // Put field in the buffer
7960 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
7961 __kmp_str_buf_catbuf(buffer, &field);
7964 // Put literal character in buffer
7965 __kmp_str_buf_cat(buffer, parse_ptr, 1);
7970 __kmp_str_buf_free(&field);
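// Illustrative sketch (not part of the runtime): the user-level counterpart of
// the capture routine above is omp_capture_affinity(), which also reports the
// number of characters needed, e.g.
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     char buf[256];
//   #pragma omp parallel
//     {
//       size_t n = omp_capture_affinity(buf, sizeof(buf),
//                                       "%{thread_num} bound to %{thread_affinity}");
//       if (n < sizeof(buf))
//         printf("%s\n", buf);
//     }
//     return 0;
//   }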
7974 // Displays the affinity string to stdout
7975 void __kmp_aux_display_affinity(int gtid, const char *format) {
7977 __kmp_str_buf_init(&buf);
7978 __kmp_aux_capture_affinity(gtid, format, &buf);
7979 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
7980 __kmp_str_buf_free(&buf);
7983 /* ------------------------------------------------------------------------ */
7985 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
7986 int blocktime = arg; /* argument is in milliseconds */
7992 __kmp_save_internal_controls(thread);
7994 /* Normalize and set blocktime for the teams */
7995 if (blocktime < KMP_MIN_BLOCKTIME)
7996 blocktime = KMP_MIN_BLOCKTIME;
7997 else if (blocktime > KMP_MAX_BLOCKTIME)
7998 blocktime = KMP_MAX_BLOCKTIME;
8000 set__blocktime_team(thread->th.th_team, tid, blocktime);
8001 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8004 /* Calculate and set blocktime intervals for the teams */
8005 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8007 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8008 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8011 /* Set whether blocktime has been set to "TRUE" */
8014 set__bt_set_team(thread->th.th_team, tid, bt_set);
8015 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8017 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8018 "bt_intervals=%d, monitor_updates=%d\n",
8019 __kmp_gtid_from_tid(tid, thread->th.th_team),
8020 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8021 __kmp_monitor_wakeups));
8023 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8024 __kmp_gtid_from_tid(tid, thread->th.th_team),
8025 thread->th.th_team->t.t_id, tid, blocktime));
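// Illustrative sketch (not part of the runtime): this setter backs the
// KMP_BLOCKTIME environment variable and, assuming the kmp_* extensions
// declared in omp.h are available, the kmp_set_blocktime() call:
//
//   #include <omp.h>
//   int main(void) {
//     kmp_set_blocktime(0); // workers go to sleep right after each region
//   #pragma omp parallel
//     { /* ... */ }
//     return 0;
//   }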
8029 void __kmp_aux_set_defaults(char const *str, int len) {
8030 if (!__kmp_init_serial) {
8031 __kmp_serial_initialize();
8033 __kmp_env_initialize(str);
8035 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8038 } // __kmp_aux_set_defaults
8040 /* ------------------------------------------------------------------------ */
8041 /* internal fast reduction routines */
8043 PACKED_REDUCTION_METHOD_T
8044 __kmp_determine_reduction_method(
8045 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8046 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8047 kmp_critical_name *lck) {
// Default reduction method: critical construct ( lck != NULL, like in current
// PAROPT )
// If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction method
// can be selected by RTL
// If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
// can be selected by RTL
// Finally, it's up to the OpenMP RTL to make a decision on which method to
// select among those generated by PAROPT.
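// Illustrative examples of the selection below (x86_64 Linux, hypothetical
// teams): a team of 1 gets empty_reduce_block; a team of 4 with an atomic
// reduction available gets atomic_reduce_block (team_size <= teamsize_cutoff);
// a team of 16 with the tree method generated gets
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; otherwise the critical-section
// default is kept.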
8058 PACKED_REDUCTION_METHOD_T retval;
8062 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8063 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8065 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8066 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8067 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8069 retval = critical_reduce_block;
// another choice of getting the team size (with 1 dynamic dereference) is slower
8072 team_size = __kmp_get_team_num_threads(global_tid);
8073 if (team_size == 1) {
8075 retval = empty_reduce_block;
8079 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8081 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8082 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8084 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8085 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8087 int teamsize_cutoff = 4;
8089 #if KMP_MIC_SUPPORTED
8090 if (__kmp_mic_type != non_mic) {
8091 teamsize_cutoff = 8;
8094 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8095 if (tree_available) {
8096 if (team_size <= teamsize_cutoff) {
8097 if (atomic_available) {
8098 retval = atomic_reduce_block;
8101 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8103 } else if (atomic_available) {
8104 retval = atomic_reduce_block;
8107 #error "Unknown or unsupported OS"
8108 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8109 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8111 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8113 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8117 if (atomic_available) {
8118 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8119 retval = atomic_reduce_block;
8121 } // otherwise: use critical section
8125 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8126 if (atomic_available && (num_vars <= 3)) {
8127 retval = atomic_reduce_block;
8128 } else if (tree_available) {
8129 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8130 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8131 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8133 } // otherwise: use critical section
8136 #error "Unknown or unsupported OS"
8140 #error "Unknown or unsupported architecture"
8144 // KMP_FORCE_REDUCTION
8146 // If the team is serialized (team_size == 1), ignore the forced reduction
8147 // method and stay with the unsynchronized method (empty_reduce_block)
8148 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8151 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8153 int atomic_available, tree_available;
8155 switch ((forced_retval = __kmp_force_reduction_method)) {
8156 case critical_reduce_block:
8157 KMP_ASSERT(lck); // lck should be != 0
8160 case atomic_reduce_block:
8161 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8162 if (!atomic_available) {
8163 KMP_WARNING(RedMethodNotSupported, "atomic");
8164 forced_retval = critical_reduce_block;
8168 case tree_reduce_block:
8169 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8170 if (!tree_available) {
8171 KMP_WARNING(RedMethodNotSupported, "tree");
8172 forced_retval = critical_reduce_block;
8174 #if KMP_FAST_REDUCTION_BARRIER
8175 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8181 KMP_ASSERT(0); // "unsupported method specified"
8184 retval = forced_retval;
8187 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8189 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8190 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8194 // this function is for testing set/get/determine reduce method
8195 kmp_int32 __kmp_get_reduce_method(void) {
8196 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8199 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8200 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8201 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8203 // Hard pause shuts down the runtime completely. Resume happens naturally when
8204 // OpenMP is used subsequently.
8205 void __kmp_hard_pause() {
8206 __kmp_pause_status = kmp_hard_paused;
8207 __kmp_internal_end_thread(-1);
8210 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8211 void __kmp_resume_if_soft_paused() {
8212 if (__kmp_pause_status == kmp_soft_paused) {
8213 __kmp_pause_status = kmp_not_paused;
8215 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8216 kmp_info_t *thread = __kmp_threads[gtid];
8217 if (thread) { // Wake it if sleeping
8218 kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8219 if (fl.is_sleeping())
8221 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8222 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8223 } else { // thread holds the lock and may sleep soon
8224 do { // until either the thread sleeps, or we can get the lock
8225 if (fl.is_sleeping()) {
8228 } else if (__kmp_try_suspend_mx(thread)) {
8229 __kmp_unlock_suspend_mx(thread);
8239 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8240 // TODO: add warning messages
8241 int __kmp_pause_resource(kmp_pause_status_t level) {
8242 if (level == kmp_not_paused) { // requesting resume
8243 if (__kmp_pause_status == kmp_not_paused) {
8244 // error message about runtime not being paused, so can't resume
8247 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8248 __kmp_pause_status == kmp_hard_paused);
8249 __kmp_pause_status = kmp_not_paused;
8252 } else if (level == kmp_soft_paused) { // requesting soft pause
8253 if (__kmp_pause_status != kmp_not_paused) {
8254 // error message about already being paused
8260 } else if (level == kmp_hard_paused) { // requesting hard pause
8261 if (__kmp_pause_status != kmp_not_paused) {
8262 // error message about already being paused
8269 // error message about invalid level
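// Illustrative sketch (not part of the runtime): the pause levels above are
// reachable from user code through the OpenMP 5.0 pause API, e.g.
//
//   #include <omp.h>
//   int main(void) {
//   #pragma omp parallel
//     { /* ... */ }
//     omp_pause_resource_all(omp_pause_soft); // idle workers ignore blocktime
//     // a later parallel region resumes the runtime automatically
//     return 0;
//   }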
8275 void __kmp_omp_display_env(int verbose) {
8276 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8277 if (__kmp_init_serial == 0)
8278 __kmp_do_serial_initialize();
8279 __kmp_display_env_impl(!verbose, verbose);
8280 __kmp_release_bootstrap_lock(&__kmp_initz_lock);