5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
14 #include "kmp_wait_release.h"
17 #include "kmp_stats.h"
19 #include "ompt-specific.h"
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
27 #include "tsan_annotations.h"
// Non-globally-ordered (NGO) store helpers used to broadcast ICVs and go
// flags.  On Intel(R) MIC with NGO stores enabled they use cache-line-wide
// streaming stores; otherwise they degrade to plain copies / no-ops.
// Fix: the #else separating the two sets of definitions was missing, which
// made the second set an illegal redefinition of the first.
#if KMP_MIC && USE_NGO_STORES
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
42 void __kmp_print_structure(void); // Forward declaration
44 // ---------------------------- Barrier Algorithms ----------------------------
// Gather phase of the linear (flat) barrier.  Every worker signals arrival
// by bumping its own b_arrived counter (on which the master is the waiter);
// the master waits for each worker in turn and, if 'reduce' is non-NULL,
// folds each worker's reduce_data into its own.  When instantiated with
// cancellable=true the waits are cancellation-aware and the return value
// reports whether the gather was cancelled; with cancellable=false the
// return value is not meaningful.
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  // We now perform a linear reduction to signal that all of the threads have
  // Worker path: advertise arrival directly to the master thread (tid 0).
  if (!KMP_MASTER_TID(tid)) {
        ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
         "arrived(%p): %llu => %llu\n",
         gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
         team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
         thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to master thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
    ANNOTATE_BARRIER_BEGIN(this_thr);
    // The flag is constructed with the master (other_threads[0]) as waiter.
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
    // Master path: wait for every worker, reducing data along the way.
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
      // Prefetch next thread's arrived count
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
      // Wait for worker thread to arrive
      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
        bool cancelled = flag.wait_cancellable_nosleep(
            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(other_threads[i]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
            ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
        // Fold worker i's reduction data into the master's copy.
        ANNOTATE_REDUCE_AFTER(reduce);
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
      gtid, team->t.t_id, tid, bt));
// Release phase of the linear barrier.  The master optionally pushes ICVs to
// every worker's implicit task, then releases each worker by bumping its
// b_go flag.  A worker waits on its own b_go, resets it to
// KMP_INIT_BARRIER_STATE for the next barrier, and re-reads tid/team (they
// may have changed across a fork barrier).  With cancellable=true the worker
// wait is cancellation-aware and the return value reports cancellation.
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  if (KMP_MASTER_TID(tid)) {
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
      KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
      // Broadcast the master's ICVs to every worker's implicit task via the
      // NGO store helpers (streaming stores on MIC, plain copies elsewhere).
      if (propagate_icvs) {
        ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
        for (i = 1; i < nproc; ++i) {
          __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
          ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                         &team->t.t_implicit_task_taskdata[0].td_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    // Now, release all of the worker threads
    for (i = 1; i < nproc; ++i) {
      // Prefetch next thread's go flag
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
          ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
           "go(%p): %u => %u\n",
           gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
           team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
           other_threads[i]->th.th_bar[bt].bb.b_go,
           other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
      ANNOTATE_BARRIER_BEGIN(other_threads[i]);
      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
  } else { // Wait for the MASTER thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      bool cancelled = flag.wait_cancellable_nosleep(
          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
    // The worker thread may now assume that the team is valid.
    // Re-read tid and team: they may differ after a fork barrier.
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    // Reset own go flag so it is ready for the next barrier round.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
        gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
      gtid, team->t.t_id, tid, bt));
263 static void __kmp_linear_barrier_gather(
264 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
265 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
266 __kmp_linear_barrier_gather_template<false>(
267 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
270 static bool __kmp_linear_barrier_gather_cancellable(
271 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
272 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
273 return __kmp_linear_barrier_gather_template<true>(
274 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
277 static void __kmp_linear_barrier_release(
278 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
279 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
280 __kmp_linear_barrier_release_template<false>(
281 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
284 static bool __kmp_linear_barrier_release_cancellable(
285 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
286 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
287 return __kmp_linear_barrier_release_template<true>(
288 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
// Gather phase of the tree barrier.  Threads form a tree with branching
// factor (1 << branch_bits): a parent (tid with children at
// (tid << branch_bits) + 1 ...) waits for each child's b_arrived flag,
// reducing data if 'reduce' is non-NULL, then a non-master thread reports
// its own arrival to its parent; the master finally publishes the team-wide
// arrived state.
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                          int tid, void (*reduce)(void *, void *)
                              USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child_tid;
  kmp_uint64 new_state;
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  // First child of 'tid' in the tree; threads with no children skip the loop.
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
          ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
           "arrived(%p) == %llu\n",
           gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
           team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
            ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid));
        // Fold the child's reduction data into this parent's copy.
        ANNOTATE_REDUCE_AFTER(reduce);
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
    } while (child <= branch_factor && child_tid < nproc);
  // Non-master: all children (if any) have arrived; notify the parent.
  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits;
        ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
         "arrived(%p): %llu => %llu\n",
         gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
         team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
         thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the master thread at any
    ANNOTATE_BARRIER_BEGIN(this_thr);
    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    // Need to update the team arrived pointer if we are the master thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
      ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
      gtid, team->t.t_id, tid, bt));
// Release phase of the tree barrier.  A worker waits on its own b_go flag,
// resets it, and (like the master) then releases each of its own children
// in turn, optionally pushing the master's ICVs into each child's implicit
// task before waking it.
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child_tid;
  // Perform a tree release for all of the threads that have been gathered
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);
    // Reset own go flag so it is ready for the next barrier round.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
        team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
                  gtid, team->t.t_id, tid, bt));
  nproc = this_thr->th.th_team_nproc;
  // First child of 'tid'; threads with no children fall through to the exit.
  child_tid = (tid << branch_bits) + 1;
  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    // Parent threads release all their children
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
#if KMP_BARRIER_ICV_PUSH
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        // Push the master's ICVs down into this child's implicit task.
        if (propagate_icvs) {
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
#endif // KMP_BARRIER_ICV_PUSH
          ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
           "go(%p): %u => %u\n",
           gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
           team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
           child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      ANNOTATE_BARRIER_BEGIN(child_thr);
      kmp_flag_64 flag(&child_bar->b_go, child_thr);
    } while (child <= branch_factor && child_tid < nproc);
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
      gtid, team->t.t_id, tid, bt));
// Gather phase of the hypercube-embedded tree barrier.  The loop walks
// levels of the hypercube (stride doubles by branch_bits each round): at a
// given level a thread that is not a subtree root signals its parent and
// stops participating, while a subtree root waits for and (optionally)
// reduces each of its children at that level.  The master publishes the
// team-wide arrived state at the end.
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
                           int tid, void (*reduce)(void *, void *)
                               USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  // KMP_BARRIER_UNUSED_STATE marks "not yet computed"; set lazily below.
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64 p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child_tid;
    // Not a subtree root at this level: signal the parent and drop out.
    if (((tid >> level) & (branch_factor - 1)) != 0) {
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
          ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
           "arrived(%p): %llu => %llu\n",
           gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
           team->t.t_id, parent_tid, &thr_bar->b_arrived,
           thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the master thread at any time. */
      ANNOTATE_BARRIER_BEGIN(this_thr);
      p_flag.set_waiter(other_threads[parent_tid]);
    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
          ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
           "arrived(%p) == %llu\n",
           gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
           team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      ANNOTATE_BARRIER_END(child_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
            ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid));
        // Fold the child's reduction data into this parent's copy.
        ANNOTATE_REDUCE_AFTER(reduce);
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        ANNOTATE_REDUCE_BEFORE(reduce);
        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the master thread
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
      gtid, team->t.t_id, tid, bt));
// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
// Release phase of the hypercube-embedded tree barrier.  The master stages
// its ICVs in thr_bar->th_fixed_icvs; each parent pushes them to its
// children while releasing them via their b_go flags.  With
// KMP_REVERSE_HYPER_BAR (the default) children are released in the reverse
// order of the corresponding gather; non-master threads finally copy the
// pushed ICVs into their own implicit task.
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child_tid;
  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // master
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // master already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    ANNOTATE_BARRIER_END(this_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    // Early exit for reaping threads releasing forkjoin barrier
    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid);
    // Reset own go flag so it is ready for the next barrier round.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
        gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;
#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        ANNOTATE_BARRIER_BEGIN(child_thr);
        kmp_flag_64 flag(&child_bar->b_go, child_thr);
#if KMP_BARRIER_ICV_PUSH
  // Workers install the pushed ICVs into their own implicit task.
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
      gtid, team->t.t_id, tid, bt));
786 // Hierarchical Barrier
788 // Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
   Performs the minimum amount of initialization required based on how the team
   has changed. Returns true if leaf children will require both on-core and
   traditional wake-up mechanisms. For example, if the team size increases,
   threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their
   local b_go flag. */
// (Re-)initializes the hierarchical-barrier bookkeeping in 'thr_bar' for the
// given team/tid, doing only the work implied by what actually changed
// (first use, new team, new team size, or new tid).  It computes the
// thread's level and parent tid within the machine hierarchy and builds the
// leaf_state byte mask used for leaf-child check-ins.
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  if (uninitialized || team_sz_changed) {
    __kmp_get_hierarchy(nproc, thr_bar);
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for master
    thr_bar->parent_tid = -1; // default for master
            tid)) { // if not master, find parent thread in hierarchy
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        if (d == thr_bar->depth - 2) { // reached level right below the master
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
                   0) { // TODO: can we make this op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  if (uninitialized || team_changed || tid_changed) {
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    // Leaves have no leaf kids; also clip leaf_kids to the team boundary.
    if (thr_bar->my_level == 0)
      thr_bar->leaf_kids = 0;
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      thr_bar->leaf_kids = nproc - tid - 1;
    // One byte per leaf child, stored from the top byte of the 64-bit word
    // down.  NOTE(review): this layout assumes at most 8 leaf kids per
    // parent — confirm against __kmp_get_hierarchy.
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
861 static void __kmp_hierarchical_barrier_gather(
862 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
864 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
865 kmp_team_t *team = this_thr->th.th_team;
866 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
867 kmp_uint32 nproc = this_thr->th.th_team_nproc;
868 kmp_info_t **other_threads = team->t.t_threads;
869 kmp_uint64 new_state;
871 int level = team->t.t_level;
873 ->th.th_teams_microtask) // are we inside the teams construct?
874 if (this_thr->th.th_teams_size.nteams > 1)
875 ++level; // level was not increased in teams construct for team_of_masters
877 thr_bar->use_oncore_barrier = 1;
879 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
881 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
883 gtid, team->t.t_id, tid, bt));
884 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
886 #if USE_ITT_BUILD && USE_ITT_NOTIFY
887 // Barrier imbalance - save arrive time to the thread
888 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
889 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
893 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
896 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
899 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
900 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
901 thr_bar->use_oncore_barrier) {
902 if (thr_bar->leaf_kids) {
903 // First, wait for leaf children to check-in on my b_arrived flag
904 kmp_uint64 leaf_state =
906 ? thr_bar->b_arrived | thr_bar->leaf_state
907 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
908 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
910 gtid, team->t.t_id, tid));
911 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
912 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
914 ANNOTATE_REDUCE_AFTER(reduce);
915 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
917 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
919 gtid, team->t.t_id, tid,
920 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
922 ANNOTATE_BARRIER_END(other_threads[child_tid]);
923 (*reduce)(this_thr->th.th_local.reduce_data,
924 other_threads[child_tid]->th.th_local.reduce_data);
926 ANNOTATE_REDUCE_BEFORE(reduce);
927 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
929 // clear leaf_state bits
930 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
932 // Next, wait for higher level children on each child's b_arrived flag
933 for (kmp_uint32 d = 1; d < thr_bar->my_level;
934 ++d) { // gather lowest level threads first, but skip 0
935 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
936 skip = thr_bar->skip_per_level[d];
939 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
940 kmp_info_t *child_thr = other_threads[child_tid];
941 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
942 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
944 "arrived(%p) == %llu\n",
945 gtid, team->t.t_id, tid,
946 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
947 child_tid, &child_bar->b_arrived, new_state));
948 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
949 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
950 ANNOTATE_BARRIER_END(child_thr);
952 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
954 gtid, team->t.t_id, tid,
955 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957 ANNOTATE_REDUCE_AFTER(reduce);
958 (*reduce)(this_thr->th.th_local.reduce_data,
959 child_thr->th.th_local.reduce_data);
960 ANNOTATE_REDUCE_BEFORE(reduce);
961 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
965 } else { // Blocktime is not infinite
966 for (kmp_uint32 d = 0; d < thr_bar->my_level;
967 ++d) { // Gather lowest level threads first
968 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
969 skip = thr_bar->skip_per_level[d];
972 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
973 kmp_info_t *child_thr = other_threads[child_tid];
974 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
975 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
977 "arrived(%p) == %llu\n",
978 gtid, team->t.t_id, tid,
979 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
980 child_tid, &child_bar->b_arrived, new_state));
981 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
982 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
983 ANNOTATE_BARRIER_END(child_thr);
985 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
987 gtid, team->t.t_id, tid,
988 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990 ANNOTATE_REDUCE_AFTER(reduce);
991 (*reduce)(this_thr->th.th_local.reduce_data,
992 child_thr->th.th_local.reduce_data);
993 ANNOTATE_REDUCE_BEFORE(reduce);
994 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1000 // All subordinates are gathered; now release parent if not master thread
1002 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1003 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1004 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1005 gtid, team->t.t_id, tid,
1006 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1007 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1008 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1009 /* Mark arrival to parent: After performing this write, a worker thread may
1010 not assume that the team is valid any more - it could be deallocated by
1011 the master thread at any time. */
1012 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1013 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1015 ANNOTATE_BARRIER_BEGIN(this_thr);
1016 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1019 // Leaf does special release on "offset" bits of parent's b_arrived flag
1020 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1021 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1022 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1025 } else { // Master thread needs to update the team's b_arrived value
1026 team->t.t_bar[bt].b_arrived = new_state;
1027 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1028 "arrived(%p) = %llu\n",
1029 gtid, team->t.t_id, tid, team->t.t_id,
1030 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1032 // Is the team access below unsafe or just technically invalid?
1033 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1034 "barrier type %d\n",
1035 gtid, team->t.t_id, tid, bt));
// Release phase of the hierarchical barrier.
// The master falls straight through to releasing its children; a worker first
// waits to be released (either on its own b_go flag, or — for a leaf with
// infinite blocktime and the on-core barrier — on its "offset" byte of the
// parent's b_go word), then in turn releases its own subtree.  When
// propagate_icvs is set, ICVs are pushed down the hierarchy alongside the
// release, using NGO cache-line stores when available.
// NOTE(review): this listing is an excerpt with some original lines elided;
// the comments below describe only the code that is visible here.
1038 static void __kmp_hierarchical_barrier_release(
1039 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1040 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1041 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1043 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1045 bool team_change = false; // indicates on-core barrier shouldn't be used
1047 if (KMP_MASTER_TID(tid)) {
1048 team = __kmp_threads[gtid]->th.th_team;
1049 KMP_DEBUG_ASSERT(team != NULL);
1050 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1051 "entered barrier type %d\n",
1052 gtid, team->t.t_id, tid, bt));
1053 } else { // Worker threads
1054 // Wait for parent thread to release me
1055 if (!thr_bar->use_oncore_barrier ||
1056 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1057 thr_bar->team == NULL) {
1058 // Use traditional method of waiting on my own b_go flag
1059 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1060 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1061 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1062 ANNOTATE_BARRIER_END(this_thr);
1063 TCW_8(thr_bar->b_go,
1064 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1065 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1066 // infinite, not nested
1067 // Wait on my "offset" bits on parent's b_go flag
1068 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1069 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1070 thr_bar->offset, bt,
1071 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1072 flag.wait(this_thr, TRUE);
1073 if (thr_bar->wait_flag ==
1074 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1075 TCW_8(thr_bar->b_go,
1076 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1077 } else { // Reset my bits on parent's b_go flag
// Byte-granular write: clear only this thread's slot in the parent's
// packed b_go word.
1078 (RCAST(volatile char *,
1079 &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1082 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1083 // Early exit for reaping threads releasing forkjoin barrier
1084 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1086 // The worker thread may now assume that the team is valid.
1087 team = __kmp_threads[gtid]->th.th_team;
1088 KMP_DEBUG_ASSERT(team != NULL);
1089 tid = __kmp_tid_from_gtid(gtid);
1093 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1094 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1095 KMP_MB(); // Flush all pending memory write invalidates.
1098 nproc = this_thr->th.th_team_nproc;
1099 int level = team->t.t_level;
1100 if (team->t.t_threads[0]
1101 ->th.th_teams_microtask) { // are we inside the teams construct?
1102 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1103 this_thr->th.th_teams_level == level)
1104 ++level; // level was not increased in teams construct for team_of_workers
1105 if (this_thr->th.th_teams_size.nteams > 1)
1106 ++level; // level was not increased in teams construct for team_of_masters
1109 thr_bar->use_oncore_barrier = 1;
1111 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1113 // If the team size has increased, we still communicate with old leaves via
// Snapshot the pre-reinit leaf bookkeeping so old leaves can still be
// released below even after __kmp_init_hierarchical_barrier_thread updates it.
1115 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1116 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1117 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1119 // But if the entire team changes, we won't use oncore barrier at all
1123 #if KMP_BARRIER_ICV_PUSH
1124 if (propagate_icvs) {
1125 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1128 tid)) { // master already has copy in final destination; copy
1129 copy_icvs(&thr_bar->th_fixed_icvs,
1130 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1131 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1132 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1133 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1134 // leaves (on-core children) pull parent's fixed ICVs directly to local
1136 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1137 &thr_bar->parent_bar->th_fixed_icvs);
1138 // non-leaves will get ICVs piggybacked with b_go via NGO store
1139 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1140 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1142 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1143 else // leaves copy parent's fixed ICVs directly to local ICV store
1144 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1145 &thr_bar->parent_bar->th_fixed_icvs);
1148 #endif // KMP_BARRIER_ICV_PUSH
1150 // Now, release my children
1151 if (thr_bar->my_level) { // not a leaf
1152 kmp_int32 child_tid;
1154 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1155 thr_bar->use_oncore_barrier) {
1156 if (KMP_MASTER_TID(tid)) { // do a flat release
1157 // Set local b_go to bump children via NGO store of the cache line
1158 // containing IVCs and b_go.
1159 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1160 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1162 ngo_load(&thr_bar->th_fixed_icvs);
1163 // This loops over all the threads skipping only the leaf nodes in the
1165 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1166 child_tid += thr_bar->skip_per_level[1]) {
1167 kmp_bstate_t *child_bar =
1168 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1169 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1170 "releasing T#%d(%d:%d)"
1171 " go(%p): %u => %u\n",
1172 gtid, team->t.t_id, tid,
1173 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1174 child_tid, &child_bar->b_go, child_bar->b_go,
1175 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1176 // Use ngo store (if available) to both store ICVs and release child
1178 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1182 TCW_8(thr_bar->b_go,
1183 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1184 // Now, release leaf children
1185 if (thr_bar->leaf_kids) { // if there are any
1186 // We test team_change on the off-chance that the level 1 team changed.
1188 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1189 if (old_leaf_kids) { // release old leaf kids
1190 thr_bar->b_go |= old_leaf_state;
1192 // Release new leaf kids
1193 last = tid + thr_bar->skip_per_level[1];
1196 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1197 ++child_tid) { // skip_per_level[0]=1
1198 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1199 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1202 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1203 " T#%d(%d:%d) go(%p): %u => %u\n",
1204 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1205 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1206 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1207 // Release child using child's b_go flag
1208 ANNOTATE_BARRIER_BEGIN(child_thr);
1209 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1212 } else { // Release all children at once with leaf_state bits on my own
1214 thr_bar->b_go |= thr_bar->leaf_state;
1217 } else { // Blocktime is not infinite; do a simple hierarchical release
1218 for (int d = thr_bar->my_level - 1; d >= 0;
1219 --d) { // Release highest level threads first
1220 last = tid + thr_bar->skip_per_level[d + 1];
1221 kmp_uint32 skip = thr_bar->skip_per_level[d];
1224 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1225 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1226 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1227 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1228 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1229 gtid, team->t.t_id, tid,
1230 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1231 child_tid, &child_bar->b_go, child_bar->b_go,
1232 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1233 // Release child using child's b_go flag
1234 ANNOTATE_BARRIER_BEGIN(child_thr);
1235 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1240 #if KMP_BARRIER_ICV_PUSH
1241 if (propagate_icvs && !KMP_MASTER_TID(tid))
1242 // non-leaves copy ICVs from fixed ICVs to local dest
1243 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1244 &thr_bar->th_fixed_icvs);
1245 #endif // KMP_BARRIER_ICV_PUSH
1247 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1248 "barrier type %d\n",
1249 gtid, team->t.t_id, tid, bt));
1252 // End of Barrier Algorithms
1254 // type traits for cancellable value
1255 // if cancellable is true, then is_cancellable is a normal boolean variable
1256 // if cancellable is false, then is_cancellable is a compile time constant
// Compile-time selector for cancellation bookkeeping in the barrier template:
// - is_cancellable<true> carries a runtime boolean that records whether the
//   barrier was cancelled;
// - is_cancellable<false> is a stateless stand-in whose conversion to bool is
//   a constexpr `false`, so non-cancellable instantiations pay no cost.
// NOTE(review): some member lines of the <true> specialization (the bool data
// member and the assignment body) are elided in this listing.
1257 template <bool cancellable> struct is_cancellable {};
1258 template <> struct is_cancellable<true> {
1260 is_cancellable() : value(false) {}
1261 is_cancellable(bool b) : value(b) {}
1262 is_cancellable &operator=(bool b) {
1266 operator bool() const { return value; }
1268 template <> struct is_cancellable<false> {
// Assignment is a no-op: a non-cancellable barrier can never become cancelled.
1269 is_cancellable &operator=(bool b) { return *this; }
1270 constexpr operator bool() const { return false; }
1273 // Internal function to do a barrier.
1274 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1275 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
barrier.
1277 When cancellable = false,
1278 Returns 0 if master thread, 1 if worker thread.
1279 When cancellable = true
1280 Returns 0 if not cancelled, 1 if cancelled. */
// Core barrier driver shared by cancellable and non-cancellable entry points.
// Dispatches the gather phase, then (for the master, or when the barrier is
// not split) the release phase, using the pattern configured per barrier type
// (linear / tree / hyper / hierarchical).  Also handles OMPT sync-region
// callbacks, ITT frame reporting, task-team coordination, and the serialized
// team fast path.
// NOTE(review): this listing is an excerpt with some original lines elided
// (e.g. parts of the OMPT guards and several closing braces); the comments
// below describe only the code that is visible here.
1281 template <bool cancellable = false>
1282 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1283 size_t reduce_size, void *reduce_data,
1284 void (*reduce)(void *, void *)) {
1285 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1286 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1287 int tid = __kmp_tid_from_gtid(gtid);
1288 kmp_info_t *this_thr = __kmp_threads[gtid];
1289 kmp_team_t *team = this_thr->th.th_team;
// Zero-cost when cancellable == false (see is_cancellable specializations).
1291 is_cancellable<cancellable> cancelled;
1292 #if OMPT_SUPPORT && OMPT_OPTIONAL
1293 ompt_data_t *my_task_data;
1294 ompt_data_t *my_parallel_data;
1295 void *return_address;
1296 ompt_sync_region_t barrier_kind;
1299 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1300 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1302 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1304 if (ompt_enabled.enabled) {
1306 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1307 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1308 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1309 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1310 if (ompt_enabled.ompt_callback_sync_region) {
1311 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1312 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1315 if (ompt_enabled.ompt_callback_sync_region_wait) {
1316 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1317 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1321 // It is OK to report the barrier state after the barrier begin callback.
1322 // According to the OMPT specification, a compliant implementation may
1323 // even delay reporting this state until the barrier begins to wait.
1324 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
// Real (non-serialized) team: run a full gather/release barrier.
1328 if (!team->t.t_serialized) {
1330 // This value will be used in itt notify events below.
1331 void *itt_sync_obj = NULL;
1333 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1334 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1336 #endif /* USE_ITT_BUILD */
1337 if (__kmp_tasking_mode == tskm_extra_barrier) {
1338 __kmp_tasking_barrier(team, this_thr, gtid);
1340 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1341 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1344 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1345 access it when the team struct is not guaranteed to exist. */
1346 // See note about the corresponding code in __kmp_join_barrier() being
1347 // performance-critical.
1348 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1350 this_thr->th.th_team_bt_intervals =
1351 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1352 this_thr->th.th_team_bt_set =
1353 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1355 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1360 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1361 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1362 #endif /* USE_ITT_BUILD */
1364 // Let the debugger know: the thread arrived to the barrier and waiting.
1365 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1366 team->t.t_bar[bt].b_master_arrived += 1;
1368 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1370 #endif /* USE_DEBUGGER */
1371 if (reduce != NULL) {
1372 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1373 this_thr->th.th_local.reduce_data = reduce_data;
1376 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1377 // use 0 to only setup the current team if nthreads > 1
1378 __kmp_task_team_setup(this_thr, team, 0);
// Cancellable instantiations always use the linear gather; otherwise the
// per-barrier-type configured gather pattern is dispatched below.
1381 cancelled = __kmp_linear_barrier_gather_cancellable(
1382 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1384 switch (__kmp_barrier_gather_pattern[bt]) {
1385 case bp_hyper_bar: {
1386 // don't set branch bits to 0; use linear
1387 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1388 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1389 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1392 case bp_hierarchical_bar: {
1393 __kmp_hierarchical_barrier_gather(
1394 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1398 // don't set branch bits to 0; use linear
1399 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1400 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1401 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1405 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1406 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
// Master-only post-gather work: drain the task team, update debugger
// counters, and reset worksharing cancellation requests.
1413 if (KMP_MASTER_TID(tid)) {
1415 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1416 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1419 // Let the debugger know: All threads are arrived and starting leaving the
1421 team->t.t_bar[bt].b_team_arrived += 1;
1424 if (__kmp_omp_cancellation) {
1425 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1426 // Reset cancellation flag for worksharing constructs
1427 if (cancel_request == cancel_loop ||
1428 cancel_request == cancel_sections) {
1429 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1433 /* TODO: In case of split reduction barrier, master thread may send
1434 acquired event early, before the final summation into the shared
1435 variable is done (final summation can be a long operation for array
1437 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1438 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1439 #endif /* USE_ITT_BUILD */
1440 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1441 // Barrier - report frame end (only if active_level == 1)
1442 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1443 __kmp_forkjoin_frames_mode &&
1444 this_thr->th.th_teams_microtask == NULL &&
1445 team->t.t_active_level == 1) {
1446 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1447 kmp_uint64 cur_time = __itt_get_timestamp();
1448 kmp_info_t **other_threads = team->t.t_threads;
1449 int nproc = this_thr->th.th_team_nproc;
1451 switch (__kmp_forkjoin_frames_mode) {
1453 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1455 this_thr->th.th_frame_time = cur_time;
1457 case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1459 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1463 if (__itt_metadata_add_ptr) {
1464 // Initialize with master's wait time
1465 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1466 // Set arrive time to zero to be able to check it in
1467 // __kmp_invoke_task(); the same is done inside the loop below
1468 this_thr->th.th_bar_arrive_time = 0;
1469 for (i = 1; i < nproc; ++i) {
1470 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1471 other_threads[i]->th.th_bar_arrive_time = 0;
1473 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1475 (kmp_uint64)(reduce != NULL));
1477 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1479 this_thr->th.th_frame_time = cur_time;
1483 #endif /* USE_ITT_BUILD */
1487 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1488 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1489 #endif /* USE_ITT_BUILD */
// Release phase: skipped for workers of a split barrier (status != 1),
// and skipped entirely once the barrier has been cancelled.
1491 if ((status == 1 || !is_split) && !cancelled) {
1493 cancelled = __kmp_linear_barrier_release_cancellable(
1494 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1496 switch (__kmp_barrier_release_pattern[bt]) {
1497 case bp_hyper_bar: {
1498 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1499 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1500 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1503 case bp_hierarchical_bar: {
1504 __kmp_hierarchical_barrier_release(
1505 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1509 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1510 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1511 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1515 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1516 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1520 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1521 __kmp_task_team_sync(this_thr, team);
1526 /* GEH: TODO: Move this under if-condition above and also include in
1527 __kmp_end_split_barrier(). This will more accurately represent the actual
1528 release time of the threads for split barriers. */
1529 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1530 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1531 #endif /* USE_ITT_BUILD */
1532 } else { // Team is serialized.
// Serialized team: no real barrier needed, but proxy tasks (if any)
// must still be drained and the task team re-armed.
1534 if (__kmp_tasking_mode != tskm_immediate_exec) {
1535 if (this_thr->th.th_task_team != NULL) {
1537 void *itt_sync_obj = NULL;
1538 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1539 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1540 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1544 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1546 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1547 __kmp_task_team_setup(this_thr, team, 0);
1550 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1551 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1552 #endif /* USE_ITT_BUILD */
1556 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1557 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1558 __kmp_tid_from_gtid(gtid), status));
1561 if (ompt_enabled.enabled) {
1563 if (ompt_enabled.ompt_callback_sync_region_wait) {
1564 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1565 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1568 if (ompt_enabled.ompt_callback_sync_region) {
1569 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1570 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1574 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1577 ANNOTATE_BARRIER_END(&team->t.t_bar);
// When cancellable: 0 = completed, 1 = cancelled.  Otherwise the return
// value is `status` per the header comment (0 master / 1 worker); the
// elided lines presumably return status on that path — TODO confirm.
1580 return (int)cancelled;
// Public barrier entry point: thin wrapper instantiating the non-cancellable
// variant of __kmp_barrier_template (cancellable defaults to false).
1584 // Returns 0 if master thread, 1 if worker thread.
1585 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1586 size_t reduce_size, void *reduce_data,
1587 void (*reduce)(void *, void *)) {
1588 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1592 #if defined(KMP_GOMP_COMPAT)
// GOMP-compatibility cancellation barrier.  With cancellation enabled it runs
// the cancellable barrier template; if that reports cancellation, workers roll
// back the KMP_BARRIER_STATE_BUMP they applied to their private b_arrived flag
// (the release phase that would normally consume it was skipped).  With
// cancellation disabled it degrades to a plain barrier.
// NOTE(review): some lines (remaining template arguments, returns, closing
// braces, #endif) are elided in this listing.
1593 // Returns 1 if cancelled, 0 otherwise
1594 int __kmp_barrier_gomp_cancel(int gtid) {
1595 if (__kmp_omp_cancellation) {
1596 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1599 int tid = __kmp_tid_from_gtid(gtid);
1600 kmp_info_t *this_thr = __kmp_threads[gtid];
1601 if (KMP_MASTER_TID(tid)) {
1602 // Master does not need to revert anything
1604 // Workers need to revert their private b_arrived flag
1605 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1606 KMP_BARRIER_STATE_BUMP;
1611 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
// Completes a split barrier: the master runs only the release phase (the
// gather phase already happened in the earlier split-barrier call), using the
// configured release pattern, then re-syncs the task team.  No-op work is done
// for serialized teams.
1616 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1617 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1618 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1619 int tid = __kmp_tid_from_gtid(gtid);
1620 kmp_info_t *this_thr = __kmp_threads[gtid];
1621 kmp_team_t *team = this_thr->th.th_team;
1623 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1624 if (!team->t.t_serialized) {
1625 if (KMP_MASTER_GTID(gtid)) {
1626 switch (__kmp_barrier_release_pattern[bt]) {
1627 case bp_hyper_bar: {
1628 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1629 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1630 FALSE USE_ITT_BUILD_ARG(NULL));
1633 case bp_hierarchical_bar: {
1634 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1635 FALSE USE_ITT_BUILD_ARG(NULL));
1639 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1640 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1641 FALSE USE_ITT_BUILD_ARG(NULL));
1645 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1646 FALSE USE_ITT_BUILD_ARG(NULL));
1649 if (__kmp_tasking_mode != tskm_immediate_exec) {
1650 __kmp_task_team_sync(this_thr, team);
1654 ANNOTATE_BARRIER_END(&team->t.t_bar);
1657 void __kmp_join_barrier(int gtid) {
1658 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1659 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1660 kmp_info_t *this_thr = __kmp_threads[gtid];
1663 kmp_info_t *master_thread;
1667 #endif /* KMP_DEBUG */
1669 void *itt_sync_obj = NULL;
1671 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1672 // Get object created at fork_barrier
1673 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1675 #endif /* USE_ITT_BUILD */
1679 team = this_thr->th.th_team;
1680 nproc = this_thr->th.th_team_nproc;
1681 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1682 tid = __kmp_tid_from_gtid(gtid);
1684 team_id = team->t.t_id;
1685 #endif /* KMP_DEBUG */
1686 master_thread = this_thr->th.th_team_master;
1688 if (master_thread != team->t.t_threads[0]) {
1689 __kmp_print_structure();
1691 #endif /* KMP_DEBUG */
1692 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1696 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1697 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1698 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1699 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1700 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1701 gtid, team_id, tid));
1703 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1705 if (ompt_enabled.enabled) {
1707 ompt_data_t *my_task_data;
1708 ompt_data_t *my_parallel_data;
1709 void *codeptr = NULL;
1710 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1711 if (KMP_MASTER_TID(ds_tid) &&
1712 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1713 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1714 codeptr = team->t.ompt_team_info.master_return_address;
1715 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1716 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1717 if (ompt_enabled.ompt_callback_sync_region) {
1718 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1719 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1720 my_task_data, codeptr);
1722 if (ompt_enabled.ompt_callback_sync_region_wait) {
1723 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1724 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1725 my_task_data, codeptr);
1727 if (!KMP_MASTER_TID(ds_tid))
1728 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1730 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1734 if (__kmp_tasking_mode == tskm_extra_barrier) {
1735 __kmp_tasking_barrier(team, this_thr, gtid);
1736 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1740 if (__kmp_tasking_mode != tskm_immediate_exec) {
1741 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1742 "%p, th_task_team = %p\n",
1743 __kmp_gtid_from_thread(this_thr), team_id,
1744 team->t.t_task_team[this_thr->th.th_task_state],
1745 this_thr->th.th_task_team));
1746 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1747 team->t.t_task_team[this_thr->th.th_task_state]);
1749 #endif /* KMP_DEBUG */
1751 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1752 access it when the team struct is not guaranteed to exist. Doing these
1753 loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
1754 we do not perform the copy if blocktime=infinite, since the values are not
1755 used by __kmp_wait_template() in that case. */
1756 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1758 this_thr->th.th_team_bt_intervals =
1759 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1760 this_thr->th.th_team_bt_set =
1761 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1763 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1768 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1769 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1770 #endif /* USE_ITT_BUILD */
1772 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1773 case bp_hyper_bar: {
1774 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1775 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1776 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1779 case bp_hierarchical_bar: {
1780 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1781 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1785 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1786 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1787 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1791 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1792 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1796 /* From this point on, the team data structure may be deallocated at any time
1797 by the master thread - it is unsafe to reference it in any of the worker
1798 threads. Any per-team data items that need to be referenced before the
1799 end of the barrier should be moved to the kmp_task_team_t structs. */
1800 if (KMP_MASTER_TID(tid)) {
1801 if (__kmp_tasking_mode != tskm_immediate_exec) {
1802 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1804 if (__kmp_display_affinity) {
1805 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1807 #if KMP_STATS_ENABLED
1808 // Have master thread flag the workers to indicate they are now waiting for
1809 // next parallel region, Also wake them up so they switch their timers to
1811 for (int i = 0; i < team->t.t_nproc; ++i) {
1812 kmp_info_t *team_thread = team->t.t_threads[i];
1813 if (team_thread == this_thr)
1815 team_thread->th.th_stats->setIdleFlag();
1816 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1817 team_thread->th.th_sleep_loc != NULL)
1818 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1819 team_thread->th.th_sleep_loc);
1823 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1824 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1825 #endif /* USE_ITT_BUILD */
1827 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1828 // Join barrier - report frame end
1829 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1830 __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1831 team->t.t_active_level == 1) {
1832 kmp_uint64 cur_time = __itt_get_timestamp();
1833 ident_t *loc = team->t.t_ident;
1834 kmp_info_t **other_threads = team->t.t_threads;
1835 int nproc = this_thr->th.th_team_nproc;
1837 switch (__kmp_forkjoin_frames_mode) {
1839 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1843 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1847 if (__itt_metadata_add_ptr) {
1848 // Initialize with master's wait time
1849 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1850 // Set arrive time to zero to be able to check it in
1851 // __kmp_invoke_task(); the same is done inside the loop below
1852 this_thr->th.th_bar_arrive_time = 0;
1853 for (i = 1; i < nproc; ++i) {
1854 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1855 other_threads[i]->th.th_bar_arrive_time = 0;
1857 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1858 cur_time, delta, 0);
1860 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1862 this_thr->th.th_frame_time = cur_time;
1866 #endif /* USE_ITT_BUILD */
1870 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1871 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1873 #endif /* USE_ITT_BUILD */
1876 if (KMP_MASTER_TID(tid)) {
1879 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1880 gtid, team_id, tid, nproc));
1882 #endif /* KMP_DEBUG */
1884 // TODO now, mark worker threads as done so they may be disbanded
1885 KMP_MB(); // Flush all pending memory write invalidates.
1887 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1889 ANNOTATE_BARRIER_END(&team->t.t_bar);
1892 // TODO release worker threads' fork barriers as we are ready instead of all at
// __kmp_fork_barrier -- the release half of the fork/join barrier pair.
//   gtid: global thread id of the calling thread
//   tid:  team-local thread id (0 == master)
// The master (tid == 0) prepares the team (task team setup, blocktime copy)
// and then releases the workers; workers block in the release call until the
// master signals "go", then re-sync their task team, ICVs, affinity, and
// display-affinity state before entering the new parallel region.
// NOTE(review): this extraction has elided lines (guards/braces/case labels),
// e.g. the `if (team)` guard before ANNOTATE_BARRIER_END appears missing --
// verify against the full file before editing.
1894 void __kmp_fork_barrier(int gtid, int tid) {
1895 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1896 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1897 kmp_info_t *this_thr = __kmp_threads[gtid];
// Only the master's th_team pointer is trustworthy on entry; workers may be
// between teams, so they start with team == NULL and re-read it after release.
1898 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1900 void *itt_sync_obj = NULL;
1901 #endif /* USE_ITT_BUILD */
1903 ANNOTATE_BARRIER_END(&team->t.t_bar);
1905 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1906 (team != NULL) ? team->t.t_id : -1, tid));
1908 // th_team pointer only valid for master thread here
1909 if (KMP_MASTER_TID(tid)) {
1910 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1911 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1912 // Create itt barrier object
1913 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1914 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1916 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
// Debug-only sanity sweep: every worker must still be parked at the
// fork/join barrier (b_go at its initial state, ignoring the sleep bit)
// and must still point at this team.
1919 kmp_info_t **other_threads = team->t.t_threads;
1925 for (i = 1; i < team->t.t_nproc; ++i) {
1927 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1929 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1930 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1931 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1933 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1934 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1935 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1939 if (__kmp_tasking_mode != tskm_immediate_exec) {
1940 // 0 indicates setup current task team if nthreads > 1
1941 __kmp_task_team_setup(this_thr, team, 0);
1944 /* The master thread may have changed its blocktime between the join barrier
1945 and the fork barrier. Copy the blocktime info to the thread, where
1946 __kmp_wait_template() can access it when the team struct is not
1947 guaranteed to exist. */
1948 // See note about the corresponding code in __kmp_join_barrier() being
1949 // performance-critical
// Skip the copy entirely for blocktime=infinite: the cached values are not
// read by __kmp_wait_template() in that mode and the loads are costly.
1950 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1952 this_thr->th.th_team_bt_intervals =
1953 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1954 this_thr->th.th_team_bt_set =
1955 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1957 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
// Dispatch on the configured release pattern for the fork/join barrier.
// The TRUE argument marks this as the fork (propagate-ICVs) release.
1962 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1963 case bp_hyper_bar: {
1964 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1965 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1966 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1969 case bp_hierarchical_bar: {
1970 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1971 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1975 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1976 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1977 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1981 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1982 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
// OMPT: a thread leaving an implicit-barrier wait reports the end of the
// sync region; workers additionally report the end of their implicit task.
1987 if (ompt_enabled.enabled &&
1988 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1989 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1990 ompt_data_t *task_data = (team)
1991 ? OMPT_CUR_TASK_DATA(this_thr)
1992 : &(this_thr->th.ompt_thread_info.task_data);
1993 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1995 void *codeptr = NULL;
// Only the master recovers the region's return address, and only when a
// sync-region callback is actually registered (avoids a useless load).
1996 if (KMP_MASTER_TID(ds_tid) &&
1997 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1998 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1999 codeptr = team->t.ompt_team_info.master_return_address;
2000 if (ompt_enabled.ompt_callback_sync_region_wait) {
2001 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2002 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2005 if (ompt_enabled.ompt_callback_sync_region) {
2006 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2007 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2011 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2012 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2013 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2018 // Early exit for reaping threads releasing forkjoin barrier
2019 if (TCR_4(__kmp_global.g.g_done)) {
// Library shutdown in progress: drop the task team pointer and bail out
// before touching team state that may already be deallocated.
2020 this_thr->th.th_task_team = NULL;
2022 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2023 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2024 if (!KMP_MASTER_TID(tid)) {
2025 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2027 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2030 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2031 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2035 /* We can now assume that a valid team structure has been allocated by the
2036 master and propagated to all worker threads. The current thread, however,
2037 may not be part of the team, so we can't blindly assume that the team
2038 pointer is non-null. */
// Re-read team/tid: after release, this thread may serve a different team
// or hold a different team-local id than it had on entry.
2039 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2040 KMP_DEBUG_ASSERT(team != NULL);
2041 tid = __kmp_tid_from_gtid(gtid);
2043 #if KMP_BARRIER_ICV_PULL
2044 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2045 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2046 implicit task has this data before this function is called. We cannot
2047 modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2048 struct, because it is not always the case that the threads arrays have
2049 been allocated when __kmp_fork_call() is executed. */
2051 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2052 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2053 // Copy the initial ICVs from the master's thread struct to the implicit
2054 // task for this tid.
2056 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2057 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2059 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2060 &team->t.t_threads[0]
2061 ->th.th_bar[bs_forkjoin_barrier]
2065 #endif // KMP_BARRIER_ICV_PULL
2067 if (__kmp_tasking_mode != tskm_immediate_exec) {
// Workers attach to the task team the master set up before the release.
2068 __kmp_task_team_sync(this_thr, team);
2071 #if KMP_AFFINITY_SUPPORTED
2072 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2073 if (proc_bind == proc_bind_intel) {
2074 // Call dynamic affinity settings
// Balanced affinity is recomputed only when the team size changed.
2075 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2076 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2078 } else if (proc_bind != proc_bind_false) {
2079 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2080 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2081 __kmp_gtid_from_thread(this_thr),
2082 this_thr->th.th_current_place));
2084 __kmp_affinity_set_place(gtid);
2087 #endif // KMP_AFFINITY_SUPPORTED
2088 // Perform the display affinity functionality
2089 if (__kmp_display_affinity) {
2090 if (team->t.t_display_affinity
2091 #if KMP_AFFINITY_SUPPORTED
2092 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2095 // NULL means use the affinity-format-var ICV
2096 __kmp_aux_display_affinity(gtid, NULL);
2097 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2098 this_thr->th.th_prev_level = team->t.t_level;
2101 if (!KMP_MASTER_TID(tid))
// Workers pick up the team's default allocator (update only if changed).
2102 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2104 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2105 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2106 if (!KMP_MASTER_TID(tid)) {
2107 // Get correct barrier object
2108 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2109 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2110 } // (prepare called inside barrier_release)
2112 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2113 ANNOTATE_BARRIER_END(&team->t.t_bar);
2114 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2115 team->t.t_id, tid));
// __kmp_setup_icv_copy -- stage the internal control variables (ICVs) so the
// new team's threads receive them, using the compile-time-selected strategy:
//   KMP_BARRIER_ICV_PULL: stash ICVs in the master thread's barrier struct
//     (th_fixed_icvs); workers copy them after the fork barrier.
//   KMP_BARRIER_ICV_PUSH: nothing to do here; ICVs are propagated during the
//     fork-barrier release itself.
//   otherwise: linear O(nthreads) copy into each non-master implicit task.
//   team:      team whose threads receive the ICVs
//   new_nproc: number of threads in the new team
//   new_icvs:  source ICV values to propagate
//   loc:       source location for implicit-task init (often NULL; see TODO)
2118 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2119 kmp_internal_control_t *new_icvs, ident_t *loc) {
2120 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2122 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
// Once parallel init has happened, a zero nproc ICV would be invalid.
2123 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2125 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2126 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2127 implicit task has this data before this function is called. */
2128 #if KMP_BARRIER_ICV_PULL
2129 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains
2130 untouched), where all of the worker threads can access them and make their
2131 own copies after the barrier. */
2132 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2133 // allocated at this point
2135 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2137 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2138 team->t.t_threads[0], team));
2139 #elif KMP_BARRIER_ICV_PUSH
2140 // The ICVs will be propagated in the fork barrier, so nothing needs to be
2142 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2143 team->t.t_threads[0], team));
2145 // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2148 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2149 // allocated at this point
2150 for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2151 // TODO: GEH - pass in better source location info since usually NULL here
2152 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2153 f, team->t.t_threads[f], team));
// Initialize the implicit task first so the taskdata slot is valid, then
// store the ICVs (ngo_store_icvs uses non-temporal stores on KMP_MIC,
// plain copy_icvs elsewhere -- see the macros at the top of the file).
2154 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2155 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2156 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2157 f, team->t.t_threads[f], team));
2160 #endif // KMP_BARRIER_ICV_PULL