5 //===----------------------------------------------------------------------===//
7 // The LLVM Compiler Infrastructure
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
12 //===----------------------------------------------------------------------===//
15 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
20 #include "ompt-specific.h"
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
28 #include "tsan_annotations.h"
30 #if KMP_MIC && USE_NGO_STORES
32 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
33 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
35 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
37 #define ngo_load(src) ((void)0)
38 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
39 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
40 #define ngo_sync() ((void)0)
41 #endif /* KMP_MIC && USE_NGO_STORES */
43 void __kmp_print_structure(void); // Forward declaration
45 // ---------------------------- Barrier Algorithms ----------------------------
48 static void __kmp_linear_barrier_gather(
49 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52 kmp_team_t *team = this_thr->th.th_team;
53 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54 kmp_info_t **other_threads = team->t.t_threads;
58 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59 gtid, team->t.t_id, tid, bt));
60 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63 // Barrier imbalance - save arrive time to the thread
64 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66 __itt_get_timestamp();
69 // We now perform a linear reduction to signal that all of the threads have
71 if (!KMP_MASTER_TID(tid)) {
73 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
74 "arrived(%p): %llu => %llu\n",
75 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78 // Mark arrival to master thread
79 /* After performing this write, a worker thread may not assume that the team
80 is valid any more - it could be deallocated by the master thread at any
82 ANNOTATE_BARRIER_BEGIN(this_thr);
83 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
86 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87 int nproc = this_thr->th.th_team_nproc;
89 // Don't have to worry about sleep bit here or atomic since team setting
90 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
92 // Collect all the worker team member threads.
93 for (i = 1; i < nproc; ++i) {
95 // Prefetch next thread's arrived count
97 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100 "arrived(%p) == %llu\n",
101 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
103 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
105 // Wait for worker thread to arrive
106 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
108 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109 ANNOTATE_BARRIER_END(other_threads[i]);
110 #if USE_ITT_BUILD && USE_ITT_NOTIFY
111 // Barrier imbalance - write min of the thread time and the other thread
112 // time to the thread.
113 if (__kmp_forkjoin_frames_mode == 2) {
114 this_thr->th.th_bar_min_time = KMP_MIN(
115 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
120 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
121 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
123 ANNOTATE_REDUCE_AFTER(reduce);
124 (*reduce)(this_thr->th.th_local.reduce_data,
125 other_threads[i]->th.th_local.reduce_data);
126 ANNOTATE_REDUCE_BEFORE(reduce);
127 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
130 // Don't have to worry about sleep bit here or atomic since team setting
131 team_bar->b_arrived = new_state;
132 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
133 "arrived(%p) = %llu\n",
134 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
139 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
140 gtid, team->t.t_id, tid, bt));
143 static void __kmp_linear_barrier_release(
144 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
145 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
146 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
147 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
150 if (KMP_MASTER_TID(tid)) {
152 kmp_uint32 nproc = this_thr->th.th_team_nproc;
153 kmp_info_t **other_threads;
155 team = __kmp_threads[gtid]->th.th_team;
156 KMP_DEBUG_ASSERT(team != NULL);
157 other_threads = team->t.t_threads;
159 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
161 gtid, team->t.t_id, tid, bt));
164 #if KMP_BARRIER_ICV_PUSH
166 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
167 if (propagate_icvs) {
168 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
169 for (i = 1; i < nproc; ++i) {
170 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
172 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
173 &team->t.t_implicit_task_taskdata[0].td_icvs);
178 #endif // KMP_BARRIER_ICV_PUSH
180 // Now, release all of the worker threads
181 for (i = 1; i < nproc; ++i) {
183 // Prefetch next thread's go flag
185 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
186 #endif /* KMP_CACHE_MANAGE */
189 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
190 "go(%p): %u => %u\n",
191 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
192 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
193 other_threads[i]->th.th_bar[bt].bb.b_go,
194 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
195 ANNOTATE_BARRIER_BEGIN(other_threads[i]);
196 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
201 } else { // Wait for the MASTER thread to release us
202 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
203 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
204 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
205 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
206 ANNOTATE_BARRIER_END(this_thr);
207 #if USE_ITT_BUILD && USE_ITT_NOTIFY
208 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
209 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
211 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
212 // Cancel wait on previous parallel region...
213 __kmp_itt_task_starting(itt_sync_obj);
215 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
218 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
219 if (itt_sync_obj != NULL)
220 // Call prepare as early as possible for "new" barrier
221 __kmp_itt_task_finished(itt_sync_obj);
223 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
224 // Early exit for reaping threads releasing forkjoin barrier
225 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
227 // The worker thread may now assume that the team is valid.
229 tid = __kmp_tid_from_gtid(gtid);
230 team = __kmp_threads[gtid]->th.th_team;
232 KMP_DEBUG_ASSERT(team != NULL);
233 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
235 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
236 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
237 KMP_MB(); // Flush all pending memory write invalidates.
241 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
242 gtid, team->t.t_id, tid, bt));
247 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
248 int tid, void (*reduce)(void *, void *)
249 USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
250 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
251 kmp_team_t *team = this_thr->th.th_team;
252 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
253 kmp_info_t **other_threads = team->t.t_threads;
254 kmp_uint32 nproc = this_thr->th.th_team_nproc;
255 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
256 kmp_uint32 branch_factor = 1 << branch_bits;
258 kmp_uint32 child_tid;
259 kmp_uint64 new_state;
262 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
263 gtid, team->t.t_id, tid, bt));
264 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267 // Barrier imbalance - save arrive time to the thread
268 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
269 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
270 __itt_get_timestamp();
273 // Perform tree gather to wait until all threads have arrived; reduce any
274 // required data as we go
275 child_tid = (tid << branch_bits) + 1;
276 if (child_tid < nproc) {
277 // Parent threads wait for all their children to arrive
278 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
281 kmp_info_t *child_thr = other_threads[child_tid];
282 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
284 // Prefetch next thread's arrived count
285 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
287 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
288 #endif /* KMP_CACHE_MANAGE */
290 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
291 "arrived(%p) == %llu\n",
292 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
293 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
294 // Wait for child to arrive
295 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
296 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
297 ANNOTATE_BARRIER_END(child_thr);
298 #if USE_ITT_BUILD && USE_ITT_NOTIFY
299 // Barrier imbalance - write min of the thread time and a child time to
301 if (__kmp_forkjoin_frames_mode == 2) {
302 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
303 child_thr->th.th_bar_min_time);
308 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
309 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
310 team->t.t_id, child_tid));
311 ANNOTATE_REDUCE_AFTER(reduce);
312 (*reduce)(this_thr->th.th_local.reduce_data,
313 child_thr->th.th_local.reduce_data);
314 ANNOTATE_REDUCE_BEFORE(reduce);
315 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
319 } while (child <= branch_factor && child_tid < nproc);
322 if (!KMP_MASTER_TID(tid)) { // Worker threads
323 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
326 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
327 "arrived(%p): %llu => %llu\n",
328 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
329 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
330 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
332 // Mark arrival to parent thread
333 /* After performing this write, a worker thread may not assume that the team
334 is valid any more - it could be deallocated by the master thread at any
336 ANNOTATE_BARRIER_BEGIN(this_thr);
337 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
340 // Need to update the team arrived pointer if we are the master thread
341 if (nproc > 1) // New value was already computed above
342 team->t.t_bar[bt].b_arrived = new_state;
344 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
345 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
346 "arrived(%p) = %llu\n",
347 gtid, team->t.t_id, tid, team->t.t_id,
348 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
351 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
352 gtid, team->t.t_id, tid, bt));
355 static void __kmp_tree_barrier_release(
356 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
357 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
358 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
360 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
362 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
363 kmp_uint32 branch_factor = 1 << branch_bits;
365 kmp_uint32 child_tid;
367 // Perform a tree release for all of the threads that have been gathered
369 tid)) { // Handle fork barrier workers who aren't part of a team yet
370 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
371 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
372 // Wait for parent thread to release us
373 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
374 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
375 ANNOTATE_BARRIER_END(this_thr);
376 #if USE_ITT_BUILD && USE_ITT_NOTIFY
377 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
378 // In fork barrier where we could not get the object reliably (or
379 // ITTNOTIFY is disabled)
380 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
381 // Cancel wait on previous parallel region...
382 __kmp_itt_task_starting(itt_sync_obj);
384 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
387 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
388 if (itt_sync_obj != NULL)
389 // Call prepare as early as possible for "new" barrier
390 __kmp_itt_task_finished(itt_sync_obj);
392 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
393 // Early exit for reaping threads releasing forkjoin barrier
394 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
397 // The worker thread may now assume that the team is valid.
398 team = __kmp_threads[gtid]->th.th_team;
399 KMP_DEBUG_ASSERT(team != NULL);
400 tid = __kmp_tid_from_gtid(gtid);
402 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
404 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
405 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
406 KMP_MB(); // Flush all pending memory write invalidates.
408 team = __kmp_threads[gtid]->th.th_team;
409 KMP_DEBUG_ASSERT(team != NULL);
410 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
412 gtid, team->t.t_id, tid, bt));
414 nproc = this_thr->th.th_team_nproc;
415 child_tid = (tid << branch_bits) + 1;
417 if (child_tid < nproc) {
418 kmp_info_t **other_threads = team->t.t_threads;
420 // Parent threads release all their children
422 kmp_info_t *child_thr = other_threads[child_tid];
423 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
425 // Prefetch next thread's go count
426 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
428 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
429 #endif /* KMP_CACHE_MANAGE */
431 #if KMP_BARRIER_ICV_PUSH
433 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
434 if (propagate_icvs) {
435 __kmp_init_implicit_task(team->t.t_ident,
436 team->t.t_threads[child_tid], team,
438 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
439 &team->t.t_implicit_task_taskdata[0].td_icvs);
442 #endif // KMP_BARRIER_ICV_PUSH
444 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
445 "go(%p): %u => %u\n",
446 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
447 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
448 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
449 // Release child from barrier
450 ANNOTATE_BARRIER_BEGIN(child_thr);
451 kmp_flag_64 flag(&child_bar->b_go, child_thr);
455 } while (child <= branch_factor && child_tid < nproc);
458 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
459 gtid, team->t.t_id, tid, bt));
464 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
465 int tid, void (*reduce)(void *, void *)
466 USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
467 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
468 kmp_team_t *team = this_thr->th.th_team;
469 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
470 kmp_info_t **other_threads = team->t.t_threads;
471 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
472 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
473 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
474 kmp_uint32 branch_factor = 1 << branch_bits;
480 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
481 gtid, team->t.t_id, tid, bt));
482 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
484 #if USE_ITT_BUILD && USE_ITT_NOTIFY
485 // Barrier imbalance - save arrive time to the thread
486 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
487 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
488 __itt_get_timestamp();
491 /* Perform a hypercube-embedded tree gather to wait until all of the threads
492 have arrived, and reduce any required data as we go. */
493 kmp_flag_64 p_flag(&thr_bar->b_arrived);
494 for (level = 0, offset = 1; offset < num_threads;
495 level += branch_bits, offset <<= branch_bits) {
497 kmp_uint32 child_tid;
499 if (((tid >> level) & (branch_factor - 1)) != 0) {
500 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
503 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
504 "arrived(%p): %llu => %llu\n",
505 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
506 team->t.t_id, parent_tid, &thr_bar->b_arrived,
508 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
509 // Mark arrival to parent thread
510 /* After performing this write (in the last iteration of the enclosing for
511 loop), a worker thread may not assume that the team is valid any more
512 - it could be deallocated by the master thread at any time. */
513 ANNOTATE_BARRIER_BEGIN(this_thr);
514 p_flag.set_waiter(other_threads[parent_tid]);
519 // Parent threads wait for children to arrive
520 if (new_state == KMP_BARRIER_UNUSED_STATE)
521 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
522 for (child = 1, child_tid = tid + (1 << level);
523 child < branch_factor && child_tid < num_threads;
524 child++, child_tid += (1 << level)) {
525 kmp_info_t *child_thr = other_threads[child_tid];
526 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
528 kmp_uint32 next_child_tid = child_tid + (1 << level);
529 // Prefetch next thread's arrived count
530 if (child + 1 < branch_factor && next_child_tid < num_threads)
532 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
533 #endif /* KMP_CACHE_MANAGE */
535 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
536 "arrived(%p) == %llu\n",
537 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
538 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
539 // Wait for child to arrive
540 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
541 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
542 ANNOTATE_BARRIER_END(child_thr);
543 #if USE_ITT_BUILD && USE_ITT_NOTIFY
544 // Barrier imbalance - write min of the thread time and a child time to
546 if (__kmp_forkjoin_frames_mode == 2) {
547 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
548 child_thr->th.th_bar_min_time);
553 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
554 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
555 team->t.t_id, child_tid));
556 ANNOTATE_REDUCE_AFTER(reduce);
557 (*reduce)(this_thr->th.th_local.reduce_data,
558 child_thr->th.th_local.reduce_data);
559 ANNOTATE_REDUCE_BEFORE(reduce);
560 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
565 if (KMP_MASTER_TID(tid)) {
566 // Need to update the team arrived pointer if we are the master thread
567 if (new_state == KMP_BARRIER_UNUSED_STATE)
568 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
570 team->t.t_bar[bt].b_arrived = new_state;
571 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
572 "arrived(%p) = %llu\n",
573 gtid, team->t.t_id, tid, team->t.t_id,
574 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
577 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
578 gtid, team->t.t_id, tid, bt));
581 // The reverse versions seem to beat the forward versions overall
582 #define KMP_REVERSE_HYPER_BAR
583 static void __kmp_hyper_barrier_release(
584 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
585 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
586 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
588 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
589 kmp_info_t **other_threads;
590 kmp_uint32 num_threads;
591 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
592 kmp_uint32 branch_factor = 1 << branch_bits;
594 kmp_uint32 child_tid;
598 /* Perform a hypercube-embedded tree release for all of the threads that have
599 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
600 are released in the reverse order of the corresponding gather, otherwise
601 threads are released in the same order. */
602 if (KMP_MASTER_TID(tid)) { // master
603 team = __kmp_threads[gtid]->th.th_team;
604 KMP_DEBUG_ASSERT(team != NULL);
605 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
607 gtid, team->t.t_id, tid, bt));
608 #if KMP_BARRIER_ICV_PUSH
609 if (propagate_icvs) { // master already has ICVs in final destination; copy
610 copy_icvs(&thr_bar->th_fixed_icvs,
611 &team->t.t_implicit_task_taskdata[tid].td_icvs);
614 } else { // Handle fork barrier workers who aren't part of a team yet
615 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
616 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
617 // Wait for parent thread to release us
618 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
619 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
620 ANNOTATE_BARRIER_END(this_thr);
621 #if USE_ITT_BUILD && USE_ITT_NOTIFY
622 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
623 // In fork barrier where we could not get the object reliably
624 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
625 // Cancel wait on previous parallel region...
626 __kmp_itt_task_starting(itt_sync_obj);
628 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
631 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
632 if (itt_sync_obj != NULL)
633 // Call prepare as early as possible for "new" barrier
634 __kmp_itt_task_finished(itt_sync_obj);
636 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
637 // Early exit for reaping threads releasing forkjoin barrier
638 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
641 // The worker thread may now assume that the team is valid.
642 team = __kmp_threads[gtid]->th.th_team;
643 KMP_DEBUG_ASSERT(team != NULL);
644 tid = __kmp_tid_from_gtid(gtid);
646 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
648 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
649 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
650 KMP_MB(); // Flush all pending memory write invalidates.
652 num_threads = this_thr->th.th_team_nproc;
653 other_threads = team->t.t_threads;
655 #ifdef KMP_REVERSE_HYPER_BAR
656 // Count up to correct level for parent
657 for (level = 0, offset = 1;
658 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
659 level += branch_bits, offset <<= branch_bits)
662 // Now go down from there
663 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
664 level -= branch_bits, offset >>= branch_bits)
666 // Go down the tree, level by level
667 for (level = 0, offset = 1; offset < num_threads;
668 level += branch_bits, offset <<= branch_bits)
669 #endif // KMP_REVERSE_HYPER_BAR
671 #ifdef KMP_REVERSE_HYPER_BAR
672 /* Now go in reverse order through the children, highest to lowest.
673 Initial setting of child is conservative here. */
674 child = num_threads >> ((level == 0) ? level : level - 1);
675 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
676 child_tid = tid + (child << level);
677 child >= 1; child--, child_tid -= (1 << level))
679 if (((tid >> level) & (branch_factor - 1)) != 0)
680 // No need to go lower than this, since this is the level parent would be
683 // Iterate through children on this level of the tree
684 for (child = 1, child_tid = tid + (1 << level);
685 child < branch_factor && child_tid < num_threads;
686 child++, child_tid += (1 << level))
687 #endif // KMP_REVERSE_HYPER_BAR
689 if (child_tid >= num_threads)
690 continue; // Child doesn't exist so keep going
692 kmp_info_t *child_thr = other_threads[child_tid];
693 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
695 kmp_uint32 next_child_tid = child_tid - (1 << level);
696 // Prefetch next thread's go count
697 #ifdef KMP_REVERSE_HYPER_BAR
698 if (child - 1 >= 1 && next_child_tid < num_threads)
700 if (child + 1 < branch_factor && next_child_tid < num_threads)
701 #endif // KMP_REVERSE_HYPER_BAR
703 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
704 #endif /* KMP_CACHE_MANAGE */
706 #if KMP_BARRIER_ICV_PUSH
707 if (propagate_icvs) // push my fixed ICVs to my child
708 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
709 #endif // KMP_BARRIER_ICV_PUSH
713 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
714 "go(%p): %u => %u\n",
715 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
716 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
717 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
718 // Release child from barrier
719 ANNOTATE_BARRIER_BEGIN(child_thr);
720 kmp_flag_64 flag(&child_bar->b_go, child_thr);
725 #if KMP_BARRIER_ICV_PUSH
726 if (propagate_icvs &&
727 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
728 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
730 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
731 &thr_bar->th_fixed_icvs);
736 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
737 gtid, team->t.t_id, tid, bt));
740 // Hierarchical Barrier
742 // Initialize thread barrier data
743 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
744 Performs the minimum amount of initialization required based on how the team
745 has changed. Returns true if leaf children will require both on-core and
746 traditional wake-up mechanisms. For example, if the team size increases,
747 threads already in the team will respond to on-core wakeup on their parent
748 thread, but threads newly added to the team will only be listening on the
750 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
751 kmp_bstate_t *thr_bar,
752 kmp_uint32 nproc, int gtid,
753 int tid, kmp_team_t *team) {
754 // Checks to determine if (re-)initialization is needed
755 bool uninitialized = thr_bar->team == NULL;
756 bool team_changed = team != thr_bar->team;
757 bool team_sz_changed = nproc != thr_bar->nproc;
758 bool tid_changed = tid != thr_bar->old_tid;
761 if (uninitialized || team_sz_changed) {
762 __kmp_get_hierarchy(nproc, thr_bar);
765 if (uninitialized || team_sz_changed || tid_changed) {
766 thr_bar->my_level = thr_bar->depth - 1; // default for master
767 thr_bar->parent_tid = -1; // default for master
769 tid)) { // if not master, find parent thread in hierarchy
771 while (d < thr_bar->depth) { // find parent based on level of thread in
772 // hierarchy, and note level
774 if (d == thr_bar->depth - 2) { // reached level right below the master
775 thr_bar->parent_tid = 0;
776 thr_bar->my_level = d;
778 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
779 0) { // TODO: can we make this op faster?
780 // thread is not a subtree root at next level, so this is max
781 thr_bar->parent_tid = tid - rem;
782 thr_bar->my_level = d;
788 thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
789 thr_bar->old_tid = tid;
790 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
791 thr_bar->team = team;
792 thr_bar->parent_bar =
793 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
795 if (uninitialized || team_changed || tid_changed) {
796 thr_bar->team = team;
797 thr_bar->parent_bar =
798 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
801 if (uninitialized || team_sz_changed || tid_changed) {
802 thr_bar->nproc = nproc;
803 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
804 if (thr_bar->my_level == 0)
805 thr_bar->leaf_kids = 0;
806 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
807 thr_bar->leaf_kids = nproc - tid - 1;
808 thr_bar->leaf_state = 0;
809 for (int i = 0; i < thr_bar->leaf_kids; ++i)
810 ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
815 static void __kmp_hierarchical_barrier_gather(
816 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
817 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
818 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
819 kmp_team_t *team = this_thr->th.th_team;
820 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
821 kmp_uint32 nproc = this_thr->th.th_team_nproc;
822 kmp_info_t **other_threads = team->t.t_threads;
823 kmp_uint64 new_state;
825 int level = team->t.t_level;
828 ->th.th_teams_microtask) // are we inside the teams construct?
829 if (this_thr->th.th_teams_size.nteams > 1)
830 ++level; // level was not increased in teams construct for team_of_masters
833 thr_bar->use_oncore_barrier = 1;
835 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
837 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
839 gtid, team->t.t_id, tid, bt));
840 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
843 // Barrier imbalance - save arrive time to the thread
844 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
845 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
849 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
852 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
855 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
856 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
857 thr_bar->use_oncore_barrier) {
858 if (thr_bar->leaf_kids) {
859 // First, wait for leaf children to check-in on my b_arrived flag
860 kmp_uint64 leaf_state =
862 ? thr_bar->b_arrived | thr_bar->leaf_state
863 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
864 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
866 gtid, team->t.t_id, tid));
867 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
868 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
870 ANNOTATE_REDUCE_AFTER(reduce);
871 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
873 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
875 gtid, team->t.t_id, tid,
876 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
878 ANNOTATE_BARRIER_END(other_threads[child_tid]);
879 (*reduce)(this_thr->th.th_local.reduce_data,
880 other_threads[child_tid]->th.th_local.reduce_data);
882 ANNOTATE_REDUCE_BEFORE(reduce);
883 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
885 // clear leaf_state bits
886 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
888 // Next, wait for higher level children on each child's b_arrived flag
889 for (kmp_uint32 d = 1; d < thr_bar->my_level;
890 ++d) { // gather lowest level threads first, but skip 0
891 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
892 skip = thr_bar->skip_per_level[d];
895 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
896 kmp_info_t *child_thr = other_threads[child_tid];
897 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
898 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
900 "arrived(%p) == %llu\n",
901 gtid, team->t.t_id, tid,
902 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
903 child_tid, &child_bar->b_arrived, new_state));
904 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
905 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
906 ANNOTATE_BARRIER_END(child_thr);
908 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
910 gtid, team->t.t_id, tid,
911 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
913 ANNOTATE_REDUCE_AFTER(reduce);
914 (*reduce)(this_thr->th.th_local.reduce_data,
915 child_thr->th.th_local.reduce_data);
916 ANNOTATE_REDUCE_BEFORE(reduce);
917 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
921 } else { // Blocktime is not infinite
922 for (kmp_uint32 d = 0; d < thr_bar->my_level;
923 ++d) { // Gather lowest level threads first
924 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
925 skip = thr_bar->skip_per_level[d];
928 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
929 kmp_info_t *child_thr = other_threads[child_tid];
930 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
931 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
933 "arrived(%p) == %llu\n",
934 gtid, team->t.t_id, tid,
935 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
936 child_tid, &child_bar->b_arrived, new_state));
937 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
938 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
939 ANNOTATE_BARRIER_END(child_thr);
941 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
943 gtid, team->t.t_id, tid,
944 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
946 ANNOTATE_REDUCE_AFTER(reduce);
947 (*reduce)(this_thr->th.th_local.reduce_data,
948 child_thr->th.th_local.reduce_data);
949 ANNOTATE_REDUCE_BEFORE(reduce);
950 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
956 // All subordinates are gathered; now release parent if not master thread
958 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
959 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
960 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
961 gtid, team->t.t_id, tid,
962 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
963 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
964 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
965 /* Mark arrival to parent: After performing this write, a worker thread may
966 not assume that the team is valid any more - it could be deallocated by
967 the master thread at any time. */
968 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
969 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
971 ANNOTATE_BARRIER_BEGIN(this_thr);
972 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
975 // Leaf does special release on "offset" bits of parent's b_arrived flag
976 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
977 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
978 flag.set_waiter(other_threads[thr_bar->parent_tid]);
981 } else { // Master thread needs to update the team's b_arrived value
982 team->t.t_bar[bt].b_arrived = new_state;
983 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
984 "arrived(%p) = %llu\n",
985 gtid, team->t.t_id, tid, team->t.t_id,
986 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
988 // Is the team access below unsafe or just technically invalid?
989 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
991 gtid, team->t.t_id, tid, bt));
994 static void __kmp_hierarchical_barrier_release(
995 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
996 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
997 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
999 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1001 bool team_change = false; // indicates on-core barrier shouldn't be used
1003 if (KMP_MASTER_TID(tid)) {
1004 team = __kmp_threads[gtid]->th.th_team;
1005 KMP_DEBUG_ASSERT(team != NULL);
1006 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1007 "entered barrier type %d\n",
1008 gtid, team->t.t_id, tid, bt));
1009 } else { // Worker threads
1010 // Wait for parent thread to release me
1011 if (!thr_bar->use_oncore_barrier ||
1012 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1013 thr_bar->team == NULL) {
1014 // Use traditional method of waiting on my own b_go flag
1015 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1016 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1017 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1018 ANNOTATE_BARRIER_END(this_thr);
1019 TCW_8(thr_bar->b_go,
1020 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1021 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1022 // infinite, not nested
1023 // Wait on my "offset" bits on parent's b_go flag
1024 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1025 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1026 thr_bar->offset, bt,
1027 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1028 flag.wait(this_thr, TRUE);
1029 if (thr_bar->wait_flag ==
1030 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1031 TCW_8(thr_bar->b_go,
1032 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1033 } else { // Reset my bits on parent's b_go flag
1034 (RCAST(volatile char *,
1035 &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1038 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1039 // Early exit for reaping threads releasing forkjoin barrier
1040 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1042 // The worker thread may now assume that the team is valid.
1043 team = __kmp_threads[gtid]->th.th_team;
1044 KMP_DEBUG_ASSERT(team != NULL);
1045 tid = __kmp_tid_from_gtid(gtid);
1049 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1050 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1051 KMP_MB(); // Flush all pending memory write invalidates.
1054 nproc = this_thr->th.th_team_nproc;
1055 int level = team->t.t_level;
1057 if (team->t.t_threads[0]
1058 ->th.th_teams_microtask) { // are we inside the teams construct?
1059 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1060 this_thr->th.th_teams_level == level)
1061 ++level; // level was not increased in teams construct for team_of_workers
1062 if (this_thr->th.th_teams_size.nteams > 1)
1063 ++level; // level was not increased in teams construct for team_of_masters
1067 thr_bar->use_oncore_barrier = 1;
1069 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1071 // If the team size has increased, we still communicate with old leaves via
1073 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1074 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1075 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1077 // But if the entire team changes, we won't use oncore barrier at all
1081 #if KMP_BARRIER_ICV_PUSH
1082 if (propagate_icvs) {
1083 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1086 tid)) { // master already has copy in final destination; copy
1087 copy_icvs(&thr_bar->th_fixed_icvs,
1088 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1089 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1090 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1091 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1092 // leaves (on-core children) pull parent's fixed ICVs directly to local
1094 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1095 &thr_bar->parent_bar->th_fixed_icvs);
1096 // non-leaves will get ICVs piggybacked with b_go via NGO store
1097 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1098 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1100 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1101 else // leaves copy parent's fixed ICVs directly to local ICV store
1102 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1103 &thr_bar->parent_bar->th_fixed_icvs);
1106 #endif // KMP_BARRIER_ICV_PUSH
1108 // Now, release my children
1109 if (thr_bar->my_level) { // not a leaf
1110 kmp_int32 child_tid;
1112 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1113 thr_bar->use_oncore_barrier) {
1114 if (KMP_MASTER_TID(tid)) { // do a flat release
1115 // Set local b_go to bump children via NGO store of the cache line
1116 // containing IVCs and b_go.
1117 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1118 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1120 ngo_load(&thr_bar->th_fixed_icvs);
1121 // This loops over all the threads skipping only the leaf nodes in the
1123 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1124 child_tid += thr_bar->skip_per_level[1]) {
1125 kmp_bstate_t *child_bar =
1126 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1127 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1128 "releasing T#%d(%d:%d)"
1129 " go(%p): %u => %u\n",
1130 gtid, team->t.t_id, tid,
1131 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1132 child_tid, &child_bar->b_go, child_bar->b_go,
1133 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1134 // Use ngo store (if available) to both store ICVs and release child
1136 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1140 TCW_8(thr_bar->b_go,
1141 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1142 // Now, release leaf children
1143 if (thr_bar->leaf_kids) { // if there are any
1144 // We test team_change on the off-chance that the level 1 team changed.
1146 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1147 if (old_leaf_kids) { // release old leaf kids
1148 thr_bar->b_go |= old_leaf_state;
1150 // Release new leaf kids
1151 last = tid + thr_bar->skip_per_level[1];
1154 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1155 ++child_tid) { // skip_per_level[0]=1
1156 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1157 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1160 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1161 " T#%d(%d:%d) go(%p): %u => %u\n",
1162 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1163 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1164 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1165 // Release child using child's b_go flag
1166 ANNOTATE_BARRIER_BEGIN(child_thr);
1167 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1170 } else { // Release all children at once with leaf_state bits on my own
1172 thr_bar->b_go |= thr_bar->leaf_state;
1175 } else { // Blocktime is not infinite; do a simple hierarchical release
1176 for (int d = thr_bar->my_level - 1; d >= 0;
1177 --d) { // Release highest level threads first
1178 last = tid + thr_bar->skip_per_level[d + 1];
1179 kmp_uint32 skip = thr_bar->skip_per_level[d];
1182 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1183 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1184 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1185 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1186 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1187 gtid, team->t.t_id, tid,
1188 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1189 child_tid, &child_bar->b_go, child_bar->b_go,
1190 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1191 // Release child using child's b_go flag
1192 ANNOTATE_BARRIER_BEGIN(child_thr);
1193 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1198 #if KMP_BARRIER_ICV_PUSH
1199 if (propagate_icvs && !KMP_MASTER_TID(tid))
1200 // non-leaves copy ICVs from fixed ICVs to local dest
1201 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1202 &thr_bar->th_fixed_icvs);
1203 #endif // KMP_BARRIER_ICV_PUSH
1205 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1206 "barrier type %d\n",
1207 gtid, team->t.t_id, tid, bt));
1210 // End of Barrier Algorithms
1212 // Internal function to do a barrier.
1213 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1214 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1216 Returns 0 if master thread, 1 if worker thread. */
1217 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1218 size_t reduce_size, void *reduce_data,
1219 void (*reduce)(void *, void *)) {
1220 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1221 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1222 int tid = __kmp_tid_from_gtid(gtid);
1223 kmp_info_t *this_thr = __kmp_threads[gtid];
1224 kmp_team_t *team = this_thr->th.th_team;
1226 #if OMPT_SUPPORT && OMPT_OPTIONAL
1227 ompt_data_t *my_task_data;
1228 ompt_data_t *my_parallel_data;
1229 void *return_address;
1232 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1233 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1235 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1237 if (ompt_enabled.enabled) {
1239 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1240 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1241 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1242 if (ompt_enabled.ompt_callback_sync_region) {
1243 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1244 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1245 my_task_data, return_address);
1247 if (ompt_enabled.ompt_callback_sync_region_wait) {
1248 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1249 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1250 my_task_data, return_address);
1253 // It is OK to report the barrier state after the barrier begin callback.
1254 // According to the OMPT specification, a compliant implementation may
1255 // even delay reporting this state until the barrier begins to wait.
1256 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1260 if (!team->t.t_serialized) {
1262 // This value will be used in itt notify events below.
1263 void *itt_sync_obj = NULL;
1265 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1266 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1268 #endif /* USE_ITT_BUILD */
1269 if (__kmp_tasking_mode == tskm_extra_barrier) {
1270 __kmp_tasking_barrier(team, this_thr, gtid);
1272 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1273 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1276 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1277 access it when the team struct is not guaranteed to exist. */
1278 // See note about the corresponding code in __kmp_join_barrier() being
1279 // performance-critical.
1280 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1282 this_thr->th.th_team_bt_intervals =
1283 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1284 this_thr->th.th_team_bt_set =
1285 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1287 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1292 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1293 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1294 #endif /* USE_ITT_BUILD */
1296 // Let the debugger know: the thread arrived to the barrier and waiting.
1297 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1298 team->t.t_bar[bt].b_master_arrived += 1;
1300 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1302 #endif /* USE_DEBUGGER */
1303 if (reduce != NULL) {
1304 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1305 this_thr->th.th_local.reduce_data = reduce_data;
1308 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1309 __kmp_task_team_setup(
1311 0); // use 0 to only setup the current team if nthreads > 1
1313 switch (__kmp_barrier_gather_pattern[bt]) {
1314 case bp_hyper_bar: {
1315 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1317 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1318 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1321 case bp_hierarchical_bar: {
1322 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid,
1323 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1327 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits
1329 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1330 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1334 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1335 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1341 if (KMP_MASTER_TID(tid)) {
1343 if (__kmp_tasking_mode != tskm_immediate_exec) {
1344 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1347 // Let the debugger know: All threads are arrived and starting leaving the
1349 team->t.t_bar[bt].b_team_arrived += 1;
1353 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1354 // Reset cancellation flag for worksharing constructs
1355 if (cancel_request == cancel_loop || cancel_request == cancel_sections) {
1356 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1360 /* TODO: In case of split reduction barrier, master thread may send
1361 acquired event early, before the final summation into the shared
1362 variable is done (final summation can be a long operation for array
1364 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1365 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1366 #endif /* USE_ITT_BUILD */
1367 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1368 // Barrier - report frame end (only if active_level == 1)
1369 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1370 __kmp_forkjoin_frames_mode &&
1372 this_thr->th.th_teams_microtask == NULL &&
1374 team->t.t_active_level == 1) {
1375 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1376 kmp_uint64 cur_time = __itt_get_timestamp();
1377 kmp_info_t **other_threads = team->t.t_threads;
1378 int nproc = this_thr->th.th_team_nproc;
1380 switch (__kmp_forkjoin_frames_mode) {
1382 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1384 this_thr->th.th_frame_time = cur_time;
1386 case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1388 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1392 if (__itt_metadata_add_ptr) {
1393 // Initialize with master's wait time
1394 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1395 // Set arrive time to zero to be able to check it in
1396 // __kmp_invoke_task(); the same is done inside the loop below
1397 this_thr->th.th_bar_arrive_time = 0;
1398 for (i = 1; i < nproc; ++i) {
1399 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1400 other_threads[i]->th.th_bar_arrive_time = 0;
1402 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1404 (kmp_uint64)(reduce != NULL));
1406 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1408 this_thr->th.th_frame_time = cur_time;
1412 #endif /* USE_ITT_BUILD */
1416 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1417 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1418 #endif /* USE_ITT_BUILD */
1420 if (status == 1 || !is_split) {
1421 switch (__kmp_barrier_release_pattern[bt]) {
1422 case bp_hyper_bar: {
1423 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1424 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1425 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1428 case bp_hierarchical_bar: {
1429 __kmp_hierarchical_barrier_release(
1430 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1434 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1435 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1436 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1440 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1441 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1444 if (__kmp_tasking_mode != tskm_immediate_exec) {
1445 __kmp_task_team_sync(this_thr, team);
1450 /* GEH: TODO: Move this under if-condition above and also include in
1451 __kmp_end_split_barrier(). This will more accurately represent the actual
1452 release time of the threads for split barriers. */
1453 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1454 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1455 #endif /* USE_ITT_BUILD */
1456 } else { // Team is serialized.
1458 if (__kmp_tasking_mode != tskm_immediate_exec) {
1460 if (this_thr->th.th_task_team != NULL) {
1462 void *itt_sync_obj = NULL;
1463 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1464 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1465 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1469 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1471 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1472 __kmp_task_team_setup(this_thr, team, 0);
1475 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1476 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1477 #endif /* USE_ITT_BUILD */
1480 // The task team should be NULL for serialized code (tasks will be
1481 // executed immediately)
1482 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1483 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1487 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1488 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1489 __kmp_tid_from_gtid(gtid), status));
1492 if (ompt_enabled.enabled) {
1494 if (ompt_enabled.ompt_callback_sync_region_wait) {
1495 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1496 ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1497 my_task_data, return_address);
1499 if (ompt_enabled.ompt_callback_sync_region) {
1500 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1501 ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1502 my_task_data, return_address);
1505 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1508 ANNOTATE_BARRIER_END(&team->t.t_bar);
1513 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1514 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1515 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1516 int tid = __kmp_tid_from_gtid(gtid);
1517 kmp_info_t *this_thr = __kmp_threads[gtid];
1518 kmp_team_t *team = this_thr->th.th_team;
1520 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1521 if (!team->t.t_serialized) {
1522 if (KMP_MASTER_GTID(gtid)) {
1523 switch (__kmp_barrier_release_pattern[bt]) {
1524 case bp_hyper_bar: {
1525 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1526 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1527 FALSE USE_ITT_BUILD_ARG(NULL));
1530 case bp_hierarchical_bar: {
1531 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1532 FALSE USE_ITT_BUILD_ARG(NULL));
1536 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1537 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1538 FALSE USE_ITT_BUILD_ARG(NULL));
1542 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1543 FALSE USE_ITT_BUILD_ARG(NULL));
1546 if (__kmp_tasking_mode != tskm_immediate_exec) {
1547 __kmp_task_team_sync(this_thr, team);
1551 ANNOTATE_BARRIER_END(&team->t.t_bar);
1554 void __kmp_join_barrier(int gtid) {
1555 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1556 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1557 kmp_info_t *this_thr = __kmp_threads[gtid];
1560 kmp_info_t *master_thread;
1564 #endif /* KMP_DEBUG */
1566 void *itt_sync_obj = NULL;
1568 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1569 // Get object created at fork_barrier
1570 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1572 #endif /* USE_ITT_BUILD */
1576 team = this_thr->th.th_team;
1577 nproc = this_thr->th.th_team_nproc;
1578 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1579 tid = __kmp_tid_from_gtid(gtid);
1581 team_id = team->t.t_id;
1582 #endif /* KMP_DEBUG */
1583 master_thread = this_thr->th.th_team_master;
1585 if (master_thread != team->t.t_threads[0]) {
1586 __kmp_print_structure();
1588 #endif /* KMP_DEBUG */
1589 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1593 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1594 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1595 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1596 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1597 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1598 gtid, team_id, tid));
1600 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1602 if (ompt_enabled.enabled) {
1604 ompt_data_t *my_task_data;
1605 ompt_data_t *my_parallel_data;
1606 void *codeptr = NULL;
1607 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1608 if (KMP_MASTER_TID(ds_tid) &&
1609 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1610 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1611 codeptr = team->t.ompt_team_info.master_return_address;
1612 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1613 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1614 if (ompt_enabled.ompt_callback_sync_region) {
1615 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1616 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1617 my_task_data, codeptr);
1619 if (ompt_enabled.ompt_callback_sync_region_wait) {
1620 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1621 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1622 my_task_data, codeptr);
1624 if (!KMP_MASTER_TID(ds_tid))
1625 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1627 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1631 if (__kmp_tasking_mode == tskm_extra_barrier) {
1632 __kmp_tasking_barrier(team, this_thr, gtid);
1633 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1637 if (__kmp_tasking_mode != tskm_immediate_exec) {
1638 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1639 "%p, th_task_team = %p\n",
1640 __kmp_gtid_from_thread(this_thr), team_id,
1641 team->t.t_task_team[this_thr->th.th_task_state],
1642 this_thr->th.th_task_team));
1643 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1644 team->t.t_task_team[this_thr->th.th_task_state]);
1646 #endif /* KMP_DEBUG */
1648 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1649 access it when the team struct is not guaranteed to exist. Doing these
1650 loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
1651 we do not perform the copy if blocktime=infinite, since the values are not
1652 used by __kmp_wait_template() in that case. */
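/* Illustrative example (not exhaustive): KMP_BLOCKTIME=infinite, or
   OMP_WAIT_POLICY=active, sets __kmp_dflt_blocktime to KMP_MAX_BLOCKTIME, so
   the copy below is skipped and waiting threads simply keep spinning. */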
1653 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1655 this_thr->th.th_team_bt_intervals =
1656 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1657 this_thr->th.th_team_bt_set =
1658 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1660 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1665 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1666 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1667 #endif /* USE_ITT_BUILD */
1669 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1670 case bp_hyper_bar: {
1671 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1672 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1673 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1676 case bp_hierarchical_bar: {
1677 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1678 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1682 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1683 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1684 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1688 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1689 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
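  // (The gather pattern used above is selected per barrier type; it is normally
  // configured through environment variables -- e.g. KMP_FORKJOIN_BARRIER_PATTERN,
  // assumed syntax "gather,release" such as "hyper,hyper" or "linear,linear".)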
1693 /* From this point on, the team data structure may be deallocated at any time
1694 by the master thread - it is unsafe to reference it in any of the worker
1695 threads. Any per-team data items that need to be referenced before the
1696 end of the barrier should be moved to the kmp_task_team_t structs. */
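  // Below this point only the master thread, which still owns the team,
  // dereferences the team pointer.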
1697 if (KMP_MASTER_TID(tid)) {
1698 if (__kmp_tasking_mode != tskm_immediate_exec) {
1699 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1702 if (__kmp_display_affinity) {
1703 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1706 #if KMP_STATS_ENABLED
1707 // Have the master thread flag the workers to indicate they are now waiting for
1708 // the next parallel region. Also wake them up so they switch their timers to idle.
1710 for (int i = 0; i < team->t.t_nproc; ++i) {
1711 kmp_info_t *team_thread = team->t.t_threads[i];
1712 if (team_thread == this_thr)
1714 team_thread->th.th_stats->setIdleFlag();
1715 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1716 team_thread->th.th_sleep_loc != NULL)
1717 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1718 team_thread->th.th_sleep_loc);
1722 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1723 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1724 #endif /* USE_ITT_BUILD */
1726 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1727 // Join barrier - report frame end
1728 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1729 __kmp_forkjoin_frames_mode &&
1731 this_thr->th.th_teams_microtask == NULL &&
1733 team->t.t_active_level == 1) {
1734 kmp_uint64 cur_time = __itt_get_timestamp();
1735 ident_t *loc = team->t.t_ident;
1736 kmp_info_t **other_threads = team->t.t_threads;
1737 int nproc = this_thr->th.th_team_nproc;
1739 switch (__kmp_forkjoin_frames_mode) {
1741 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1745 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1749 if (__itt_metadata_add_ptr) {
1750 // Initialize with master's wait time
1751 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1752 // Set arrive time to zero to be able to check it in
1753 // __kmp_invoke_task(); the same is done inside the loop below
1754 this_thr->th.th_bar_arrive_time = 0;
1755 for (i = 1; i < nproc; ++i) {
1756 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1757 other_threads[i]->th.th_bar_arrive_time = 0;
1759 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1760 cur_time, delta, 0);
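      // (Worked example, illustrative only: with cur_time == 20 and per-thread
      //  arrive times 10, 12, 14 and 16 across four threads, delta =
      //  10 + 8 + 6 + 4 = 28 timestamp units of accumulated imbalance.)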
1762 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1764 this_thr->th.th_frame_time = cur_time;
1768 #endif /* USE_ITT_BUILD */
1772 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1773 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1775 #endif /* USE_ITT_BUILD */
1778 if (KMP_MASTER_TID(tid)) {
1781 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1782 gtid, team_id, tid, nproc));
1784 #endif /* KMP_DEBUG */
1786 // TODO: mark worker threads as done at this point so they may be disbanded
1787 KMP_MB(); // Flush all pending memory write invalidates.
1789 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1791 ANNOTATE_BARRIER_END(&team->t.t_bar);
1794 // TODO: release worker threads' fork barriers as we are ready instead of all at once
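// The fork barrier is where workers wait between parallel regions: the master
// releases them here after setting up the new team, and on the way out each
// worker picks up the team pointer, ICVs, task team and affinity settings.
//
// Illustrative user code driving this path (hypothetical example):
//   #pragma omp parallel   // fork: __kmp_fork_barrier() releases the workers
//   { work(); }            // implicit join: __kmp_join_barrier() gathers them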
1796 void __kmp_fork_barrier(int gtid, int tid) {
1797 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1798 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1799 kmp_info_t *this_thr = __kmp_threads[gtid];
1800 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1802 void *itt_sync_obj = NULL;
1803 #endif /* USE_ITT_BUILD */
1805 ANNOTATE_BARRIER_END(&team->t.t_bar);
1807 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1808 (team != NULL) ? team->t.t_id : -1, tid));
1810 // The th_team pointer is only valid for the master thread here
1811 if (KMP_MASTER_TID(tid)) {
1812 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1813 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1814 // Create itt barrier object
1815 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1816 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1818 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1821 kmp_info_t **other_threads = team->t.t_threads;
1827 for (i = 1; i < team->t.t_nproc; ++i) {
1829 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1831 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1832 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1833 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1835 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1836 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1837 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1841 if (__kmp_tasking_mode != tskm_immediate_exec) {
1842 // The third argument (0) means: only set up the current task team if nthreads > 1
1843 __kmp_task_team_setup(this_thr, team, 0);
1846 /* The master thread may have changed its blocktime between the join barrier
1847 and the fork barrier. Copy the blocktime info to the thread, where
1848 __kmp_wait_template() can access it when the team struct is not
1849 guaranteed to exist. */
1850 // See note about the corresponding code in __kmp_join_barrier() being
1851 // performance-critical
1852 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1854 this_thr->th.th_team_bt_intervals =
1855 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1856 this_thr->th.th_team_bt_set =
1857 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1859 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1864 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1865 case bp_hyper_bar: {
1866 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1867 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1868 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1871 case bp_hierarchical_bar: {
1872 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1873 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1877 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1878 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1879 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1883 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1884 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
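  // Note: all of the release calls above pass TRUE for propagate_icvs; with
  // KMP_BARRIER_ICV_PUSH the release algorithm also pushes the master's ICVs
  // out to the workers as part of the release.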
1889 if (ompt_enabled.enabled &&
1890 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1891 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1892 ompt_data_t *task_data = (team)
1893 ? OMPT_CUR_TASK_DATA(this_thr)
1894 : &(this_thr->th.ompt_thread_info.task_data);
1895 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1897 void *codeptr = NULL;
1898 if (KMP_MASTER_TID(ds_tid) &&
1899 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1900 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1901 codeptr = team->t.ompt_team_info.master_return_address;
1902 if (ompt_enabled.ompt_callback_sync_region_wait) {
1903 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1904 ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1906 if (ompt_enabled.ompt_callback_sync_region) {
1907 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1908 ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
1911 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
1912 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1913 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1918 // Early exit for threads that are being reaped after being released from the forkjoin barrier
1919 if (TCR_4(__kmp_global.g.g_done)) {
1920 this_thr->th.th_task_team = NULL;
1922 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1923 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1924 if (!KMP_MASTER_TID(tid)) {
1925 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1927 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1930 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1931 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1935 /* We can now assume that a valid team structure has been allocated by the
1936 master and propagated to all worker threads. The current thread, however,
1937 may not be part of the team, so we can't blindly assume that the team
1938 pointer is non-null. */
1939 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1940 KMP_DEBUG_ASSERT(team != NULL);
1941 tid = __kmp_tid_from_gtid(gtid);
1943 #if KMP_BARRIER_ICV_PULL
1944 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1945 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
1946 implicit task has this data before this function is called. We cannot
1947 modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
1948 struct, because it is not always the case that the threads arrays have
1949 been allocated when __kmp_fork_call() is executed. */
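  /* (Contrast with KMP_BARRIER_ICV_PUSH, where the master pushes the ICVs to
     the workers during the fork-barrier release instead of each worker pulling
     them here.) */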
1951 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1952 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1953 // Copy the initial ICVs from the master's thread struct to the implicit
1954 // task for this tid.
1956 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1957 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
1959 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1960 &team->t.t_threads[0]
1961 ->th.th_bar[bs_forkjoin_barrier]
1965 #endif // KMP_BARRIER_ICV_PULL
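  // Attach this thread to the task team the master set up for the new region.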
1967 if (__kmp_tasking_mode != tskm_immediate_exec) {
1968 __kmp_task_team_sync(this_thr, team);
1971 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1972 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1973 if (proc_bind == proc_bind_intel) {
1975 #if KMP_AFFINITY_SUPPORTED
1976 // Call dynamic affinity settings
1977 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1978 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
1980 #endif // KMP_AFFINITY_SUPPORTED
1981 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1982 } else if (proc_bind != proc_bind_false) {
1983 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1984 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1985 __kmp_gtid_from_thread(this_thr),
1986 this_thr->th.th_current_place));
1988 __kmp_affinity_set_place(gtid);
1993 // Perform the display affinity functionality
1994 if (__kmp_display_affinity) {
1995 if (team->t.t_display_affinity
1996 #if KMP_AFFINITY_SUPPORTED
1997 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2000 // NULL means use the affinity-format-var ICV
2001 __kmp_aux_display_affinity(gtid, NULL);
2002 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2003 this_thr->th.th_prev_level = team->t.t_level;
2006 if (!KMP_MASTER_TID(tid))
2007 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2010 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2011 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2012 if (!KMP_MASTER_TID(tid)) {
2013 // Get correct barrier object
2014 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2015 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2016 } // (prepare called inside barrier_release)
2018 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2019 ANNOTATE_BARRIER_END(&team->t.t_bar);
2020 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2021 team->t.t_id, tid));
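// Prepare ICV propagation for a (re)initialized team. Depending on the
// configured mechanism this either publishes the ICVs where the workers can
// pull them (KMP_BARRIER_ICV_PULL), defers the copy to the fork-barrier
// release (KMP_BARRIER_ICV_PUSH), or copies them into every worker's implicit
// task right away (linear copy).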
2024 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2025 kmp_internal_control_t *new_icvs, ident_t *loc) {
2026 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2028 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2029 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2031 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2032 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2033 implicit task has this data before this function is called. */
2034 #if KMP_BARRIER_ICV_PULL
2035 /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
2036    remains untouched), where all of the worker threads can access them and
2037    make their own copies after the barrier. */
2038 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2039 // allocated at this point
2041 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2043 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2044 team->t.t_threads[0], team));
2045 #elif KMP_BARRIER_ICV_PUSH
2046 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
2048 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2049 team->t.t_threads[0], team));
2051 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
2054 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2055 // allocated at this point
2056 for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2057 // TODO: GEH - pass in better source location info since usually NULL here
2058 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2059 f, team->t.t_threads[f], team));
2060 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2061 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2062 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2063 f, team->t.t_threads[f], team));
2066 #endif // KMP_BARRIER_ICV_PULL