1 #ifndef KMP_DISPATCH_HIER_H
2 #define KMP_DISPATCH_HIER_H
4 #include "kmp_dispatch.h"
6 // Layer type for scheduling hierarchy
7 enum kmp_hier_layer_e {
17 // Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
// Maps a kmp_hier_layer_e value to a human-readable C string, used by the
// KD_TRACE printing below and the HierSchedInvalid warning in allocate_hier().
// NOTE(review): the switch header, the return statements for each case, and
// the closing braces appear elided in this extraction (embedded line-number
// gaps 19, 21, 23, ...) — confirm against the full kmp_dispatch_hier.h.
18 static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
20 case kmp_hier_layer_e::LAYER_THREAD:
22 case kmp_hier_layer_e::LAYER_L1:
24 case kmp_hier_layer_e::LAYER_L2:
26 case kmp_hier_layer_e::LAYER_L3:
28 case kmp_hier_layer_e::LAYER_NUMA:
30 case kmp_hier_layer_e::LAYER_LOOP:
32 case kmp_hier_layer_e::LAYER_LAST:
36 // Appease compilers, should never get here
40 // Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
// Holds the (layer, schedule, chunk) triples parsed from OMP_SCHEDULE for the
// scheduling hierarchy. All four parallel arrays are lazily allocated to
// LAYER_LAST entries on the first append() and are indexed together: entry i
// describes hierarchy level i. Chunks are kept in both 32- and 64-bit form so
// either loop index width can consume them.
// NOTE(review): the size/capacity field declarations and several braces/else
// lines appear elided in this extraction — verify against the full header.
41 typedef struct kmp_hier_sched_env_t {
44 enum sched_type *scheds;
45 kmp_int32 *small_chunks;
46 kmp_int64 *large_chunks;
47 kmp_hier_layer_e *layers;
48 // Append a level of the hierarchy
49 void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
// First call: allocate all four arrays with room for every possible layer
// (LAYER_LAST); capacity tracks that upper bound.
51 scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
52 kmp_hier_layer_e::LAYER_LAST);
53 small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
54 kmp_hier_layer_e::LAYER_LAST);
55 large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
56 kmp_hier_layer_e::LAYER_LAST);
57 layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
58 kmp_hier_layer_e::LAYER_LAST);
59 capacity = kmp_hier_layer_e::LAYER_LAST;
// Record the new level at the current tail; the chunk is stored in both
// widths so callers never need to re-convert.
61 int current_size = size;
62 KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
63 scheds[current_size] = sched;
64 layers[current_size] = layer;
65 small_chunks[current_size] = chunk;
66 large_chunks[current_size] = (kmp_int64)chunk;
69 // Sort the hierarchy using selection sort, size will always be small
70 // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
// Selection sort keyed on the layer enum value; the three companion arrays
// are swapped in lockstep so entry i stays internally consistent.
74 for (int i = 0; i < size; ++i) {
76 for (int j = i + 1; j < size; ++j) {
77 if (layers[j] < layers[switch_index])
80 if (switch_index != i) {
81 kmp_hier_layer_e temp1 = layers[i];
82 enum sched_type temp2 = scheds[i];
83 kmp_int32 temp3 = small_chunks[i];
84 kmp_int64 temp4 = large_chunks[i];
85 layers[i] = layers[switch_index];
86 scheds[i] = scheds[switch_index];
87 small_chunks[i] = small_chunks[switch_index];
88 large_chunks[i] = large_chunks[switch_index];
89 layers[switch_index] = temp1;
90 scheds[switch_index] = temp2;
91 small_chunks[switch_index] = temp3;
92 large_chunks[switch_index] = temp4;
// Deallocation of the parallel arrays (scheds/layers frees presumably sit in
// the elided lines 99-110 — TODO confirm all four arrays are freed).
101 __kmp_free(small_chunks);
102 __kmp_free(large_chunks);
111 } kmp_hier_sched_env_t;
113 extern int __kmp_dispatch_hand_threading;
114 extern kmp_hier_sched_env_t __kmp_hier_scheds;
116 // Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
117 extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
118 extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
120 extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
121 extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
122 extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
123 kmp_hier_layer_e t2);
124 extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
// Double-buffered shared data attached to one hierarchy unit. Every field is
// a 2-element array indexed by a thread-private 0/1 index (tdata->index): the
// "curr" slot [index] is read this round while the "next" slot [1 - index] is
// written for the following round, so producers and consumers never race on
// the same slot. val[] is the barrier word the barrier implementations spin
// on; sh[] are the dispatch shared buffers handed to
// __kmp_dispatch_next_algorithm().
// NOTE(review): the lb/ub/st/status member declarations and several closing
// braces appear elided in this extraction — confirm against the full header.
126 template <typename T> struct kmp_hier_shared_bdata_t {
127 typedef typename traits_t<T>::signed_t ST;
128 volatile kmp_uint64 val[2];
133 dispatch_shared_info_template<T> sh[2];
// (zero_*/reset routine — presumably clears both buffers; header elided)
136 status[0] = status[1] = 0;
140 sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
// Publish bounds/stride/status into the *next* (1 - index) slot. The hand-
// threading variant does not reset sh[] because that mode only ever uses
// sh[0] (see kmp_hier_t::next()).
142 void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
147 status[1 - index] = nstatus;
149 void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
153 status[1 - index] = nstatus;
// Reset the next shared buffer's iteration counter so the next round's
// dispatch starts from scratch.
154 sh[1 - index].u.s.iteration = 0;
// Accessors for the next-round (1 - index) slot...
157 kmp_int32 get_next_status(kmp_uint64 index) const {
158 return status[1 - index];
160 T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
161 T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
162 ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
163 dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
164 return &(sh[1 - index]);
// ...and for the current-round (index) slot.
167 kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
168 T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
169 T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
170 ST get_curr_st(kmp_uint64 index) const { return st[index]; }
171 dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
177 * In the barrier implementations, num_active is the number of threads that are
178 * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
179 * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
180 * structure. tdata is the thread private data that resides on the thread
183 * The reset_shared() method is used to initialize the barrier data on the
184 * kmp_hier_top_unit_t hierarchy structure
186 * The reset_private() method is used to initialize the barrier data on the
187 * thread's private dispatch buffer structure
189 * The barrier() method takes an id, which is that thread's id for the
190 * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait
191 * inside barrier() until all fellow threads who are attached to that
192 * kmp_hier_top_unit_t structure have arrived.
195 // Core barrier implementation
196 // Can be used in a unit with between 2 to 8 threads
// Barrier for 2..8 threads: each arriving thread writes the byte 0x01 into
// its own slot of a shared 64-bit word, then spins until the word equals the
// wait value below (num_active bytes of 0x01), i.e. everyone has arrived.
197 template <typename T> class core_barrier_impl {
// Returns the 64-bit pattern with one 0x01 byte per active thread; this is
// the value the shared word reaches when all num_active threads have stored
// their byte. NOTE(review): the cases for 2 threads and the default/return
// lines appear elided in this extraction.
198 static inline kmp_uint64 get_wait_val(int num_active) {
200 switch (num_active) {
205 wait_val = 0x010101LL;
208 wait_val = 0x01010101LL;
211 wait_val = 0x0101010101LL;
214 wait_val = 0x010101010101LL;
217 wait_val = 0x01010101010101LL;
220 wait_val = 0x0101010101010101LL;
223 // don't use the core_barrier_impl for more than 8 threads
// Initialize a thread's private barrier state (definitions below).
230 static void reset_private(kmp_int32 num_active,
231 kmp_hier_private_bdata_t *tdata);
// Initialize the unit's shared barrier state (definition below).
232 static void reset_shared(kmp_int32 num_active,
233 kmp_hier_shared_bdata_t<T> *bdata);
// Arrive-and-wait for thread `id` of this unit (definition below).
234 static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
235 kmp_hier_private_bdata_t *tdata);
// Seed this thread's private barrier data: remember the thread count and
// precompute the full-arrival pattern for both double-buffer slots.
// NOTE(review): tdata->index initialization appears elided (line 242 gap).
238 template <typename T>
239 void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
240 kmp_hier_private_bdata_t *tdata) {
241 tdata->num_active = num_active;
243 tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
// Clear the unit's shared barrier word and status for both buffer slots so
// the first barrier round starts from an all-zero word.
245 template <typename T>
246 void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
247 kmp_hier_shared_bdata_t<T> *bdata) {
248 bdata->val[0] = bdata->val[1] = 0LL;
249 bdata->status[0] = bdata->status[1] = 0LL;
// Arrive-and-spin barrier for thread `id`. Rounds alternate between the two
// buffer slots and between "fill with 0x01 bytes" and "clear back to zero":
// when the current wait value is nonzero each thread stores 0x01 and waits
// for the full pattern; on the alternate round each stores 0x00 and waits
// for zero. This avoids having to reset the shared word between rounds.
251 template <typename T>
252 void core_barrier_impl<T>::barrier(kmp_int32 id,
253 kmp_hier_shared_bdata_t<T> *bdata,
254 kmp_hier_private_bdata_t *tdata) {
255 kmp_uint64 current_index = tdata->index;
256 kmp_uint64 next_index = 1 - current_index;
257 kmp_uint64 current_wait_value = tdata->wait_val[current_index];
// Next round's target flips: full pattern <-> zero.
258 kmp_uint64 next_wait_value =
259 (current_wait_value ? 0 : get_wait_val(tdata->num_active));
260 KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
261 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
262 __kmp_get_gtid(), current_index, next_index, current_wait_value,
// Store this thread's byte into its slot of the shared word, then spin until
// the whole word matches the expected value (all threads arrived).
264 char v = (current_wait_value ? 0x1 : 0x0);
265 (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
266 __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
267 __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
// Advance private state for the next round.
268 tdata->wait_val[current_index] = next_wait_value;
269 tdata->index = next_index;
272 // Counter barrier implementation
273 // Can be used in a unit with arbitrary number of active threads
// Fallback barrier for units with more than 8 (or any number of) threads:
// each arrival atomically increments a shared counter and spins until the
// counter reaches the expected arrival total for this round.
274 template <typename T> class counter_barrier_impl {
// Initialize a thread's private barrier state (definition below).
276 static void reset_private(kmp_int32 num_active,
277 kmp_hier_private_bdata_t *tdata);
// Initialize the unit's shared counter/status (definition below).
278 static void reset_shared(kmp_int32 num_active,
279 kmp_hier_shared_bdata_t<T> *bdata);
// Arrive-and-wait for thread `id` of this unit (definition below).
280 static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
281 kmp_hier_private_bdata_t *tdata);
// Seed this thread's private state: the first round completes when the
// shared counter reaches num_active, so both slots start with that target.
// NOTE(review): tdata->index initialization appears elided (line 288 gap).
284 template <typename T>
285 void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
286 kmp_hier_private_bdata_t *tdata) {
287 tdata->num_active = num_active;
289 tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
// Zero the shared arrival counters and statuses for both buffer slots.
291 template <typename T>
292 void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
293 kmp_hier_shared_bdata_t<T> *bdata) {
294 bdata->val[0] = bdata->val[1] = 0LL;
295 bdata->status[0] = bdata->status[1] = 0LL;
// Counter-based arrive-and-spin barrier. The shared counter is never reset;
// instead each thread's private wait target grows by num_active per round,
// so round k completes when the monotonically increasing counter reaches
// k * num_active (hence the >= comparison below).
297 template <typename T>
298 void counter_barrier_impl<T>::barrier(kmp_int32 id,
299 kmp_hier_shared_bdata_t<T> *bdata,
300 kmp_hier_private_bdata_t *tdata) {
301 volatile kmp_int64 *val;
302 kmp_uint64 current_index = tdata->index;
303 kmp_uint64 next_index = 1 - current_index;
304 kmp_uint64 current_wait_value = tdata->wait_val[current_index];
305 kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
307 KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
308 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
309 __kmp_get_gtid(), current_index, next_index, current_wait_value,
// Atomically announce arrival, then spin until everyone has arrived.
311 val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
312 KMP_TEST_THEN_INC64(val);
313 __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
314 __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
// Advance private state for the next round.
315 tdata->wait_val[current_index] = next_wait_value;
316 tdata->index = next_index;
319 // Data associated with topology unit within a layer
320 // For example, one kmp_hier_top_unit_t corresponds to one L1 cache
// One topology unit at some layer of the hierarchy (e.g. one L1 cache, one
// NUMA node). It owns a private dispatch buffer (hier_pr) describing the
// chunk of iterations currently assigned to the unit, a pointer to its
// parent unit one layer up, and the double-buffered shared barrier data
// through which its children synchronize and exchange bounds.
321 template <typename T> struct kmp_hier_top_unit_t {
322 typedef typename traits_t<T>::signed_t ST;
323 typedef typename traits_t<T>::unsigned_t UT;
324 kmp_int32 active; // number of topology units that communicate with this unit
325 // chunk information (lower/upper bound, stride, etc.)
326 dispatch_private_info_template<T> hier_pr;
327 kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
328 kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit
330 kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
// Reinitialize the unit's shared barrier data, choosing the byte-pattern
// barrier for 2-8 participants and the counter barrier otherwise.
// NOTE(review): early-return for active == 1 and the else lines appear
// elided in this extraction.
331 void reset_shared_barrier() {
332 KMP_DEBUG_ASSERT(active > 0);
336 if (active >= 2 && active <= 8) {
337 core_barrier_impl<T>::reset_shared(active, &hier_barrier);
339 counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
// Reinitialize one child thread's private barrier data, using the same
// 2-8 vs. many-threads selection as reset_shared_barrier().
342 void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
343 KMP_DEBUG_ASSERT(tdata);
344 KMP_DEBUG_ASSERT(active > 0);
347 if (active >= 2 && active <= 8) {
348 core_barrier_impl<T>::reset_private(active, tdata);
350 counter_barrier_impl<T>::reset_private(active, tdata);
// Child `id` arrives at this unit's barrier. A single-participant unit
// only flips the buffer index (line 358) — no synchronization needed.
353 void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
354 KMP_DEBUG_ASSERT(tdata);
355 KMP_DEBUG_ASSERT(active > 0);
356 KMP_DEBUG_ASSERT(id >= 0 && id < active);
358 tdata->index = 1 - tdata->index;
361 if (active >= 2 && active <= 8) {
362 core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
364 counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
// Thin forwarding accessors into the double-buffered hier_barrier: the
// "next" variants read the slot being prepared for the following round...
368 kmp_int32 get_next_status(kmp_uint64 index) const {
369 return hier_barrier.get_next_status(index);
371 T get_next_lb(kmp_uint64 index) const {
372 return hier_barrier.get_next_lb(index);
374 T get_next_ub(kmp_uint64 index) const {
375 return hier_barrier.get_next_ub(index);
377 ST get_next_st(kmp_uint64 index) const {
378 return hier_barrier.get_next_st(index);
380 dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
381 return hier_barrier.get_next_sh(index);
// ...and the "curr" variants read the slot in use this round.
384 kmp_int32 get_curr_status(kmp_uint64 index) const {
385 return hier_barrier.get_curr_status(index);
387 T get_curr_lb(kmp_uint64 index) const {
388 return hier_barrier.get_curr_lb(index);
390 T get_curr_ub(kmp_uint64 index) const {
391 return hier_barrier.get_curr_ub(index);
393 ST get_curr_st(kmp_uint64 index) const {
394 return hier_barrier.get_curr_st(index);
396 dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
397 return hier_barrier.get_curr_sh(index);
// Publish bounds/status for the next round (hand-threading variant keeps
// sh[0] untouched; see kmp_hier_shared_bdata_t).
400 void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
402 hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
404 void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
405 hier_barrier.set_next(lb, ub, st, status, index);
407 dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
408 kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
409 dispatch_private_info_template<T> *get_parent_pr() {
410 return &(hier_parent->hier_pr);
413 kmp_int32 is_active() const { return active; }
414 kmp_int32 get_num_active() const { return active; }
// Debug print of this unit's assignment (KD_TRACE wrapper lines elided).
418 (" kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
419 active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
424 // Information regarding a single layer within the scheduling hierarchy
// Per-layer bookkeeping for the hierarchy: what kind of topology unit the
// layer represents, which schedule/chunk applies at that layer, how many
// threads registered as active there, and how long the unit array is.
425 template <typename T> struct kmp_hier_layer_info_t {
426 int num_active; // number of threads active in this level
427 kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
428 enum sched_type sched; // static, dynamic, guided, etc.
429 typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
430 int length; // length of the kmp_hier_top_unit_t array
432 // Print this layer's information
434 const char *t = __kmp_get_hier_str(type);
437 (" kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
439 num_active, t, sched, chunk, length));
444 * Structure to implement entire hierarchy
446 * The hierarchy is kept as an array of arrays to represent the different
447 * layers. Layer 0 is the lowest layer to layer num_layers - 1 which is the
450 * [ 2 ] -> [ L3 | L3 ]
451 * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
452 * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
453 * There is also an array of layer_info_t which has information regarding
// The full scheduling hierarchy: an array of kmp_hier_top_unit_t arrays, one
// per layer (layer 0 lowest), plus per-layer info. Threads obtain iterations
// from their layer-0 unit; when a unit runs dry, its master climbs one layer
// via next_recurse() to refill from the parent.
// NOTE(review): many interior lines of this struct are elided in this
// extraction (embedded line-number gaps) — the comments below describe only
// what the visible lines establish.
456 template <typename T> struct kmp_hier_t {
458 typedef typename traits_t<T>::unsigned_t UT;
459 typedef typename traits_t<T>::signed_t ST;
// Recursive refill: called by the master thread (previous_id == 0) of a unit
// at hier_level to obtain the next iteration range from the layer above.
// Returns the status published for this unit's current buffer slot.
462 int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
463 kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
464 kmp_int32 previous_id, int hier_level) {
466 kmp_info_t *th = __kmp_threads[gtid];
467 auto parent = current->get_parent();
468 bool last_layer = (hier_level == get_num_layers() - 1);
469 KMP_DEBUG_ASSERT(th);
470 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
471 KMP_DEBUG_ASSERT(current);
472 KMP_DEBUG_ASSERT(hier_level >= 0);
473 KMP_DEBUG_ASSERT(hier_level < get_num_layers());
474 KMP_DEBUG_ASSERT(tdata);
// Only the top layer may have no parent.
475 KMP_DEBUG_ASSERT(parent || last_layer);
478 1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
480 T hier_id = (T)current->get_hier_id();
481 // Attempt to grab next iteration range for this level
482 if (previous_id == 0) {
483 KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
485 kmp_int32 contains_last;
489 dispatch_shared_info_template<T> volatile *my_sh;
490 dispatch_private_info_template<T> *my_pr;
492 // last layer below the very top uses the single shared buffer
493 // from the team struct.
495 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
497 my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
498 th->th.th_dispatch->th_dispatch_sh_current);
499 nproc = (T)get_top_level_nproc();
501 // middle layers use the shared buffer inside the kmp_hier_top_unit_t
503 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
506 parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
507 nproc = (T)parent->get_num_active();
509 my_pr = current->get_my_pr();
510 KMP_DEBUG_ASSERT(my_sh);
511 KMP_DEBUG_ASSERT(my_pr);
// Try the normal dispatch algorithm first with this layer's schedule/chunk.
512 enum sched_type schedule = get_sched(hier_level);
513 ST chunk = (ST)get_chunk(hier_level);
514 status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
515 &contains_last, &my_lb, &my_ub,
516 &my_st, nproc, hier_id);
519 ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
520 gtid, hier_level, status));
521 // When no iterations are found (status == 0) and this is not the last
522 // layer, attempt to go up the hierarchy for more iterations
523 if (status == 0 && !last_layer) {
524 status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
525 &my_st, hier_id, hier_level + 1);
528 ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
529 gtid, hier_level, status));
// Parent supplied a fresh range: reinitialize this layer's private buffer
// with the parent's current bounds, then retry the dispatch algorithm.
531 kmp_hier_private_bdata_t *upper_tdata =
532 &(th->th.th_hier_bar_data[hier_level + 1]);
533 my_sh = parent->get_curr_sh(upper_tdata->index);
534 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
536 __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
537 parent->get_curr_lb(upper_tdata->index),
538 parent->get_curr_ub(upper_tdata->index),
539 parent->get_curr_st(upper_tdata->index),
543 chunk, nproc, hier_id);
544 status = __kmp_dispatch_next_algorithm<T>(
545 gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
548 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
// Publish the obtained range/status for the unit's children to read after
// the barrier below.
555 current->set_next(my_lb, my_ub, my_st, status, tdata->index);
556 // Propagate whether a unit holds the actual global last iteration
557 // The contains_last attribute is sent downwards from the top to the
558 // bottom of the hierarchy via the contains_last flag inside the
559 // private dispatch buffers in the hierarchy's middle layers
561 // If the next_algorithm() method returns 1 for p_last and it is the
562 // last layer or our parent contains the last serial chunk, then the
563 // chunk must contain the last serial iteration.
564 if (last_layer || parent->hier_pr.flags.contains_last) {
565 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
566 "to contain last.\n",
568 current->hier_pr.flags.contains_last = contains_last;
570 if (!current->hier_pr.flags.contains_last)
571 contains_last = FALSE;
574 *p_last = contains_last;
575 } // if master thread of this unit
// Non-hand-threading (or any level above 0): all children synchronize here,
// after which everyone reads the freshly published slot.
576 if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
578 ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
580 current->barrier(previous_id, tdata);
582 ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
583 gtid, hier_level, current->get_curr_status(tdata->index)));
585 KMP_DEBUG_ASSERT(previous_id == 0);
588 return current->get_curr_status(tdata->index);
// Hierarchy storage: per-layer info array and per-layer unit arrays.
596 kmp_hier_layer_info_t<T> *info;
597 kmp_hier_top_unit_t<T> **layers;
// Deallocate all memory from this hierarchy
600 for (int i = 0; i < num_layers; ++i)
601 if (layers[i] != NULL) {
602 __kmp_free(layers[i]);
604 if (layers != NULL) {
615 // Returns true if reallocation is needed else false
// The existing allocation can be reused only if it is valid, same element
// type size, same layer count, and every (layer, sched, chunk) matches.
616 bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
617 const enum sched_type *new_scheds,
618 const ST *new_chunks) const {
619 if (!valid || layers == NULL || info == NULL ||
620 traits_t<T>::type_size != type_size || n != num_layers)
622 for (int i = 0; i < n; ++i) {
623 if (info[i].type != new_layers[i])
625 if (info[i].sched != new_scheds[i])
627 if (info[i].chunk != new_chunks[i])
632 // A single thread should call this function while the other threads wait
633 // create a new scheduling hierarchy consisting of new_layers, new_scheds
634 // and new_chunks. These should come pre-sorted according to
635 // kmp_hier_layer_e value. This function will try to avoid reallocation
637 void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
638 const enum sched_type *new_scheds, const ST *new_chunks) {
// Fast path: identical configuration — just zero the active counters.
640 if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
643 ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
644 for (int i = 0; i < n; ++i) {
645 info[i].num_active = 0;
646 for (int j = 0; j < get_length(i); ++j)
647 layers[i][j].active = 0;
651 KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
653 type_size = traits_t<T>::type_size;
655 info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
656 sizeof(kmp_hier_layer_info_t<T>) * n);
657 layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
658 sizeof(kmp_hier_top_unit_t<T> *) * n);
659 for (int i = 0; i < n; ++i) {
661 kmp_hier_layer_e layer = new_layers[i];
662 info[i].num_active = 0;
663 info[i].type = layer;
664 info[i].sched = new_scheds[i];
665 info[i].chunk = new_chunks[i];
// Layer length comes from detected topology; a nonpositive max means the
// machine does not expose this layer -> warn (handling lines elided).
666 max = __kmp_hier_max_units[layer + 1];
669 KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
673 info[i].length = max;
674 layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
675 sizeof(kmp_hier_top_unit_t<T>) * max);
676 for (int j = 0; j < max; ++j) {
677 layers[i][j].active = 0;
682 // loc - source file location
683 // gtid - global thread identifier
684 // pr - this thread's private dispatch buffer (corresponding with gtid)
685 // p_last (return value) - pointer to flag indicating this set of iterations
688 // p_lb (return value) - lower bound for this chunk of iterations
689 // p_ub (return value) - upper bound for this chunk of iterations
690 // p_st (return value) - stride for this chunk of iterations
692 // Returns 1 if there are more iterations to perform, 0 otherwise
693 int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
694 kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
696 kmp_int32 contains_last = 0;
697 kmp_info_t *th = __kmp_threads[gtid];
698 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
699 auto parent = pr->get_parent();
700 KMP_DEBUG_ASSERT(parent);
701 KMP_DEBUG_ASSERT(th);
702 KMP_DEBUG_ASSERT(tdata);
703 KMP_DEBUG_ASSERT(parent);
704 T nproc = (T)parent->get_num_active();
705 T unit_id = (T)pr->get_hier_id();
708 ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
709 gtid, nproc, unit_id));
710 // Handthreading implementation
711 // Each iteration is performed by all threads on last unit (typically
713 // e.g., threads 0,1,2,3 all execute iteration 0
714 // threads 0,1,2,3 all execute iteration 1
715 // threads 4,5,6,7 all execute iteration 2
716 // threads 4,5,6,7 all execute iteration 3
718 if (__kmp_dispatch_hand_threading) {
720 ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
723 // For hand threading, the sh buffer on the lowest level is only ever
724 // modified and read by the master thread on that level. Because of
725 // this, we can always use the first sh buffer.
726 auto sh = &(parent->hier_barrier.sh[0]);
727 KMP_DEBUG_ASSERT(sh);
728 status = __kmp_dispatch_next_algorithm<T>(
729 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
// Unit exhausted: master climbs the hierarchy, reinitializes the thread
// buffer with the parent's published next range, and retries.
734 status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
737 __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
738 parent->get_next_lb(tdata->index),
739 parent->get_next_ub(tdata->index),
740 parent->get_next_st(tdata->index),
744 pr->u.p.parm1, nproc, unit_id);
745 sh->u.s.iteration = 0;
746 status = __kmp_dispatch_next_algorithm<T>(
747 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
751 ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
757 } else if (status == 2) {
758 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
// Master publishes the range; all unit threads then barrier and copy the
// same bounds so every thread executes the same iterations (handthreading).
765 parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
766 } // if master thread of lowest unit level
767 parent->barrier(pr->get_hier_id(), tdata);
769 *p_lb = parent->get_curr_lb(tdata->index);
770 *p_ub = parent->get_curr_ub(tdata->index);
771 *p_st = parent->get_curr_st(tdata->index);
772 status = parent->get_curr_status(tdata->index);
775 // Normal implementation
776 // Each thread grabs an iteration chunk and executes it (no cooperation)
777 auto sh = parent->get_curr_sh(tdata->index);
778 KMP_DEBUG_ASSERT(sh);
779 status = __kmp_dispatch_next_algorithm<T>(
780 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
782 ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
783 "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
784 gtid, status, contains_last, *p_lb, *p_ub, *p_st));
// Unit exhausted: refill from the hierarchy and retry with the parent's
// current bounds.
789 status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
792 sh = parent->get_curr_sh(tdata->index);
793 __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
794 parent->get_curr_lb(tdata->index),
795 parent->get_curr_ub(tdata->index),
796 parent->get_curr_st(tdata->index),
800 pr->u.p.parm1, nproc, unit_id);
801 status = __kmp_dispatch_next_algorithm<T>(
802 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
804 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
810 } else if (status == 2) {
811 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
// contains_last is only trusted if the parent chain also holds the global
// last chunk (propagated downward through hier_pr.flags.contains_last).
819 if (contains_last && !parent->hier_pr.flags.contains_last) {
820 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
821 "contains_last to FALSE\n",
823 contains_last = FALSE;
826 *p_last = contains_last;
827 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
831 // These functions probe the layer info structure
832 // Returns the type of topology unit given level
833 kmp_hier_layer_e get_type(int level) const {
834 KMP_DEBUG_ASSERT(level >= 0);
835 KMP_DEBUG_ASSERT(level < num_layers);
836 return info[level].type;
838 // Returns the schedule type at given level
839 enum sched_type get_sched(int level) const {
840 KMP_DEBUG_ASSERT(level >= 0);
841 KMP_DEBUG_ASSERT(level < num_layers);
842 return info[level].sched;
844 // Returns the chunk size at given level
845 ST get_chunk(int level) const {
846 KMP_DEBUG_ASSERT(level >= 0);
847 KMP_DEBUG_ASSERT(level < num_layers);
848 return info[level].chunk;
850 // Returns the number of active threads at given level
851 int get_num_active(int level) const {
852 KMP_DEBUG_ASSERT(level >= 0);
853 KMP_DEBUG_ASSERT(level < num_layers);
854 return info[level].num_active;
856 // Returns the length of topology unit array at given level
857 int get_length(int level) const {
858 KMP_DEBUG_ASSERT(level >= 0);
859 KMP_DEBUG_ASSERT(level < num_layers);
860 return info[level].length;
862 // Returns the topology unit given the level and index
863 kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
864 KMP_DEBUG_ASSERT(level >= 0);
865 KMP_DEBUG_ASSERT(level < num_layers);
866 KMP_DEBUG_ASSERT(index >= 0);
867 KMP_DEBUG_ASSERT(index < get_length(level));
868 return &(layers[level][index]);
870 // Returns the number of layers in the hierarchy
871 int get_num_layers() const { return num_layers; }
872 // Returns the number of threads in the top layer
873 // This is necessary because we don't store a topology unit as
874 // the very top level and the scheduling algorithms need this information
875 int get_top_level_nproc() const { return top_level_nproc; }
876 // Return whether this hierarchy is valid or not
877 bool is_valid() const { return valid; }
878 // Print the hierarchy
880 KD_TRACE(10, ("kmp_hier_t:\n"));
881 for (int i = num_layers - 1; i >= 0; --i) {
882 KD_TRACE(10, ("Info[%d] = ", i));
885 for (int i = num_layers - 1; i >= 0; --i) {
886 KD_TRACE(10, ("Layer[%d] =\n", i));
887 for (int j = 0; j < info[i].length; ++j) {
888 layers[i][j].print();
// Entry point that builds/initializes the scheduling hierarchy for a loop:
// the team master allocates the kmp_hier_t, every thread registers itself in
// each layer's unit (bumping `active` counts), then unit masters initialize
// barriers and the top-level loop bounds. Three team barriers separate the
// allocate / register / initialize phases.
// NOTE(review): numerous interior lines are elided in this extraction
// (embedded line-number gaps); comments describe only the visible lines.
894 template <typename T>
895 void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
896 kmp_hier_layer_e *new_layers,
897 enum sched_type *new_scheds,
898 typename traits_t<T>::signed_t *new_chunks,
900 typename traits_t<T>::signed_t st) {
901 typedef typename traits_t<T>::signed_t ST;
902 typedef typename traits_t<T>::unsigned_t UT;
903 int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
907 dispatch_private_info_template<T> *pr;
908 dispatch_shared_info_template<T> volatile *sh;
909 gtid = __kmp_entry_gtid();
910 tid = __kmp_tid_from_gtid(gtid);
912 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
914 for (int i = 0; i < n; ++i) {
915 const char *layer = __kmp_get_hier_str(new_layers[i]);
916 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
917 "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
918 gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
921 KMP_DEBUG_ASSERT(n > 0);
922 KMP_DEBUG_ASSERT(new_layers);
923 KMP_DEBUG_ASSERT(new_scheds);
924 KMP_DEBUG_ASSERT(new_chunks);
925 if (!TCR_4(__kmp_init_parallel))
926 __kmp_parallel_initialize();
927 th = __kmp_threads[gtid];
928 team = th->th.th_team;
// A serialized team cannot use the hierarchy — fall back to the normal
// dispatch path by clearing use_hier on the thread's dispatch buffer.
929 active = !team->t.t_serialized;
930 th->th.th_ident = loc;
931 num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
933 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
934 "Using normal dispatch functions.\n",
936 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
937 th->th.th_dispatch->th_disp_buffer);
938 KMP_DEBUG_ASSERT(pr);
939 pr->flags.use_hier = FALSE;
940 pr->flags.contains_last = FALSE;
943 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
944 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
// Locate this thread's private and the team's shared dispatch buffers for
// the current buffer index (ring of __kmp_dispatch_num_buffers).
946 my_buffer_index = th->th.th_dispatch->th_disp_index;
947 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
949 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
950 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
951 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
952 KMP_DEBUG_ASSERT(pr);
953 KMP_DEBUG_ASSERT(sh);
954 pr->flags.use_hier = TRUE;
956 // Have master allocate the hierarchy
957 if (__kmp_tid_from_gtid(gtid) == 0) {
958 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
961 if (sh->hier == NULL) {
962 sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
964 sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
965 sh->u.s.iteration = 0;
// Phase barrier: everyone waits for master's allocation before reading hier.
967 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
968 // Check to make sure the hierarchy is valid
969 kmp_hier_t<T> *hier = sh->hier;
970 if (!sh->hier->is_valid()) {
971 pr->flags.use_hier = FALSE;
974 // Have threads allocate their thread-private barrier data if it hasn't
975 // already been allocated
976 if (th->th.th_hier_bar_data == NULL) {
977 th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
978 sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
980 // Have threads "register" themselves by modifiying the active count for each
981 // level they are involved in. The active count will act as nthreads for that
982 // level regarding the scheduling algorithms
983 for (int i = 0; i < n; ++i) {
984 int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
985 kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
986 // Setup the thread's private dispatch buffer's hierarchy pointers
988 pr->hier_parent = my_unit;
989 // If this unit is already active, then increment active count and wait
990 if (my_unit->is_active()) {
991 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
992 "is already active (%d)\n",
993 gtid, my_unit, my_unit->active));
994 KMP_TEST_THEN_INC32(&(my_unit->active));
997 // Flag that this unit is active
// Exactly one thread wins the CAS 0->1 and becomes the unit's initializer,
// setting hier_id and the parent pointer for the unit.
998 if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
999 // Do not setup parent pointer for top level unit since it has no parent
1001 // Setup middle layer pointers to parents
1002 my_unit->get_my_pr()->hier_id =
1003 index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1004 hier->get_type(i + 1));
1005 int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
1006 my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
1008 // Setup top layer information (no parent pointers are set)
1009 my_unit->get_my_pr()->hier_id =
1010 index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1011 kmp_hier_layer_e::LAYER_LOOP);
1012 KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
1013 my_unit->hier_parent = nullptr;
1015 // Set trip count to 0 so that next() operation will initially climb up
1016 // the hierarchy to get more iterations (early exit in next() for tc == 0)
1017 my_unit->get_my_pr()->u.p.tc = 0;
1018 // Increment this layer's number of active units
1019 KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
1020 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1021 "incrementing num_active\n",
// Lost the CAS race: another thread initialized the unit; just register.
1024 KMP_TEST_THEN_INC32(&(my_unit->active));
1028 // Set this thread's id
1029 num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
1030 kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
1031 pr->hier_id = tid % num_threads_per_layer1;
1032 // For oversubscribed threads, increment their index within the lowest unit
1033 // This is done to prevent having two or more threads with id 0, id 1, etc.
1034 if (tid >= num_hw_threads)
1035 pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
1037 10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
1038 gtid, pr->hier_id));
1040 pr->flags.contains_last = FALSE;
// Phase barrier: registration complete; active counts are now final.
1041 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1043 // Now that the number of active threads at each level is determined,
1044 // the barrier data for each unit can be initialized and the last layer's
1045 // loop information can be initialized.
1046 int prev_id = pr->get_hier_id();
1047 for (int i = 0; i < n; ++i) {
// Only the unit's master (prev_id == 0, check presumably in the elided
// lines 1048-1049) performs per-unit initialization.
1050 int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1051 kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1052 // Only master threads of this unit within the hierarchy do initialization
1053 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
1055 my_unit->reset_shared_barrier();
1056 my_unit->hier_pr.flags.contains_last = FALSE;
1057 // Last layer, initialize the private buffers with entire loop information
1058 // Now the next next_algorithim() call will get the first chunk of
1059 // iterations properly
1061 __kmp_dispatch_init_algorithm<T>(
1062 loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
1066 hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
1068 prev_id = my_unit->get_hier_id();
1070 // Initialize each layer of the thread's private barrier data
1071 kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
1072 for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
1073 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
1074 unit->reset_private_barrier(tdata);
// Final phase barrier: hierarchy fully initialized before any next() call.
1076 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1079 if (__kmp_tid_from_gtid(gtid) == 0) {
1080 for (int i = 0; i < n; ++i) {
1082 ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
1083 gtid, i, hier->get_num_active(i)));
1087 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);