1 #ifndef KMP_DISPATCH_HIER_H
2 #define KMP_DISPATCH_HIER_H
4 #include "kmp_dispatch.h"
6 // Layer type for scheduling hierarchy
7 enum kmp_hier_layer_e {
17 // Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
// Maps a kmp_hier_layer_e value to a human-readable C string, used by the
// KD_TRACE printing below and the HierSchedInvalid warning in allocate_hier().
// NOTE(review): the switch header, the return statements for each case, and
// the closing braces appear elided in this extraction (embedded line-number
// gaps 19, 21, 23, ...) — confirm against the full kmp_dispatch_hier.h.
18 static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
20 case kmp_hier_layer_e::LAYER_THREAD:
22 case kmp_hier_layer_e::LAYER_L1:
24 case kmp_hier_layer_e::LAYER_L2:
26 case kmp_hier_layer_e::LAYER_L3:
28 case kmp_hier_layer_e::LAYER_NUMA:
30 case kmp_hier_layer_e::LAYER_LOOP:
32 case kmp_hier_layer_e::LAYER_LAST:
36 // Appease compilers, should never get here
40 // Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
// Holds the (layer, schedule, chunk) triples parsed from OMP_SCHEDULE for the
// scheduling hierarchy. All four parallel arrays are lazily allocated to
// LAYER_LAST entries on the first append() and are indexed together: entry i
// describes hierarchy level i. Chunks are kept in both 32- and 64-bit form so
// either loop index width can consume them.
// NOTE(review): the size/capacity field declarations and several braces/else
// lines appear elided in this extraction — verify against the full header.
41 typedef struct kmp_hier_sched_env_t {
44 enum sched_type *scheds;
45 kmp_int32 *small_chunks;
46 kmp_int64 *large_chunks;
47 kmp_hier_layer_e *layers;
48 // Append a level of the hierarchy
49 void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
// First call: allocate all four arrays with room for every possible layer
// (LAYER_LAST); capacity tracks that upper bound.
51 scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
52 kmp_hier_layer_e::LAYER_LAST);
53 small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
54 kmp_hier_layer_e::LAYER_LAST);
55 large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
56 kmp_hier_layer_e::LAYER_LAST);
57 layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
58 kmp_hier_layer_e::LAYER_LAST);
59 capacity = kmp_hier_layer_e::LAYER_LAST;
// Record the new level at the current tail; the chunk is stored in both
// widths so callers never need to re-convert.
61 int current_size = size;
62 KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
63 scheds[current_size] = sched;
64 layers[current_size] = layer;
65 small_chunks[current_size] = chunk;
66 large_chunks[current_size] = (kmp_int64)chunk;
69 // Sort the hierarchy using selection sort, size will always be small
70 // (less than LAYER_LAST) so it is not necessary to use an nlog(n) algorithm
// Selection sort keyed on the layer enum value; the three companion arrays
// are swapped in lockstep so entry i stays internally consistent.
74 for (int i = 0; i < size; ++i) {
76 for (int j = i + 1; j < size; ++j) {
77 if (layers[j] < layers[switch_index])
80 if (switch_index != i) {
81 kmp_hier_layer_e temp1 = layers[i];
82 enum sched_type temp2 = scheds[i];
83 kmp_int32 temp3 = small_chunks[i];
84 kmp_int64 temp4 = large_chunks[i];
85 layers[i] = layers[switch_index];
86 scheds[i] = scheds[switch_index];
87 small_chunks[i] = small_chunks[switch_index];
88 large_chunks[i] = large_chunks[switch_index];
89 layers[switch_index] = temp1;
90 scheds[switch_index] = temp2;
91 small_chunks[switch_index] = temp3;
92 large_chunks[switch_index] = temp4;
// Deallocation of the parallel arrays (scheds/layers frees presumably sit in
// the elided lines 99-110 — TODO confirm all four arrays are freed).
101 __kmp_free(small_chunks);
102 __kmp_free(large_chunks);
111 } kmp_hier_sched_env_t;
113 extern int __kmp_dispatch_hand_threading;
114 extern kmp_hier_sched_env_t __kmp_hier_scheds;
116 // Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
117 extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
118 extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
120 extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
121 extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
122 extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
123 kmp_hier_layer_e t2);
124 extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
// Double-buffered shared data attached to one hierarchy unit. Every field is
// a 2-element array indexed by a thread-private 0/1 index (tdata->index): the
// "curr" slot [index] is read this round while the "next" slot [1 - index] is
// written for the following round, so producers and consumers never race on
// the same slot. val[] is the barrier word the barrier implementations spin
// on; sh[] are the dispatch shared buffers handed to
// __kmp_dispatch_next_algorithm().
// NOTE(review): the lb/ub/st/status member declarations and several closing
// braces appear elided in this extraction — confirm against the full header.
126 template <typename T> struct kmp_hier_shared_bdata_t {
127 typedef typename traits_t<T>::signed_t ST;
128 volatile kmp_uint64 val[2];
133 dispatch_shared_info_template<T> sh[2];
// (zero_*/reset routine — presumably clears both buffers; header elided)
136 status[0] = status[1] = 0;
140 sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
// Publish bounds/stride/status into the *next* (1 - index) slot. The hand-
// threading variant does not reset sh[] because that mode only ever uses
// sh[0] (see kmp_hier_t::next()).
142 void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
147 status[1 - index] = nstatus;
149 void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
153 status[1 - index] = nstatus;
// Reset the next shared buffer's iteration counter so the next round's
// dispatch starts from scratch.
154 sh[1 - index].u.s.iteration = 0;
// Accessors for the next-round (1 - index) slot...
157 kmp_int32 get_next_status(kmp_uint64 index) const {
158 return status[1 - index];
160 T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
161 T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
162 ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
163 dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
164 return &(sh[1 - index]);
// ...and for the current-round (index) slot.
167 kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
168 T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
169 T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
170 ST get_curr_st(kmp_uint64 index) const { return st[index]; }
171 dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
177 * In the barrier implementations, num_active is the number of threads that are
178 * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
179 * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
180 * structure. tdata is the thread private data that resides on the thread
183 * The reset_shared() method is used to initialize the barrier data on the
184 * kmp_hier_top_unit_t hierarchy structure
186 * The reset_private() method is used to initialize the barrier data on the
187 * thread's private dispatch buffer structure
189 * The barrier() method takes an id, which is that thread's id for the
190 * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait
191 * inside barrier() until all fellow threads who are attached to that
192 * kmp_hier_top_unit_t structure have arrived.
195 // Core barrier implementation
196 // Can be used in a unit with between 2 to 8 threads
// Barrier for 2..8 threads: each arriving thread writes the byte 0x01 into
// its own slot of a shared 64-bit word, then spins until the word equals the
// wait value below (num_active bytes of 0x01), i.e. everyone has arrived.
197 template <typename T> class core_barrier_impl {
// Returns the 64-bit pattern with one 0x01 byte per active thread; this is
// the value the shared word reaches when all num_active threads have stored
// their byte. NOTE(review): the cases for 2 threads and the default/return
// lines appear elided in this extraction.
198 static inline kmp_uint64 get_wait_val(int num_active) {
200 switch (num_active) {
205 wait_val = 0x010101LL;
208 wait_val = 0x01010101LL;
211 wait_val = 0x0101010101LL;
214 wait_val = 0x010101010101LL;
217 wait_val = 0x01010101010101LL;
220 wait_val = 0x0101010101010101LL;
223 // don't use the core_barrier_impl for more than 8 threads
// Initialize a thread's private barrier state (definitions below).
230 static void reset_private(kmp_int32 num_active,
231 kmp_hier_private_bdata_t *tdata);
// Initialize the unit's shared barrier state (definition below).
232 static void reset_shared(kmp_int32 num_active,
233 kmp_hier_shared_bdata_t<T> *bdata);
// Arrive-and-wait for thread `id` of this unit (definition below).
234 static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
235 kmp_hier_private_bdata_t *tdata);
// Seed this thread's private barrier data: remember the thread count and
// precompute the full-arrival pattern for both double-buffer slots.
// NOTE(review): tdata->index initialization appears elided (line 242 gap).
238 template <typename T>
239 void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
240 kmp_hier_private_bdata_t *tdata) {
241 tdata->num_active = num_active;
243 tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
// Clear the unit's shared barrier word and status for both buffer slots so
// the first barrier round starts from an all-zero word.
245 template <typename T>
246 void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
247 kmp_hier_shared_bdata_t<T> *bdata) {
248 bdata->val[0] = bdata->val[1] = 0LL;
249 bdata->status[0] = bdata->status[1] = 0LL;
// Arrive-and-spin barrier for thread `id`. Rounds alternate between the two
// buffer slots and between "fill with 0x01 bytes" and "clear back to zero":
// when the current wait value is nonzero each thread stores 0x01 and waits
// for the full pattern; on the alternate round each stores 0x00 and waits
// for zero. This avoids having to reset the shared word between rounds.
251 template <typename T>
252 void core_barrier_impl<T>::barrier(kmp_int32 id,
253 kmp_hier_shared_bdata_t<T> *bdata,
254 kmp_hier_private_bdata_t *tdata) {
255 kmp_uint64 current_index = tdata->index;
256 kmp_uint64 next_index = 1 - current_index;
257 kmp_uint64 current_wait_value = tdata->wait_val[current_index];
// Next round's target flips: full pattern <-> zero.
258 kmp_uint64 next_wait_value =
259 (current_wait_value ? 0 : get_wait_val(tdata->num_active));
260 KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
261 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
262 __kmp_get_gtid(), current_index, next_index, current_wait_value,
// Store this thread's byte into its slot of the shared word, then spin until
// the whole word matches the expected value (all threads arrived).
264 char v = (current_wait_value ? 0x1 : 0x0);
265 (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
266 __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
267 __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
// Advance private state for the next round.
268 tdata->wait_val[current_index] = next_wait_value;
269 tdata->index = next_index;
272 // Counter barrier implementation
273 // Can be used in a unit with arbitrary number of active threads
// Fallback barrier for units with more than 8 (or any number of) threads:
// each arrival atomically increments a shared counter and spins until the
// counter reaches the expected arrival total for this round.
274 template <typename T> class counter_barrier_impl {
// Initialize a thread's private barrier state (definition below).
276 static void reset_private(kmp_int32 num_active,
277 kmp_hier_private_bdata_t *tdata);
// Initialize the unit's shared counter/status (definition below).
278 static void reset_shared(kmp_int32 num_active,
279 kmp_hier_shared_bdata_t<T> *bdata);
// Arrive-and-wait for thread `id` of this unit (definition below).
280 static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
281 kmp_hier_private_bdata_t *tdata);
// Seed this thread's private state: the first round completes when the
// shared counter reaches num_active, so both slots start with that target.
// NOTE(review): tdata->index initialization appears elided (line 288 gap).
284 template <typename T>
285 void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
286 kmp_hier_private_bdata_t *tdata) {
287 tdata->num_active = num_active;
289 tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
// Zero the shared arrival counters and statuses for both buffer slots.
291 template <typename T>
292 void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
293 kmp_hier_shared_bdata_t<T> *bdata) {
294 bdata->val[0] = bdata->val[1] = 0LL;
295 bdata->status[0] = bdata->status[1] = 0LL;
// Counter-based arrive-and-spin barrier. The shared counter is never reset;
// instead each thread's private wait target grows by num_active per round,
// so round k completes when the monotonically increasing counter reaches
// k * num_active (hence the >= comparison below).
297 template <typename T>
298 void counter_barrier_impl<T>::barrier(kmp_int32 id,
299 kmp_hier_shared_bdata_t<T> *bdata,
300 kmp_hier_private_bdata_t *tdata) {
301 volatile kmp_int64 *val;
302 kmp_uint64 current_index = tdata->index;
303 kmp_uint64 next_index = 1 - current_index;
304 kmp_uint64 current_wait_value = tdata->wait_val[current_index];
305 kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
307 KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
308 "next_index:%llu curr_wait:%llu next_wait:%llu\n",
309 __kmp_get_gtid(), current_index, next_index, current_wait_value,
// Atomically announce arrival, then spin until everyone has arrived.
311 val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
312 KMP_TEST_THEN_INC64(val);
313 __kmp_wait_yield<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
314 __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
// Advance private state for the next round.
315 tdata->wait_val[current_index] = next_wait_value;
316 tdata->index = next_index;
319 // Data associated with topology unit within a layer
320 // For example, one kmp_hier_top_unit_t corresponds to one L1 cache
// One topology unit at some layer of the hierarchy (e.g. one L1 cache, one
// NUMA node). It owns a private dispatch buffer (hier_pr) describing the
// chunk of iterations currently assigned to the unit, a pointer to its
// parent unit one layer up, and the double-buffered shared barrier data
// through which its children synchronize and exchange bounds.
321 template <typename T> struct kmp_hier_top_unit_t {
322 typedef typename traits_t<T>::signed_t ST;
323 typedef typename traits_t<T>::unsigned_t UT;
324 kmp_int32 active; // number of topology units that communicate with this unit
325 // chunk information (lower/upper bound, stride, etc.)
326 dispatch_private_info_template<T> hier_pr;
327 kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
328 kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit
330 kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
// Reinitialize the unit's shared barrier data, choosing the byte-pattern
// barrier for 2-8 participants and the counter barrier otherwise.
// NOTE(review): early-return for active == 1 and the else lines appear
// elided in this extraction.
331 void reset_shared_barrier() {
332 KMP_DEBUG_ASSERT(active > 0);
336 if (active >= 2 && active <= 8) {
337 core_barrier_impl<T>::reset_shared(active, &hier_barrier);
339 counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
// Reinitialize one child thread's private barrier data, using the same
// 2-8 vs. many-threads selection as reset_shared_barrier().
342 void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
343 KMP_DEBUG_ASSERT(tdata);
344 KMP_DEBUG_ASSERT(active > 0);
347 if (active >= 2 && active <= 8) {
348 core_barrier_impl<T>::reset_private(active, tdata);
350 counter_barrier_impl<T>::reset_private(active, tdata);
// Child `id` arrives at this unit's barrier. A single-participant unit
// only flips the buffer index (line 358) — no synchronization needed.
353 void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
354 KMP_DEBUG_ASSERT(tdata);
355 KMP_DEBUG_ASSERT(active > 0);
356 KMP_DEBUG_ASSERT(id >= 0 && id < active);
358 tdata->index = 1 - tdata->index;
361 if (active >= 2 && active <= 8) {
362 core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
364 counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
// Thin forwarding accessors into the double-buffered hier_barrier: the
// "next" variants read the slot being prepared for the following round...
368 kmp_int32 get_next_status(kmp_uint64 index) const {
369 return hier_barrier.get_next_status(index);
371 T get_next_lb(kmp_uint64 index) const {
372 return hier_barrier.get_next_lb(index);
374 T get_next_ub(kmp_uint64 index) const {
375 return hier_barrier.get_next_ub(index);
377 ST get_next_st(kmp_uint64 index) const {
378 return hier_barrier.get_next_st(index);
380 dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
381 return hier_barrier.get_next_sh(index);
// ...and the "curr" variants read the slot in use this round.
384 kmp_int32 get_curr_status(kmp_uint64 index) const {
385 return hier_barrier.get_curr_status(index);
387 T get_curr_lb(kmp_uint64 index) const {
388 return hier_barrier.get_curr_lb(index);
390 T get_curr_ub(kmp_uint64 index) const {
391 return hier_barrier.get_curr_ub(index);
393 ST get_curr_st(kmp_uint64 index) const {
394 return hier_barrier.get_curr_st(index);
396 dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
397 return hier_barrier.get_curr_sh(index);
// Publish bounds/status for the next round (hand-threading variant keeps
// sh[0] untouched; see kmp_hier_shared_bdata_t).
400 void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
402 hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
404 void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
405 hier_barrier.set_next(lb, ub, st, status, index);
407 dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
408 kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
409 dispatch_private_info_template<T> *get_parent_pr() {
410 return &(hier_parent->hier_pr);
413 kmp_int32 is_active() const { return active; }
414 kmp_int32 get_num_active() const { return active; }
// Debug print of this unit's assignment (KD_TRACE wrapper lines elided).
418 (" kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n",
419 active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st,
424 // Information regarding a single layer within the scheduling hierarchy
// Per-layer bookkeeping for the hierarchy: what kind of topology unit the
// layer represents, which schedule/chunk applies at that layer, how many
// threads registered as active there, and how long the unit array is.
425 template <typename T> struct kmp_hier_layer_info_t {
426 int num_active; // number of threads active in this level
427 kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
428 enum sched_type sched; // static, dynamic, guided, etc.
429 typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
430 int length; // length of the kmp_hier_top_unit_t array
432 // Print this layer's information
434 const char *t = __kmp_get_hier_str(type);
437 (" kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d "
439 num_active, t, sched, chunk, length));
444 * Structure to implement entire hierarchy
446 * The hierarchy is kept as an array of arrays to represent the different
447 * layers. Layer 0 is the lowest layer to layer num_layers - 1 which is the
450 * [ 2 ] -> [ L3 | L3 ]
451 * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
452 * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
453 * There is also an array of layer_info_t which has information regarding
// The full scheduling hierarchy: an array of kmp_hier_top_unit_t arrays, one
// per layer (layer 0 lowest), plus per-layer info. Threads obtain iterations
// from their layer-0 unit; when a unit runs dry, its master climbs one layer
// via next_recurse() to refill from the parent.
// NOTE(review): many interior lines of this struct are elided in this
// extraction (embedded line-number gaps) — the comments below describe only
// what the visible lines establish.
456 template <typename T> struct kmp_hier_t {
458 typedef typename traits_t<T>::unsigned_t UT;
459 typedef typename traits_t<T>::signed_t ST;
// Recursive refill: called by the master thread (previous_id == 0) of a unit
// at hier_level to obtain the next iteration range from the layer above.
// Returns the status published for this unit's current buffer slot.
462 int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
463 kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
464 kmp_int32 previous_id, int hier_level) {
466 kmp_info_t *th = __kmp_threads[gtid];
467 auto parent = current->get_parent();
468 bool last_layer = (hier_level == get_num_layers() - 1);
469 KMP_DEBUG_ASSERT(th);
470 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
471 KMP_DEBUG_ASSERT(current);
472 KMP_DEBUG_ASSERT(hier_level >= 0);
473 KMP_DEBUG_ASSERT(hier_level < get_num_layers());
474 KMP_DEBUG_ASSERT(tdata);
// Only the top layer may have no parent.
475 KMP_DEBUG_ASSERT(parent || last_layer);
478 1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
480 T hier_id = (T)current->get_hier_id();
481 // Attempt to grab next iteration range for this level
482 if (previous_id == 0) {
483 KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
485 kmp_int32 contains_last;
489 dispatch_shared_info_template<T> volatile *my_sh;
490 dispatch_private_info_template<T> *my_pr;
492 // last layer below the very top uses the single shared buffer
493 // from the team struct.
495 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
497 my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
498 th->th.th_dispatch->th_dispatch_sh_current);
499 nproc = (T)get_top_level_nproc();
501 // middle layers use the shared buffer inside the kmp_hier_top_unit_t
503 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
506 parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
507 nproc = (T)parent->get_num_active();
509 my_pr = current->get_my_pr();
510 KMP_DEBUG_ASSERT(my_sh);
511 KMP_DEBUG_ASSERT(my_pr);
// Try the normal dispatch algorithm first with this layer's schedule/chunk.
512 enum sched_type schedule = get_sched(hier_level);
513 ST chunk = (ST)get_chunk(hier_level);
514 status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
515 &contains_last, &my_lb, &my_ub,
516 &my_st, nproc, hier_id);
519 ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
520 gtid, hier_level, status));
521 // When no iterations are found (status == 0) and this is not the last
522 // layer, attempt to go up the hierarchy for more iterations
523 if (status == 0 && !last_layer) {
524 status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
525 &my_st, hier_id, hier_level + 1);
528 ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
529 gtid, hier_level, status));
// Parent supplied a fresh range: reinitialize this layer's private buffer
// with the parent's current bounds, then retry the dispatch algorithm.
531 kmp_hier_private_bdata_t *upper_tdata =
532 &(th->th.th_hier_bar_data[hier_level + 1]);
533 my_sh = parent->get_curr_sh(upper_tdata->index);
534 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
536 __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
537 parent->get_curr_lb(upper_tdata->index),
538 parent->get_curr_ub(upper_tdata->index),
539 parent->get_curr_st(upper_tdata->index),
543 chunk, nproc, hier_id);
544 status = __kmp_dispatch_next_algorithm<T>(
545 gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
548 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
// Publish the obtained range/status for the unit's children to read after
// the barrier below.
555 current->set_next(my_lb, my_ub, my_st, status, tdata->index);
556 // Propagate whether a unit holds the actual global last iteration
557 // The contains_last attribute is sent downwards from the top to the
558 // bottom of the hierarchy via the contains_last flag inside the
559 // private dispatch buffers in the hierarchy's middle layers
561 // If the next_algorithm() method returns 1 for p_last and it is the
562 // last layer or our parent contains the last serial chunk, then the
563 // chunk must contain the last serial iteration.
564 if (last_layer || parent->hier_pr.flags.contains_last) {
565 KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
566 "to contain last.\n",
568 current->hier_pr.flags.contains_last = contains_last;
570 if (!current->hier_pr.flags.contains_last)
571 contains_last = FALSE;
574 *p_last = contains_last;
575 } // if master thread of this unit
// Non-hand-threading (or any level above 0): all children synchronize here,
// after which everyone reads the freshly published slot.
576 if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
578 ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
580 current->barrier(previous_id, tdata);
582 ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
583 gtid, hier_level, current->get_curr_status(tdata->index)));
585 KMP_DEBUG_ASSERT(previous_id == 0);
588 return current->get_curr_status(tdata->index);
// Hierarchy storage: per-layer info array and per-layer unit arrays.
596 kmp_hier_layer_info_t<T> *info;
597 kmp_hier_top_unit_t<T> **layers;
// Deallocate all memory from this hierarchy
600 for (int i = 0; i < num_layers; ++i)
601 if (layers[i] != NULL) {
602 __kmp_free(layers[i]);
604 if (layers != NULL) {
615 // Returns true if reallocation is needed else false
// The existing allocation can be reused only if it is valid, same element
// type size, same layer count, and every (layer, sched, chunk) matches.
616 bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
617 const enum sched_type *new_scheds,
618 const ST *new_chunks) const {
619 if (!valid || layers == NULL || info == NULL ||
620 traits_t<T>::type_size != type_size || n != num_layers)
622 for (int i = 0; i < n; ++i) {
623 if (info[i].type != new_layers[i])
625 if (info[i].sched != new_scheds[i])
627 if (info[i].chunk != new_chunks[i])
632 // A single thread should call this function while the other threads wait
633 // create a new scheduling hierarchy consisting of new_layers, new_scheds
634 // and new_chunks. These should come pre-sorted according to
635 // kmp_hier_layer_e value. This function will try to avoid reallocation
637 void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
638 const enum sched_type *new_scheds, const ST *new_chunks) {
// Fast path: identical configuration — just zero the active counters.
640 if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
643 ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
644 for (int i = 0; i < n; ++i) {
645 info[i].num_active = 0;
646 for (int j = 0; j < get_length(i); ++j)
647 layers[i][j].active = 0;
651 KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
653 type_size = traits_t<T>::type_size;
655 info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
656 sizeof(kmp_hier_layer_info_t<T>) * n);
657 layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
658 sizeof(kmp_hier_top_unit_t<T> *) * n);
659 for (int i = 0; i < n; ++i) {
661 kmp_hier_layer_e layer = new_layers[i];
662 info[i].num_active = 0;
663 info[i].type = layer;
664 info[i].sched = new_scheds[i];
665 info[i].chunk = new_chunks[i];
// Layer length comes from detected topology; a nonpositive max means the
// machine does not expose this layer -> warn (handling lines elided).
666 max = __kmp_hier_max_units[layer + 1];
669 KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
673 info[i].length = max;
674 layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
675 sizeof(kmp_hier_top_unit_t<T>) * max);
676 for (int j = 0; j < max; ++j) {
677 layers[i][j].active = 0;
682 // loc - source file location
683 // gtid - global thread identifier
684 // pr - this thread's private dispatch buffer (corresponding with gtid)
685 // p_last (return value) - pointer to flag indicating this set of iterations
688 // p_lb (return value) - lower bound for this chunk of iterations
689 // p_ub (return value) - upper bound for this chunk of iterations
690 // p_st (return value) - stride for this chunk of iterations
692 // Returns 1 if there are more iterations to perform, 0 otherwise
693 int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
694 kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
696 kmp_int32 contains_last = 0;
697 kmp_info_t *th = __kmp_threads[gtid];
698 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
699 auto parent = pr->get_parent();
700 KMP_DEBUG_ASSERT(parent);
701 KMP_DEBUG_ASSERT(th);
702 KMP_DEBUG_ASSERT(tdata);
703 KMP_DEBUG_ASSERT(parent);
704 T nproc = (T)parent->get_num_active();
705 T unit_id = (T)pr->get_hier_id();
708 ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
709 gtid, nproc, unit_id));
710 // Handthreading implementation
711 // Each iteration is performed by all threads on last unit (typically
713 // e.g., threads 0,1,2,3 all execute iteration 0
714 // threads 0,1,2,3 all execute iteration 1
715 // threads 4,5,6,7 all execute iteration 2
716 // threads 4,5,6,7 all execute iteration 3
718 if (__kmp_dispatch_hand_threading) {
720 ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
723 // For hand threading, the sh buffer on the lowest level is only ever
724 // modified and read by the master thread on that level. Because of
725 // this, we can always use the first sh buffer.
726 auto sh = &(parent->hier_barrier.sh[0]);
727 KMP_DEBUG_ASSERT(sh);
728 status = __kmp_dispatch_next_algorithm<T>(
729 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
// Unit exhausted: master climbs the hierarchy, reinitializes the thread
// buffer with the parent's published next range, and retries.
734 status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
737 __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
738 parent->get_next_lb(tdata->index),
739 parent->get_next_ub(tdata->index),
740 parent->get_next_st(tdata->index),
744 pr->u.p.parm1, nproc, unit_id);
745 sh->u.s.iteration = 0;
746 status = __kmp_dispatch_next_algorithm<T>(
747 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
751 ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
757 } else if (status == 2) {
758 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
// Master publishes the range; all unit threads then barrier and copy the
// same bounds so every thread executes the same iterations (handthreading).
765 parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
766 } // if master thread of lowest unit level
767 parent->barrier(pr->get_hier_id(), tdata);
769 *p_lb = parent->get_curr_lb(tdata->index);
770 *p_ub = parent->get_curr_ub(tdata->index);
771 *p_st = parent->get_curr_st(tdata->index);
772 status = parent->get_curr_status(tdata->index);
775 // Normal implementation
776 // Each thread grabs an iteration chunk and executes it (no cooperation)
777 auto sh = parent->get_curr_sh(tdata->index);
778 KMP_DEBUG_ASSERT(sh);
779 status = __kmp_dispatch_next_algorithm<T>(
780 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
782 ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
783 "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
784 gtid, status, contains_last, *p_lb, *p_ub, *p_st));
// Unit exhausted: refill from the hierarchy and retry with the parent's
// current bounds.
789 status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
792 sh = parent->get_curr_sh(tdata->index);
793 __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
794 parent->get_curr_lb(tdata->index),
795 parent->get_curr_ub(tdata->index),
796 parent->get_curr_st(tdata->index),
800 pr->u.p.parm1, nproc, unit_id);
801 status = __kmp_dispatch_next_algorithm<T>(
802 gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
804 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
810 } else if (status == 2) {
811 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
// contains_last is only trusted if the parent chain also holds the global
// last chunk (propagated downward through hier_pr.flags.contains_last).
819 if (contains_last && !parent->hier_pr.flags.contains_last) {
820 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
821 "contains_last to FALSE\n",
823 contains_last = FALSE;
826 *p_last = contains_last;
827 KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
831 // These functions probe the layer info structure
832 // Returns the type of topology unit given level
833 kmp_hier_layer_e get_type(int level) const {
834 KMP_DEBUG_ASSERT(level >= 0);
835 KMP_DEBUG_ASSERT(level < num_layers);
836 return info[level].type;
838 // Returns the schedule type at given level
839 enum sched_type get_sched(int level) const {
840 KMP_DEBUG_ASSERT(level >= 0);
841 KMP_DEBUG_ASSERT(level < num_layers);
842 return info[level].sched;
844 // Returns the chunk size at given level
845 ST get_chunk(int level) const {
846 KMP_DEBUG_ASSERT(level >= 0);
847 KMP_DEBUG_ASSERT(level < num_layers);
848 return info[level].chunk;
850 // Returns the number of active threads at given level
851 int get_num_active(int level) const {
852 KMP_DEBUG_ASSERT(level >= 0);
853 KMP_DEBUG_ASSERT(level < num_layers);
854 return info[level].num_active;
856 // Returns the length of topology unit array at given level
857 int get_length(int level) const {
858 KMP_DEBUG_ASSERT(level >= 0);
859 KMP_DEBUG_ASSERT(level < num_layers);
860 return info[level].length;
862 // Returns the topology unit given the level and index
863 kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
864 KMP_DEBUG_ASSERT(level >= 0);
865 KMP_DEBUG_ASSERT(level < num_layers);
866 KMP_DEBUG_ASSERT(index >= 0);
867 KMP_DEBUG_ASSERT(index < get_length(level));
868 return &(layers[level][index]);
870 // Returns the number of layers in the hierarchy
871 int get_num_layers() const { return num_layers; }
872 // Returns the number of threads in the top layer
873 // This is necessary because we don't store a topology unit as
874 // the very top level and the scheduling algorithms need this information
875 int get_top_level_nproc() const { return top_level_nproc; }
876 // Return whether this hierarchy is valid or not
877 bool is_valid() const { return valid; }
878 // Print the hierarchy
880 KD_TRACE(10, ("kmp_hier_t:\n"));
881 for (int i = num_layers - 1; i >= 0; --i) {
882 KD_TRACE(10, ("Info[%d] = ", i));
885 for (int i = num_layers - 1; i >= 0; --i) {
886 KD_TRACE(10, ("Layer[%d] =\n", i));
887 for (int j = 0; j < info[i].length; ++j) {
888 layers[i][j].print();
// Entry point that builds/initializes the scheduling hierarchy for a loop:
// the team master allocates the kmp_hier_t, every thread registers itself in
// each layer's unit (bumping `active` counts), then unit masters initialize
// barriers and the top-level loop bounds. Three team barriers separate the
// allocate / register / initialize phases.
// NOTE(review): numerous interior lines are elided in this extraction
// (embedded line-number gaps); comments describe only the visible lines.
894 template <typename T>
895 void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
896 kmp_hier_layer_e *new_layers,
897 enum sched_type *new_scheds,
898 typename traits_t<T>::signed_t *new_chunks,
900 typename traits_t<T>::signed_t st) {
901 typedef typename traits_t<T>::signed_t ST;
902 typedef typename traits_t<T>::unsigned_t UT;
903 int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
907 dispatch_private_info_template<T> *pr;
908 dispatch_shared_info_template<T> volatile *sh;
909 gtid = __kmp_entry_gtid();
910 tid = __kmp_tid_from_gtid(gtid);
912 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
914 for (int i = 0; i < n; ++i) {
915 const char *layer = __kmp_get_hier_str(new_layers[i]);
916 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
917 "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
918 gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
921 KMP_DEBUG_ASSERT(n > 0);
922 KMP_DEBUG_ASSERT(new_layers);
923 KMP_DEBUG_ASSERT(new_scheds);
924 KMP_DEBUG_ASSERT(new_chunks);
925 if (!TCR_4(__kmp_init_parallel))
926 __kmp_parallel_initialize();
927 th = __kmp_threads[gtid];
928 team = th->th.th_team;
// A serialized team cannot use the hierarchy — fall back to the normal
// dispatch path by clearing use_hier on the thread's dispatch buffer.
929 active = !team->t.t_serialized;
930 th->th.th_ident = loc;
931 num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
933 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
934 "Using normal dispatch functions.\n",
936 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
937 th->th.th_dispatch->th_disp_buffer);
938 KMP_DEBUG_ASSERT(pr);
939 pr->flags.use_hier = FALSE;
940 pr->flags.contains_last = FALSE;
943 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
944 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
// Locate this thread's private and the team's shared dispatch buffers for
// the current buffer index (ring of __kmp_dispatch_num_buffers).
946 my_buffer_index = th->th.th_dispatch->th_disp_index;
947 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
949 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
950 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
951 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
952 KMP_DEBUG_ASSERT(pr);
953 KMP_DEBUG_ASSERT(sh);
954 pr->flags.use_hier = TRUE;
956 // Have master allocate the hierarchy
957 if (__kmp_tid_from_gtid(gtid) == 0) {
958 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
961 if (sh->hier == NULL) {
962 sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
964 sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
965 sh->u.s.iteration = 0;
// Phase barrier: everyone waits for master's allocation before reading hier.
967 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
968 // Check to make sure the hierarchy is valid
969 kmp_hier_t<T> *hier = sh->hier;
970 if (!sh->hier->is_valid()) {
971 pr->flags.use_hier = FALSE;
974 // Have threads allocate their thread-private barrier data if it hasn't
975 // already been allocated
976 if (th->th.th_hier_bar_data == NULL) {
977 th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
978 sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
980 // Have threads "register" themselves by modifiying the active count for each
981 // level they are involved in. The active count will act as nthreads for that
982 // level regarding the scheduling algorithms
983 for (int i = 0; i < n; ++i) {
984 int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
985 kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
986 // Setup the thread's private dispatch buffer's hierarchy pointers
988 pr->hier_parent = my_unit;
989 // If this unit is already active, then increment active count and wait
990 if (my_unit->is_active()) {
991 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
992 "is already active (%d)\n",
993 gtid, my_unit, my_unit->active));
994 KMP_TEST_THEN_INC32(&(my_unit->active));
997 // Flag that this unit is active
// Exactly one thread wins the CAS 0->1 and becomes the unit's initializer,
// setting hier_id and the parent pointer for the unit.
998 if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
999 // Do not setup parent pointer for top level unit since it has no parent
1001 // Setup middle layer pointers to parents
1002 my_unit->get_my_pr()->hier_id =
1003 index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1004 hier->get_type(i + 1));
1005 int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
1006 my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
1008 // Setup top layer information (no parent pointers are set)
1009 my_unit->get_my_pr()->hier_id =
1010 index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
1011 kmp_hier_layer_e::LAYER_LOOP);
1012 KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
1013 my_unit->hier_parent = nullptr;
1015 // Set trip count to 0 so that next() operation will initially climb up
1016 // the hierarchy to get more iterations (early exit in next() for tc == 0)
1017 my_unit->get_my_pr()->u.p.tc = 0;
1018 // Increment this layer's number of active units
1019 KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
1020 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
1021 "incrementing num_active\n",
// Lost the CAS race: another thread initialized the unit; just register.
1024 KMP_TEST_THEN_INC32(&(my_unit->active));
1028 // Set this thread's id
1029 num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
1030 kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
1031 pr->hier_id = tid % num_threads_per_layer1;
1032 // For oversubscribed threads, increment their index within the lowest unit
1033 // This is done to prevent having two or more threads with id 0, id 1, etc.
1034 if (tid >= num_hw_threads)
1035 pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
1037 10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
1038 gtid, pr->hier_id));
1040 pr->flags.contains_last = FALSE;
// Phase barrier: registration complete; active counts are now final.
1041 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1043 // Now that the number of active threads at each level is determined,
1044 // the barrier data for each unit can be initialized and the last layer's
1045 // loop information can be initialized.
1046 int prev_id = pr->get_hier_id();
1047 for (int i = 0; i < n; ++i) {
// Only the unit's master (prev_id == 0, check presumably in the elided
// lines 1048-1049) performs per-unit initialization.
1050 int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
1051 kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
1052 // Only master threads of this unit within the hierarchy do initialization
1053 KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
1055 my_unit->reset_shared_barrier();
1056 my_unit->hier_pr.flags.contains_last = FALSE;
1057 // Last layer, initialize the private buffers with entire loop information
1058 // Now the next next_algorithim() call will get the first chunk of
1059 // iterations properly
1061 __kmp_dispatch_init_algorithm<T>(
1062 loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
1066 hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
1068 prev_id = my_unit->get_hier_id();
1070 // Initialize each layer of the thread's private barrier data
1071 kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
1072 for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
1073 kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
1074 unit->reset_private_barrier(tdata);
// Final phase barrier: hierarchy fully initialized before any next() call.
1076 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1079 if (__kmp_tid_from_gtid(gtid) == 0) {
1080 for (int i = 0; i < n; ++i) {
1082 ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
1083 gtid, i, hier->get_num_active(i)));
1087 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);