2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
5 //===----------------------------------------------------------------------===//
7 // The LLVM Compiler Infrastructure
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
12 //===----------------------------------------------------------------------===//
14 /* Dynamic scheduling initialization and dispatch.
16 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
17 * it may change values between parallel regions. __kmp_max_nth
18 * is the largest value __kmp_nth may take, 1 is the smallest.
22 #include "kmp_error.h"
25 #include "kmp_stats.h"
27 #if KMP_USE_X87CONTROL
31 #include "kmp_dispatch.h"
32 #if KMP_USE_HIER_SCHED
33 #include "kmp_dispatch_hier.h"
37 #include "ompt-specific.h"
40 /* ------------------------------------------------------------------------ */
41 /* ------------------------------------------------------------------------ */
43 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
46 KMP_DEBUG_ASSERT(gtid_ref);
48 if (__kmp_env_consistency_check) {
49 th = __kmp_threads[*gtid_ref];
50 if (th->th.th_root->r.r_active &&
51 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
52 #if KMP_USE_DYNAMIC_LOCK
53 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
55 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
64 if (__kmp_env_consistency_check) {
65 th = __kmp_threads[*gtid_ref];
66 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
67 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
72 // Initialize a dispatch_private_info_template<T> buffer for a particular
73 // type of schedule,chunk. The loop description is found in lb (lower bound),
74 // ub (upper bound), and st (stride). nproc is the number of threads relevant
75 // to the scheduling (often the number of threads in a team, but not always if
76 // hierarchical scheduling is used). tid is the id of the thread calling
77 // the function within the group of nproc threads. It will have a value
78 // between 0 and nproc - 1. This is often just the thread id within a team, but
79 // is not necessarily the case when using hierarchical scheduling.
80 // loc is the source file location of the corresponding loop
81 // gtid is the global thread id
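// For example (illustrative): a loop "for (i = 0; i < 100; i += 2)" with
// schedule(dynamic, 4) typically reaches this routine with lb = 0, ub = 98
// (bounds are inclusive), st = 2 and chunk = 4, after the compiler has
// normalized the loop bounds.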
83 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
84 dispatch_private_info_template<T> *pr,
85 enum sched_type schedule, T lb, T ub,
86 typename traits_t<T>::signed_t st,
88 kmp_uint64 *cur_chunk,
90 typename traits_t<T>::signed_t chunk,
92 typedef typename traits_t<T>::unsigned_t UT;
93 typedef typename traits_t<T>::floating_t DBL;
101 typedef typename traits_t<T>::signed_t ST;
104 // create format specifiers before the debug output
105 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
106 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
107 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
108 traits_t<T>::spec, traits_t<T>::spec,
109 traits_t<ST>::spec, traits_t<ST>::spec,
110 traits_t<T>::spec, traits_t<T>::spec);
111 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
112 __kmp_str_free(&buff);
116 th = __kmp_threads[gtid];
117 team = th->th.th_team;
118 active = !team->t.t_serialized;
121 int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
122 __kmp_forkjoin_frames_mode == 3 &&
123 KMP_MASTER_GTID(gtid) &&
125 th->th.th_teams_microtask == NULL &&
127 team->t.t_active_level == 1;
129 #if (KMP_STATIC_STEAL_ENABLED)
130 if (SCHEDULE_HAS_NONMONOTONIC(schedule))
131 // AC: we now have only one implementation of stealing, so use it
132 schedule = kmp_sch_static_steal;
135 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
137 /* Pick up the nomerge/ordered bits from the scheduling type */
138 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
139 pr->flags.nomerge = TRUE;
141 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
143 pr->flags.nomerge = FALSE;
145 pr->type_size = traits_t<T>::type_size; // remember the size of variables
146 if (kmp_ord_lower & schedule) {
147 pr->flags.ordered = TRUE;
149 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
151 pr->flags.ordered = FALSE;
154 if (schedule == kmp_sch_static) {
155 schedule = __kmp_static;
157 if (schedule == kmp_sch_runtime) {
158 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
160 schedule = team->t.t_sched.r_sched_type;
161 // Detail the schedule if needed (global controls are differentiated
163 if (schedule == kmp_sch_guided_chunked) {
164 schedule = __kmp_guided;
165 } else if (schedule == kmp_sch_static) {
166 schedule = __kmp_static;
168 // Use the chunk size specified by OMP_SCHEDULE (or default if not
170 chunk = team->t.t_sched.chunk;
178 // create format specifiers before the debug output
179 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
180 "schedule:%%d chunk:%%%s\n",
182 KD_TRACE(10, (buff, gtid, schedule, chunk));
183 __kmp_str_free(&buff);
187 if (schedule == kmp_sch_guided_chunked) {
188 schedule = __kmp_guided;
191 chunk = KMP_DEFAULT_CHUNK;
195 if (schedule == kmp_sch_auto) {
196 // mapping and differentiation: in the __kmp_do_serial_initialize()
197 schedule = __kmp_auto;
201 // create format specifiers before the debug output
202 buff = __kmp_str_format(
203 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
204 "schedule:%%d chunk:%%%s\n",
206 KD_TRACE(10, (buff, gtid, schedule, chunk));
207 __kmp_str_free(&buff);
212 /* guided analytical is not safe for too many threads */
213 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
214 schedule = kmp_sch_guided_iterative_chunked;
215 KMP_WARNING(DispatchManyThreads);
218 if (schedule == kmp_sch_runtime_simd) {
219 // compiler provides simd_width in the chunk parameter
220 schedule = team->t.t_sched.r_sched_type;
221 // Detail the schedule if needed (global controls are differentiated
223 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
224 schedule == __kmp_static) {
225 schedule = kmp_sch_static_balanced_chunked;
227 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
228 schedule = kmp_sch_guided_simd;
230 chunk = team->t.t_sched.chunk * chunk;
239 // create format specifiers before the debug output
240 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
243 KD_TRACE(10, (buff, gtid, schedule, chunk));
244 __kmp_str_free(&buff);
248 #endif // OMP_45_ENABLED
249 pr->u.p.parm1 = chunk;
251 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
252 "unknown scheduling type");
256 if (__kmp_env_consistency_check) {
258 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
259 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
262 // compute trip count
263 if (st == 1) { // most common case
271 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
272 // where the division needs to be unsigned regardless of the result type
273 tc = (UT)(lb - ub) / (-st) + 1;
279 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
280 // where the division needs to be unsigned regardless of the result type
281 tc = (UT)(ub - lb) / st + 1;
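// Example: lb = 0, ub = 9, st = 3 gives tc = (9 - 0) / 3 + 1 = 4 iterations
// (0, 3, 6, 9); the mirrored form above handles negative strides the same way.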
293 pr->u.p.last_upper = ub + st;
294 #endif /* KMP_OS_WINDOWS */
296 /* NOTE: only the active parallel region(s) have active ordered sections */
299 if (pr->flags.ordered) {
300 pr->ordered_bumped = 0;
301 pr->u.p.ordered_lower = 1;
302 pr->u.p.ordered_upper = 0;
307 #if (KMP_STATIC_STEAL_ENABLED)
308 case kmp_sch_static_steal: {
312 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
315 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
316 if (nproc > 1 && ntc >= nproc) {
317 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
319 T small_chunk, extras;
321 small_chunk = ntc / nproc;
322 extras = ntc % nproc;
324 init = id * small_chunk + (id < extras ? id : extras);
325 pr->u.p.count = init;
326 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
329 // pr->pfields.parm3 = 0; // it's not used in static_steal
330 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
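// Example: tc = 100 with chunk = 10 gives ntc = 10 chunks; with nproc = 4 the
// initial ownership is chunk indices [0,3) for thread 0, [3,6) for thread 1,
// [6,8) for thread 2 and [8,10) for thread 3, and each thread's first steal
// victim (parm4) is its right-hand neighbour.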
332 if (traits_t<T>::type_size > 4) {
333 // AC: TODO: check if 16-byte CAS available and use it to
334 // improve performance (probably wait for explicit request
335 // before spending time on this).
336 // For now use dynamically allocated per-thread lock,
337 // free memory in __kmp_dispatch_next when status==0.
338 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
339 th->th.th_dispatch->th_steal_lock =
340 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
341 __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
345 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
346 "kmp_sch_static_balanced\n",
348 schedule = kmp_sch_static_balanced;
349 /* too few iterations: fall-through to kmp_sch_static_balanced */
351 /* FALL-THROUGH to static balanced */
354 case kmp_sch_static_balanced: {
359 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
369 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
371 pr->u.p.count = 1; /* means no more chunks to execute */
372 pr->u.p.parm1 = FALSE;
376 T small_chunk = tc / nproc;
377 T extras = tc % nproc;
378 init = id * small_chunk + (id < extras ? id : extras);
379 limit = init + small_chunk - (id < extras ? 0 : 1);
380 pr->u.p.parm1 = (id == nproc - 1);
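// Example: tc = 10 and nproc = 4 give small_chunk = 2, extras = 2, so threads
// 0..3 get iteration ranges [0,2], [3,5], [6,7] and [8,9] (3, 3, 2, 2
// iterations); parm1 (the lastprivate flag) is set only for thread nproc - 1.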
386 pr->u.p.parm1 = TRUE;
389 pr->u.p.count = 1; /* means no more chunks to execute */
390 pr->u.p.parm1 = FALSE;
395 // Calculate chunk for metadata report
396 if (itt_need_metadata_reporting)
398 *cur_chunk = limit - init + 1;
401 pr->u.p.lb = lb + init;
402 pr->u.p.ub = lb + limit;
404 // calculated upper bound, "ub" is user-defined upper bound
405 T ub_tmp = lb + limit * st;
406 pr->u.p.lb = lb + init * st;
407 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
410 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
412 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
415 if (pr->flags.ordered) {
416 pr->u.p.ordered_lower = init;
417 pr->u.p.ordered_upper = limit;
422 case kmp_sch_static_balanced_chunked: {
423 // similar to balanced, but chunk adjusted to multiple of simd width
425 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
426 " -> falling-through to static_greedy\n",
428 schedule = kmp_sch_static_greedy;
430 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
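// Example: tc = 100, nth = 4 and a simd width (chunk) of 8 give
// (100 + 3) / 4 = 25 iterations per thread, rounded up by the mask to
// parm1 = 32; the rounding trick relies on chunk being a power of two.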
435 case kmp_sch_guided_simd:
436 #endif // OMP_45_ENABLED
437 case kmp_sch_guided_iterative_chunked: {
440 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
445 if ((2L * chunk + 1) * nproc >= tc) {
446 /* chunk size too large, switch to dynamic */
447 schedule = kmp_sch_dynamic_chunked;
449 // when remaining iters become less than parm2 - switch to dynamic
450 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
451 *(double *)&pr->u.p.parm3 =
452 guided_flt_param / nproc; // may occupy parm3 and parm4
455 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
456 "kmp_sch_static_greedy\n",
458 schedule = kmp_sch_static_greedy;
459 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
462 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
468 case kmp_sch_guided_analytical_chunked: {
469 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
470 "kmp_sch_guided_analytical_chunked case\n",
474 if ((2L * chunk + 1) * nproc >= tc) {
475 /* chunk size too large, switch to dynamic */
476 schedule = kmp_sch_dynamic_chunked;
478 /* commonly used term: (2 nproc - 1)/(2 nproc) */
481 #if KMP_USE_X87CONTROL
482 /* Linux* OS already has 64-bit computation by default for long double,
483 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
484 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
485 instead of the default 53-bit. Even though long double doesn't work
486 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
487 expected to impact the correctness of the algorithm, but this has not
488 been mathematically proven. */
489 // save original FPCW and set precision to 64-bit, as
490 // Windows* OS on IA-32 architecture defaults to 53-bit
491 unsigned int oldFpcw = _control87(0, 0);
492 _control87(_PC_64, _MCW_PC); // 0,0x30000
494 /* value used for comparison in solver for cross-over point */
495 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
497 /* crossover point--chunk indexes equal to or greater than
498 this point switch to dynamic-style scheduling */
501 /* commonly used term: (2 nproc - 1)/(2 nproc) */
502 x = (long double)1.0 - (long double)0.5 / nproc;
505 { // test natural alignment
513 ptrdiff_t natural_alignment =
514 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
515 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
516 // long)natural_alignment );
518 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
522 /* save the term in thread private dispatch structure */
523 *(DBL *)&pr->u.p.parm3 = x;
525 /* solve for the crossover point to the nearest integer i for which C_i
531 /* estimate initial upper and lower bound */
533 /* doesn't matter what value right is as long as it is positive, but
534 it affects performance of the solver */
536 p = __kmp_pow<UT>(x, right);
541 } while (p > target && right < (1 << 27));
542 /* lower bound is previous (failed) estimate of upper bound */
548 /* bisection root-finding method */
549 while (left + 1 < right) {
550 mid = (left + right) / 2;
551 if (__kmp_pow<UT>(x, mid) > target) {
559 /* assert sanity of computed crossover point */
560 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
561 __kmp_pow<UT>(x, cross) <= target);
563 /* save the crossover point in thread private dispatch structure */
564 pr->u.p.parm2 = cross;
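// Worked example: nproc = 2, chunk = 1 and tc = 100 give x = 0.75 and
// target = 0.06; since 0.75^9 > 0.06 >= 0.75^10, the solver finds cross = 10,
// i.e. chunk indices >= 10 switch to dynamic-style scheduling.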
567 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
568 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
570 #define GUIDED_ANALYTICAL_WORKAROUND (x)
572 /* dynamic-style scheduling offset */
573 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
574 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
576 #if KMP_USE_X87CONTROL
578 _control87(oldFpcw, _MCW_PC);
582 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
583 "kmp_sch_static_greedy\n",
585 schedule = kmp_sch_static_greedy;
586 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
591 case kmp_sch_static_greedy:
594 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
596 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
598 case kmp_sch_static_chunked:
599 case kmp_sch_dynamic_chunked:
600 if (pr->u.p.parm1 <= 0) {
601 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
603 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
604 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
607 case kmp_sch_trapezoidal: {
608 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
610 T parm1, parm2, parm3, parm4;
612 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
617 /* F : size of the first cycle */
618 parm2 = (tc / (2 * nproc));
624 /* L : size of the last cycle. Make sure the last cycle is not larger
625 than the first cycle. */
628 } else if (parm1 > parm2) {
632 /* N : number of cycles */
633 parm3 = (parm2 + parm1);
634 parm3 = (2 * tc + parm3 - 1) / parm3;
640 /* sigma : decreasing increment of the trapezoid */
642 parm4 = (parm2 - parm1) / parm4;
644 // pointless check, because parm4 >= 0 always
645 // if ( parm4 < 0 ) {
649 pr->u.p.parm1 = parm1;
650 pr->u.p.parm2 = parm2;
651 pr->u.p.parm3 = parm3;
652 pr->u.p.parm4 = parm4;
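// Illustrative numbers: tc = 1000, nproc = 4 and chunk (parm1) = 1 give
// parm2 = 1000 / 8 = 125 (first chunk size) and parm3 = (2000 + 126 - 1) / 126
// = 16 chunks; the per-chunk decrement parm4 then comes out to 8 here, so
// successive chunks shrink 125, 117, 109, ... toward parm1.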
657 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
658 KMP_HNT(GetNewerLibrary), // Hint
659 __kmp_msg_null // Variadic argument list terminator
663 pr->schedule = schedule;
666 #if KMP_USE_HIER_SCHED
667 template <typename T>
668 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
669 typename traits_t<T>::signed_t st);
672 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
673 kmp_int32 ub, kmp_int32 st) {
674 __kmp_dispatch_init_hierarchy<kmp_int32>(
675 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
676 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
680 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
681 kmp_uint32 ub, kmp_int32 st) {
682 __kmp_dispatch_init_hierarchy<kmp_uint32>(
683 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
684 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
688 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
689 kmp_int64 ub, kmp_int64 st) {
690 __kmp_dispatch_init_hierarchy<kmp_int64>(
691 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
692 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
696 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
697 kmp_uint64 ub, kmp_int64 st) {
698 __kmp_dispatch_init_hierarchy<kmp_uint64>(
699 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
700 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
703 // free all the hierarchy scheduling memory associated with the team
704 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
705 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
706 for (int i = 0; i < num_disp_buff; ++i) {
707 // type does not matter here so use kmp_int32
709 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
710 &team->t.t_disp_buffer[i]);
712 sh->hier->deallocate();
713 __kmp_free(sh->hier);
719 // UT - unsigned flavor of T, ST - signed flavor of T,
720 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
721 template <typename T>
723 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
724 T ub, typename traits_t<T>::signed_t st,
725 typename traits_t<T>::signed_t chunk, int push_ws) {
726 typedef typename traits_t<T>::unsigned_t UT;
731 kmp_uint32 my_buffer_index;
732 dispatch_private_info_template<T> *pr;
733 dispatch_shared_info_template<T> volatile *sh;
735 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
736 sizeof(dispatch_private_info));
737 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
738 sizeof(dispatch_shared_info));
740 if (!TCR_4(__kmp_init_parallel))
741 __kmp_parallel_initialize();
743 #if INCLUDE_SSC_MARKS
744 SSC_MARK_DISPATCH_INIT();
747 typedef typename traits_t<T>::signed_t ST;
750 // create format specifiers before the debug output
751 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
752 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
753 traits_t<ST>::spec, traits_t<T>::spec,
754 traits_t<T>::spec, traits_t<ST>::spec);
755 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
756 __kmp_str_free(&buff);
760 th = __kmp_threads[gtid];
761 team = th->th.th_team;
762 active = !team->t.t_serialized;
763 th->th.th_ident = loc;
765 // Any half-decent optimizer will remove this test when the blocks are empty
766 // since the macros expand to nothing
767 // when statistics are disabled.
768 if (schedule == __kmp_static) {
769 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
771 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
774 #if KMP_USE_HIER_SCHED
775 // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
776 // environment variable. Hierarchical scheduling does not work with ordered,
777 // so if ordered is detected, revert to threaded scheduling.
779 enum sched_type my_sched = schedule;
780 my_buffer_index = th->th.th_dispatch->th_disp_index;
781 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
783 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
784 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
785 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
787 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
788 ordered = (kmp_ord_lower & my_sched);
789 if (pr->flags.use_hier) {
791 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
792 "Disabling hierarchical scheduling.\n",
794 pr->flags.use_hier = FALSE;
797 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
798 // Don't use hierarchical for ordered parallel loops and don't
799 // use the runtime hierarchy if one was specified in the program
800 if (!ordered && !pr->flags.use_hier)
801 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
803 #endif // KMP_USE_HIER_SCHED
806 kmp_uint64 cur_chunk = chunk;
807 int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
808 __kmp_forkjoin_frames_mode == 3 &&
809 KMP_MASTER_GTID(gtid) &&
811 th->th.th_teams_microtask == NULL &&
813 team->t.t_active_level == 1;
816 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
817 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
819 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
820 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
822 my_buffer_index = th->th.th_dispatch->th_disp_index++;
824 /* What happens when number of threads changes, need to resize buffer? */
825 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
827 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
828 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
829 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
830 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
834 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
838 chunk, (T)th->th.th_team_nproc,
839 (T)th->th.th_info.ds.ds_tid);
841 if (pr->flags.ordered == 0) {
842 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
843 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
845 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
846 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
851 /* The name of this buffer should be my_buffer_index when it's free to use
854 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
855 "sh->buffer_index:%d\n",
856 gtid, my_buffer_index, sh->buffer_index));
857 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
858 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
859 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
860 // my_buffer_index are *always* 32-bit integers.
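// For example, a thread entering its Nth dynamically scheduled loop uses slot
// N % __kmp_dispatch_num_buffers; the wait above ensures the shared buffer
// from the loop that previously occupied this slot has been released (i.e.
// sh->buffer_index has caught up to my_buffer_index) before it is reused.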
861 KMP_MB(); /* is this necessary? */
862 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
863 "sh->buffer_index:%d\n",
864 gtid, my_buffer_index, sh->buffer_index));
866 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
867 th->th.th_dispatch->th_dispatch_sh_current =
868 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
870 if (pr->flags.ordered) {
871 __kmp_itt_ordered_init(gtid);
873 // Report loop metadata
874 if (itt_need_metadata_reporting) {
875 // Only report metadata by master of active team at level 1
876 kmp_uint64 schedtype = 0;
878 case kmp_sch_static_chunked:
879 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
881 case kmp_sch_static_greedy:
882 cur_chunk = pr->u.p.parm1;
884 case kmp_sch_dynamic_chunked:
887 case kmp_sch_guided_iterative_chunked:
888 case kmp_sch_guided_analytical_chunked:
890 case kmp_sch_guided_simd:
895 // Should we put this case under "static"?
896 // case kmp_sch_static_steal:
900 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
902 #if KMP_USE_HIER_SCHED
903 if (pr->flags.use_hier) {
905 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
907 #endif // KMP_USER_HIER_SCHED
908 #endif /* USE_ITT_BUILD */
914 // create format specifiers before the debug output
915 buff = __kmp_str_format(
916 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
918 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
919 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
920 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
921 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
922 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
923 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
924 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
925 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
926 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
927 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
928 __kmp_str_free(&buff);
931 #if (KMP_STATIC_STEAL_ENABLED)
932 // It cannot be guaranteed that after execution of a loop with some other
933 // schedule kind all the parm3 variables will contain the same value. Even if
934 // they did, a bad case could still arise, e.g. alternating between 0 and 1
935 // rather than using a program-lifetime increment. So a dedicated variable is
936 // required; 'static_steal_counter' is used for this.
937 if (schedule == kmp_sch_static_steal) {
938 // Other threads will inspect this variable when searching for a victim.
939 // This is a flag showing that other threads may steal from this thread
941 volatile T *p = &pr->u.p.static_steal_counter;
944 #endif // ( KMP_STATIC_STEAL_ENABLED )
946 #if OMPT_SUPPORT && OMPT_OPTIONAL
947 if (ompt_enabled.ompt_callback_work) {
948 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
949 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
950 ompt_callbacks.ompt_callback(ompt_callback_work)(
951 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
952 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
955 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
958 /* For ordered loops, either __kmp_dispatch_finish() should be called after
959 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
960 * every chunk of iterations. If the ordered section(s) were not executed
961 * for this iteration (or every iteration in this chunk), we need to set the
962 * ordered iteration counters so that the next thread can proceed. */
963 template <typename UT>
964 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
965 typedef typename traits_t<UT>::signed_t ST;
966 kmp_info_t *th = __kmp_threads[gtid];
968 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
969 if (!th->th.th_team->t.t_serialized) {
971 dispatch_private_info_template<UT> *pr =
972 reinterpret_cast<dispatch_private_info_template<UT> *>(
973 th->th.th_dispatch->th_dispatch_pr_current);
974 dispatch_shared_info_template<UT> volatile *sh =
975 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
976 th->th.th_dispatch->th_dispatch_sh_current);
977 KMP_DEBUG_ASSERT(pr);
978 KMP_DEBUG_ASSERT(sh);
979 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
980 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
982 if (pr->ordered_bumped) {
985 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
987 pr->ordered_bumped = 0;
989 UT lower = pr->u.p.ordered_lower;
994 // create format specifiers before the debug output
995 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
996 "ordered_iteration:%%%s lower:%%%s\n",
997 traits_t<UT>::spec, traits_t<UT>::spec);
998 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
999 __kmp_str_free(&buff);
1003 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1004 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1005 KMP_MB(); /* is this necessary? */
1009 // create format specifiers before the debug output
1010 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1011 "ordered_iteration:%%%s lower:%%%s\n",
1012 traits_t<UT>::spec, traits_t<UT>::spec);
1013 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1014 __kmp_str_free(&buff);
1018 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1021 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1024 #ifdef KMP_GOMP_COMPAT
1026 template <typename UT>
1027 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1028 typedef typename traits_t<UT>::signed_t ST;
1029 kmp_info_t *th = __kmp_threads[gtid];
1031 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1032 if (!th->th.th_team->t.t_serialized) {
1034 dispatch_private_info_template<UT> *pr =
1035 reinterpret_cast<dispatch_private_info_template<UT> *>(
1036 th->th.th_dispatch->th_dispatch_pr_current);
1037 dispatch_shared_info_template<UT> volatile *sh =
1038 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1039 th->th.th_dispatch->th_dispatch_sh_current);
1040 KMP_DEBUG_ASSERT(pr);
1041 KMP_DEBUG_ASSERT(sh);
1042 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1043 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1045 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1046 UT lower = pr->u.p.ordered_lower;
1047 UT upper = pr->u.p.ordered_upper;
1048 UT inc = upper - lower + 1;
1050 if (pr->ordered_bumped == inc) {
1053 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1055 pr->ordered_bumped = 0;
1057 inc -= pr->ordered_bumped;
1062 // create format specifiers before the debug output
1063 buff = __kmp_str_format(
1064 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1065 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1066 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1067 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1068 __kmp_str_free(&buff);
1072 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1073 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1075 KMP_MB(); /* is this necessary? */
1076 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1077 "ordered_bumped to zero\n",
1079 pr->ordered_bumped = 0;
1080 //!!!!! TODO check if the inc should be unsigned, or signed???
1084 // create format specifiers before the debug output
1085 buff = __kmp_str_format(
1086 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1087 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1088 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1089 traits_t<UT>::spec);
1091 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1092 __kmp_str_free(&buff);
1096 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1100 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1103 #endif /* KMP_GOMP_COMPAT */
1105 template <typename T>
1106 int __kmp_dispatch_next_algorithm(int gtid,
1107 dispatch_private_info_template<T> *pr,
1108 dispatch_shared_info_template<T> volatile *sh,
1109 kmp_int32 *p_last, T *p_lb, T *p_ub,
1110 typename traits_t<T>::signed_t *p_st, T nproc,
1112 typedef typename traits_t<T>::unsigned_t UT;
1113 typedef typename traits_t<T>::signed_t ST;
1114 typedef typename traits_t<T>::floating_t DBL;
1119 UT limit, trip, init;
1120 kmp_info_t *th = __kmp_threads[gtid];
1121 kmp_team_t *team = th->th.th_team;
1123 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1124 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1125 KMP_DEBUG_ASSERT(pr);
1126 KMP_DEBUG_ASSERT(sh);
1127 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1131 // create format specifiers before the debug output
1133 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1134 "sh:%%p nproc:%%%s tid:%%%s\n",
1135 traits_t<T>::spec, traits_t<T>::spec);
1136 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1137 __kmp_str_free(&buff);
1142 if (pr->u.p.tc == 0) {
1144 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1150 switch (pr->schedule) {
1151 #if (KMP_STATIC_STEAL_ENABLED)
1152 case kmp_sch_static_steal: {
1153 T chunk = pr->u.p.parm1;
1156 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1159 trip = pr->u.p.tc - 1;
1161 if (traits_t<T>::type_size > 4) {
1162 // use lock for 8-byte and CAS for 4-byte induction
1163 // variable. TODO (optional): check and use 16-byte CAS
1164 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1165 KMP_DEBUG_ASSERT(lck != NULL);
1166 if (pr->u.p.count < (UT)pr->u.p.ub) {
1167 __kmp_acquire_lock(lck, gtid);
1168 // try to get own chunk of iterations
1169 init = (pr->u.p.count)++;
1170 status = (init < (UT)pr->u.p.ub);
1171 __kmp_release_lock(lck, gtid);
1173 status = 0; // no own chunks
1175 if (!status) { // try to steal
1176 kmp_info_t **other_threads = team->t.t_threads;
1177 int while_limit = nproc; // nproc attempts to find a victim
1178 int while_index = 0;
1179 // TODO: the algorithm for searching for a victim
1180 // should be cleaned up and measured
1181 while ((!status) && (while_limit != ++while_index)) {
1183 T victimIdx = pr->u.p.parm4;
1184 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1185 dispatch_private_info_template<T> *victim =
1186 reinterpret_cast<dispatch_private_info_template<T> *>(
1187 other_threads[victimIdx]
1188 ->th.th_dispatch->th_dispatch_pr_current);
1189 while ((victim == NULL || victim == pr ||
1190 (*(volatile T *)&victim->u.p.static_steal_counter !=
1191 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1192 oldVictimIdx != victimIdx) {
1193 victimIdx = (victimIdx + 1) % nproc;
1194 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1195 other_threads[victimIdx]
1196 ->th.th_dispatch->th_dispatch_pr_current);
1198 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1199 *(volatile T *)&pr->u.p.static_steal_counter)) {
1200 continue; // try once more (nproc attempts in total)
1201 // no victim is ready yet to participate in stealing
1202 // because all victims are still in kmp_init_dispatch
1204 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1205 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1206 continue; // not enough chunks to steal, goto next victim
1209 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1210 KMP_ASSERT(lck != NULL);
1211 __kmp_acquire_lock(lck, gtid);
1212 limit = victim->u.p.ub; // keep initial ub
1213 if (victim->u.p.count >= limit ||
1214 (remaining = limit - victim->u.p.count) < 2) {
1215 __kmp_release_lock(lck, gtid);
1216 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1217 continue; // not enough chunks to steal
1219 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1221 if (remaining > 3) {
1222 // steal 1/4 of remaining
1223 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1224 init = (victim->u.p.ub -= (remaining >> 2));
1226 // steal 1 chunk of 2 or 3 remaining
1227 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1228 init = (victim->u.p.ub -= 1);
1230 __kmp_release_lock(lck, gtid);
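// Example: if the victim still has remaining = 12 unclaimed chunks, the thief
// lowers the victim's ub by 3 (remaining >> 2) and takes those 3 chunk indices
// for itself, executing the first one immediately (count = init + 1 below).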
1232 KMP_DEBUG_ASSERT(init + 1 <= limit);
1233 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1236 // now update own count and ub with the stolen range (the init chunk is taken immediately)
1237 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1238 pr->u.p.count = init + 1;
1240 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1241 } // while (search for victim)
1242 } // if (try to find victim and steal)
1244 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1252 // All operations on 'count' or 'ub' must be combined atomically
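// Packing count and ub into a single 64-bit value means claiming the next
// chunk (count++) and a thief shrinking ub can each be done with one CAS, so
// the two fields are never observed out of sync.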
1255 union_i4 vold, vnew;
1256 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1259 while (!KMP_COMPARE_AND_STORE_ACQ64(
1260 (volatile kmp_int64 *)&pr->u.p.count,
1261 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1262 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1264 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1269 init = vnew.p.count;
1270 status = (init < (UT)vnew.p.ub);
1274 kmp_info_t **other_threads = team->t.t_threads;
1275 int while_limit = nproc; // nproc attempts to find a victim
1276 int while_index = 0;
1278 // TODO: the algorithm for searching for a victim
1279 // should be cleaned up and measured
1280 while ((!status) && (while_limit != ++while_index)) {
1281 union_i4 vold, vnew;
1282 kmp_int32 remaining;
1283 T victimIdx = pr->u.p.parm4;
1284 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1285 dispatch_private_info_template<T> *victim =
1286 reinterpret_cast<dispatch_private_info_template<T> *>(
1287 other_threads[victimIdx]
1288 ->th.th_dispatch->th_dispatch_pr_current);
1289 while ((victim == NULL || victim == pr ||
1290 (*(volatile T *)&victim->u.p.static_steal_counter !=
1291 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1292 oldVictimIdx != victimIdx) {
1293 victimIdx = (victimIdx + 1) % nproc;
1294 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1295 other_threads[victimIdx]
1296 ->th.th_dispatch->th_dispatch_pr_current);
1298 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1299 *(volatile T *)&pr->u.p.static_steal_counter)) {
1300 continue; // try once more (nproc attempts in total)
1301 // no victim is ready yet to participate in stealing
1302 // because all victims are still in kmp_init_dispatch
1304 pr->u.p.parm4 = victimIdx; // new victim found
1305 while (1) { // CAS loop if victim has enough chunks to steal
1306 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1309 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1310 if (vnew.p.count >= (UT)vnew.p.ub ||
1311 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1312 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1313 break; // not enough chunks to steal, goto next victim
1315 if (remaining > 3) {
1316 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1318 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1320 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1321 // TODO: Should this be acquire or release?
1322 if (KMP_COMPARE_AND_STORE_ACQ64(
1323 (volatile kmp_int64 *)&victim->u.p.count,
1324 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1325 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1326 // stealing succeeded
1327 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1328 vold.p.ub - vnew.p.ub);
1331 // now update own count and ub
1333 vold.p.count = init + 1;
1335 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1337 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1340 } // if (check CAS result)
1341 KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1342 } // while (try to steal from particular victim)
1343 } // while (search for victim)
1344 } // if (try to find victim and steal)
1345 } // if (4-byte induction variable)
1352 start = pr->u.p.parm2;
1354 limit = chunk + init - 1;
1356 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1358 KMP_DEBUG_ASSERT(init <= trip);
1359 if ((last = (limit >= trip)) != 0)
1365 *p_lb = start + init;
1366 *p_ub = start + limit;
1368 *p_lb = start + init * incr;
1369 *p_ub = start + limit * incr;
1372 if (pr->flags.ordered) {
1373 pr->u.p.ordered_lower = init;
1374 pr->u.p.ordered_upper = limit;
1379 #endif // ( KMP_STATIC_STEAL_ENABLED )
1380 case kmp_sch_static_balanced: {
1383 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1385 /* check if thread has any iteration to do */
1386 if ((status = !pr->u.p.count) != 0) {
1390 last = pr->u.p.parm1;
1393 } else { /* no iterations to do */
1394 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1398 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1400 case kmp_sch_static_chunked: {
1403 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1404 "kmp_sch_static_[affinity|chunked] case\n",
1406 parm1 = pr->u.p.parm1;
1408 trip = pr->u.p.tc - 1;
1409 init = parm1 * (pr->u.p.count + tid);
1411 if ((status = (init <= trip)) != 0) {
1414 limit = parm1 + init - 1;
1416 if ((last = (limit >= trip)) != 0)
1422 pr->u.p.count += nproc;
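// Example: with chunk (parm1) = 5 and nproc = 4, the thread with tid = 2 gets
// iterations 10..14 on its first call and 30..34 on its second, i.e. chunks
// are assigned round-robin with a stride of nproc chunks per thread.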
1425 *p_lb = start + init;
1426 *p_ub = start + limit;
1428 *p_lb = start + init * incr;
1429 *p_ub = start + limit * incr;
1432 if (pr->flags.ordered) {
1433 pr->u.p.ordered_lower = init;
1434 pr->u.p.ordered_upper = limit;
1440 case kmp_sch_dynamic_chunked: {
1441 T chunk = pr->u.p.parm1;
1445 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1448 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1449 trip = pr->u.p.tc - 1;
1451 if ((status = (init <= trip)) == 0) {
1458 limit = chunk + init - 1;
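// Example: with chunk = 4, a thread that bumps the shared iteration counter
// from 3 to 4 gets init = 12 and limit = 15, i.e. canonical iterations 12..15
// (clamped to trip), mapped back to user space below via start + init * incr.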
1461 if ((last = (limit >= trip)) != 0)
1468 *p_lb = start + init;
1469 *p_ub = start + limit;
1471 *p_lb = start + init * incr;
1472 *p_ub = start + limit * incr;
1475 if (pr->flags.ordered) {
1476 pr->u.p.ordered_lower = init;
1477 pr->u.p.ordered_upper = limit;
1483 case kmp_sch_guided_iterative_chunked: {
1484 T chunkspec = pr->u.p.parm1;
1485 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1489 // Start atomic part of calculations
1491 ST remaining; // signed, because can be < 0
1492 init = sh->u.s.iteration; // shared value
1493 remaining = trip - init;
1494 if (remaining <= 0) { // AC: need to compare with 0 first
1495 // nothing to do, don't try atomic op
1500 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1501 // use dynamic-style schedule
1502 // atomically increment iterations, get old value
1503 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1505 remaining = trip - init;
1506 if (remaining <= 0) {
1507 status = 0; // all iterations got by other threads
1509 // got some iterations to work on
1511 if ((T)remaining > chunkspec) {
1512 limit = init + chunkspec - 1;
1514 last = 1; // the last chunk
1515 limit = init + remaining - 1;
1521 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
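// Example with illustrative values: if parm3 holds 0.125 (guided_flt_param
// over nproc = 4), remaining = 800 makes this thread try to claim roughly 100
// iterations in one CAS; if another thread changed sh->u.s.iteration in the
// meantime, the CAS fails and the surrounding loop retries with the updated
// remaining count.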
1522 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1523 (ST)init, (ST)limit)) {
1524 // CAS was successful, chunk obtained
1535 *p_lb = start + init * incr;
1536 *p_ub = start + limit * incr;
1537 if (pr->flags.ordered) {
1538 pr->u.p.ordered_lower = init;
1539 pr->u.p.ordered_upper = limit;
1551 case kmp_sch_guided_simd: {
1552 // same as iterative but curr-chunk adjusted to be multiple of given
1554 T chunk = pr->u.p.parm1;
1556 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1559 // Start atomic part of calculations
1561 ST remaining; // signed, because can be < 0
1562 init = sh->u.s.iteration; // shared value
1563 remaining = trip - init;
1564 if (remaining <= 0) { // AC: need to compare with 0 first
1565 status = 0; // nothing to do, don't try atomic op
1568 KMP_DEBUG_ASSERT(init % chunk == 0);
1569 // compare with K*nproc*(chunk+1), K=2 by default
1570 if ((T)remaining < pr->u.p.parm2) {
1571 // use dynamic-style schedule
1572 // atomically increment iterations, get old value
1573 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1575 remaining = trip - init;
1576 if (remaining <= 0) {
1577 status = 0; // all iterations got by other threads
1579 // got some iterations to work on
1581 if ((T)remaining > chunk) {
1582 limit = init + chunk - 1;
1584 last = 1; // the last chunk
1585 limit = init + remaining - 1;
1590 // divide by K*nproc
1591 UT span = remaining * (*(double *)&pr->u.p.parm3);
1592 UT rem = span % chunk;
1593 if (rem) // adjust so that span%chunk == 0
1594 span += chunk - rem;
1595 limit = init + span;
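// Example with illustrative values: remaining = 1000 and parm3 = 0.125 give
// span = 125; with chunk = 8 the remainder 5 bumps span to 128, keeping the
// claimed range a multiple of the simd-derived chunk size.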
1596 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1597 (ST)init, (ST)limit)) {
1598 // CAS was successful, chunk obtained
1609 *p_lb = start + init * incr;
1610 *p_ub = start + limit * incr;
1611 if (pr->flags.ordered) {
1612 pr->u.p.ordered_lower = init;
1613 pr->u.p.ordered_upper = limit;
1623 #endif // OMP_45_ENABLED
1625 case kmp_sch_guided_analytical_chunked: {
1626 T chunkspec = pr->u.p.parm1;
1628 #if KMP_USE_X87CONTROL
1629 /* for storing original FPCW value for Windows* OS on
1630 IA-32 architecture 8-byte version */
1631 unsigned int oldFpcw;
1632 unsigned int fpcwSet = 0;
1634 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1635 "kmp_sch_guided_analytical_chunked case\n",
1640 KMP_DEBUG_ASSERT(nproc > 1);
1641 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1643 while (1) { /* this while loop is a safeguard against unexpected zero
1645 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1646 if (chunkIdx >= (UT)pr->u.p.parm2) {
1648 /* use dynamic-style scheduling */
1649 init = chunkIdx * chunkspec + pr->u.p.count;
1650 /* need to verify init > 0 in case of overflow in the above
1652 if ((status = (init > 0 && init <= trip)) != 0) {
1653 limit = init + chunkspec - 1;
1655 if ((last = (limit >= trip)) != 0)
1660 /* use exponential-style scheduling */
1661 /* The following check works around the lack of long double precision on
1663 It also covers the possible effect that init != 0 for chunkIdx == 0.
1665 #if KMP_USE_X87CONTROL
1666 /* If we haven't already done so, save original
1667 FPCW and set precision to 64-bit, as Windows* OS
1668 on IA-32 architecture defaults to 53-bit */
1670 oldFpcw = _control87(0, 0);
1671 _control87(_PC_64, _MCW_PC);
1676 init = __kmp_dispatch_guided_remaining<T>(
1677 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1678 KMP_DEBUG_ASSERT(init);
1682 limit = trip - __kmp_dispatch_guided_remaining<T>(
1683 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1684 KMP_ASSERT(init <= limit);
1686 KMP_DEBUG_ASSERT(limit <= trip);
1693 #if KMP_USE_X87CONTROL
1694 /* restore FPCW if necessary
1695 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1697 if (fpcwSet && (oldFpcw & fpcwSet))
1698 _control87(oldFpcw, _MCW_PC);
1705 *p_lb = start + init * incr;
1706 *p_ub = start + limit * incr;
1707 if (pr->flags.ordered) {
1708 pr->u.p.ordered_lower = init;
1709 pr->u.p.ordered_upper = limit;
1720 case kmp_sch_trapezoidal: {
1722 T parm2 = pr->u.p.parm2;
1723 T parm3 = pr->u.p.parm3;
1724 T parm4 = pr->u.p.parm4;
1726 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1729 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1731 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1732 trip = pr->u.p.tc - 1;
1734 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1741 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1744 if ((last = (limit >= trip)) != 0)
1751 *p_lb = start + init;
1752 *p_ub = start + limit;
1754 *p_lb = start + init * incr;
1755 *p_ub = start + limit * incr;
1758 if (pr->flags.ordered) {
1759 pr->u.p.ordered_lower = init;
1760 pr->u.p.ordered_upper = limit;
1766 status = 0; // to avoid complaints on uninitialized variable use
1767 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1768 KMP_HNT(GetNewerLibrary), // Hint
1769 __kmp_msg_null // Variadic argument list terminator
1776 if (pr->flags.ordered) {
1778 // create format specifiers before the debug output
1779 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1780 "ordered_lower:%%%s ordered_upper:%%%s\n",
1781 traits_t<UT>::spec, traits_t<UT>::spec);
1782 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1783 __kmp_str_free(&buff);
1787 // create format specifiers before the debug output
1788 buff = __kmp_str_format(
1789 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1790 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1791 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1792 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1793 __kmp_str_free(&buff);
1799 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1800 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1802 #if OMPT_SUPPORT && OMPT_OPTIONAL
1803 #define OMPT_LOOP_END \
1804 if (status == 0) { \
1805 if (ompt_enabled.ompt_callback_work) { \
1806 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1807 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1808 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1809 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1810 &(task_info->task_data), 0, codeptr); \
1813 // TODO: implement count
1815 #define OMPT_LOOP_END // no-op
1818 #if KMP_STATS_ENABLED
1819 #define KMP_STATS_LOOP_END \
1821 kmp_int64 u, l, t, i; \
1822 l = (kmp_int64)(*p_lb); \
1823 u = (kmp_int64)(*p_ub); \
1824 i = (kmp_int64)(pr->u.p.st); \
1825 if (status == 0) { \
1827 KMP_POP_PARTITIONED_TIMER(); \
1828 } else if (i == 1) { \
1833 } else if (i < 0) { \
1835 t = (l - u) / (-i) + 1; \
1840 t = (u - l) / i + 1; \
1844 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1847 #define KMP_STATS_LOOP_END /* Nothing */
1850 template <typename T>
1851 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1853 typename traits_t<T>::signed_t *p_st
1854 #if OMPT_SUPPORT && OMPT_OPTIONAL
1860 typedef typename traits_t<T>::unsigned_t UT;
1861 typedef typename traits_t<T>::signed_t ST;
1862 // This is potentially slightly misleading: schedule(runtime) will appear here
1863 // even if the actual runtime schedule is static. (Which points out a
1864 // disadvantage of schedule(runtime): even when static scheduling is used, it
1865 // costs more than a compile-time choice to use static scheduling would.)
1866 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1869 dispatch_private_info_template<T> *pr;
1870 kmp_info_t *th = __kmp_threads[gtid];
1871 kmp_team_t *team = th->th.th_team;
1873 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1876 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1877 gtid, p_lb, p_ub, p_st, p_last));
1879 if (team->t.t_serialized) {
1880 /* NOTE: serialize this dispatch because we are not at the active level */
1881 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1882 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1883 KMP_DEBUG_ASSERT(pr);
1885 if ((status = (pr->u.p.tc != 0)) == 0) {
1888 // if ( p_last != NULL )
1892 if (__kmp_env_consistency_check) {
1893 if (pr->pushed_ws != ct_none) {
1894 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1897 } else if (pr->flags.nomerge) {
1900 UT limit, trip, init;
1902 T chunk = pr->u.p.parm1;
1904 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1907 init = chunk * pr->u.p.count++;
1908 trip = pr->u.p.tc - 1;
1910 if ((status = (init <= trip)) == 0) {
1913 // if ( p_last != NULL )
1917 if (__kmp_env_consistency_check) {
1918 if (pr->pushed_ws != ct_none) {
1919 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1924 limit = chunk + init - 1;
1927 if ((last = (limit >= trip)) != 0) {
1930 pr->u.p.last_upper = pr->u.p.ub;
1931 #endif /* KMP_OS_WINDOWS */
1938 *p_lb = start + init;
1939 *p_ub = start + limit;
1941 *p_lb = start + init * incr;
1942 *p_ub = start + limit * incr;
1945 if (pr->flags.ordered) {
1946 pr->u.p.ordered_lower = init;
1947 pr->u.p.ordered_upper = limit;
1951 // create format specifiers before the debug output
1952 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1953 "ordered_lower:%%%s ordered_upper:%%%s\n",
1954 traits_t<UT>::spec, traits_t<UT>::spec);
1955 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1956 pr->u.p.ordered_upper));
1957 __kmp_str_free(&buff);
1967 pr->u.p.last_upper = *p_ub;
1968 #endif /* KMP_OS_WINDOWS */
1977 // create format specifiers before the debug output
1978 buff = __kmp_str_format(
1979 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1980 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1981 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1982 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1983 __kmp_str_free(&buff);
1986 #if INCLUDE_SSC_MARKS
1987 SSC_MARK_DISPATCH_NEXT();
1994 dispatch_shared_info_template<T> volatile *sh;
1996 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1997 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1999 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2000 th->th.th_dispatch->th_dispatch_pr_current);
2001 KMP_DEBUG_ASSERT(pr);
2002 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2003 th->th.th_dispatch->th_dispatch_sh_current);
2004 KMP_DEBUG_ASSERT(sh);
2006 #if KMP_USE_HIER_SCHED
2007 if (pr->flags.use_hier)
2008 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2010 #endif // KMP_USE_HIER_SCHED
2011 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2012 p_st, th->th.th_team_nproc,
2013 th->th.th_info.ds.ds_tid);
2014 // status == 0: no more iterations to execute
2018 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2022 // create format specifiers before the debug output
2023 buff = __kmp_str_format(
2024 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2025 traits_t<UT>::spec);
2026 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2027 __kmp_str_free(&buff);
2031 #if KMP_USE_HIER_SCHED
2032 pr->flags.use_hier = FALSE;
2034 if ((ST)num_done == th->th.th_team_nproc - 1) {
2035 #if (KMP_STATIC_STEAL_ENABLED)
2036 if (pr->schedule == kmp_sch_static_steal &&
2037 traits_t<T>::type_size > 4) {
2039 kmp_info_t **other_threads = team->t.t_threads;
2040 // loop complete, safe to destroy locks used for stealing
2041 for (i = 0; i < th->th.th_team_nproc; ++i) {
2042 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2043 KMP_ASSERT(lck != NULL);
2044 __kmp_destroy_lock(lck);
2046 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2050 /* NOTE: release this buffer to be reused */
2052 KMP_MB(); /* Flush all pending memory write invalidates. */
2054 sh->u.s.num_done = 0;
2055 sh->u.s.iteration = 0;
2057 /* TODO replace with general release procedure? */
2058 if (pr->flags.ordered) {
2059 sh->u.s.ordered_iteration = 0;
2062 KMP_MB(); /* Flush all pending memory write invalidates. */
2064 sh->buffer_index += __kmp_dispatch_num_buffers;
2065 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2066 gtid, sh->buffer_index));
2068 KMP_MB(); /* Flush all pending memory write invalidates. */
2071 if (__kmp_env_consistency_check) {
2072 if (pr->pushed_ws != ct_none) {
2073 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2077 th->th.th_dispatch->th_deo_fcn = NULL;
2078 th->th.th_dispatch->th_dxo_fcn = NULL;
2079 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2080 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2081 } // if (status == 0)
2084 pr->u.p.last_upper = pr->u.p.ub;
2086 #endif /* KMP_OS_WINDOWS */
2087 if (p_last != NULL && status != 0)
2094 // create format specifiers before the debug output
2095 buff = __kmp_str_format(
2096 "__kmp_dispatch_next: T#%%d normal case: "
2097 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2098 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2099 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2100 (p_last ? *p_last : 0), status));
2101 __kmp_str_free(&buff);
2104 #if INCLUDE_SSC_MARKS
2105 SSC_MARK_DISPATCH_NEXT();
2112 template <typename T>
2113 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2114 kmp_int32 *plastiter, T *plower, T *pupper,
2115 typename traits_t<T>::signed_t incr) {
2116 typedef typename traits_t<T>::unsigned_t UT;
2123 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2124 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2126 typedef typename traits_t<T>::signed_t ST;
2129 // create format specifiers before the debug output
2130 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2131 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2132 traits_t<T>::spec, traits_t<T>::spec,
2133 traits_t<ST>::spec, traits_t<T>::spec);
2134 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2135 __kmp_str_free(&buff);
2139 if (__kmp_env_consistency_check) {
2141 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2144 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2145 // The loop is illegal.
2146 // Some zero-trip loops maintained by compiler, e.g.:
2147 // for(i=10;i<0;++i) // lower >= upper - run-time check
2148 // for(i=0;i>10;--i) // lower <= upper - run-time check
2149 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2150 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2151 // Compiler does not check the following illegal loops:
2152 // for(i=0;i<10;i+=incr) // where incr<0
2153 // for(i=10;i>0;i-=incr) // where incr<0
2154 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2157 th = __kmp_threads[gtid];
2158 team = th->th.th_team;
2160 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2161 nteams = th->th.th_teams_size.nteams;
2163 team_id = team->t.t_master_tid;
2164 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2166 // compute global trip count
2168 trip_count = *pupper - *plower + 1;
2169 } else if (incr == -1) {
2170 trip_count = *plower - *pupper + 1;
2171 } else if (incr > 0) {
2172 // upper-lower can exceed the limit of signed type
2173 trip_count = (UT)(*pupper - *plower) / incr + 1;
2175 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2178 if (trip_count <= nteams) {
2179 KMP_DEBUG_ASSERT(
2180 __kmp_static == kmp_sch_static_greedy ||
2181 __kmp_static ==
2182 kmp_sch_static_balanced); // Unknown static scheduling type.
2183 // only some teams get single iteration, others get nothing
2184 if (team_id < trip_count) {
2185 *pupper = *plower = *plower + team_id * incr;
2186 } else {
2187 *plower = *pupper + incr; // zero-trip loop
2188 }
2189 if (plastiter != NULL)
2190 *plastiter = (team_id == trip_count - 1);
2191 } else {
2192 if (__kmp_static == kmp_sch_static_balanced) {
2193 UT chunk = trip_count / nteams;
2194 UT extras = trip_count % nteams;
2195 *plower +=
2196 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2197 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2198 if (plastiter != NULL)
2199 *plastiter = (team_id == nteams - 1);
2200 } else {
2201 T chunk_inc_count =
2202 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2203 T upper = *pupper;
2204 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2205 // Unknown static scheduling type.
2206 *plower += team_id * chunk_inc_count;
2207 *pupper = *plower + chunk_inc_count - incr;
2208 // Check/correct bounds if needed
2209 if (incr > 0) {
2210 if (*pupper < *plower)
2211 *pupper = traits_t<T>::max_value;
2212 if (plastiter != NULL)
2213 *plastiter = *plower <= upper && *pupper > upper - incr;
2214 if (*pupper > upper)
2215 *pupper = upper; // tracker C73258
2216 } else {
2217 if (*pupper > *plower)
2218 *pupper = traits_t<T>::min_value;
2219 if (plastiter != NULL)
2220 *plastiter = *plower >= upper && *pupper < upper - incr;
2221 if (*pupper < upper)
2222 *pupper = upper; // tracker C73258
2228 //-----------------------------------------------------------------------------
2229 // Dispatch routines
2230 // Transfer call to template< type T >
2231 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2232 // T lb, T ub, ST st, ST chunk )
2236 @ingroup WORK_SHARING
2238 @param loc Source location
2239 @param gtid Global thread id
2240 @param schedule Schedule type
2241 @param lb Lower bound
2242 @param ub Upper bound
2243 @param st Step (or increment if you prefer)
2244 @param chunk The chunk size to block with
2246 This function prepares the runtime to start a dynamically scheduled for loop,
2247 saving the loop arguments.
2248 These functions are all identical apart from the types of the arguments.
2251 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2252 enum sched_type schedule, kmp_int32 lb,
2253 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2254 KMP_DEBUG_ASSERT(__kmp_init_serial);
2255 #if OMPT_SUPPORT && OMPT_OPTIONAL
2256 OMPT_STORE_RETURN_ADDRESS(gtid);
2258 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
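// A minimal sketch, assuming a loop such as
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = 0; i < n; ++i) ...
// One plausible initialization call a compiler could emit is shown below; the
// inclusive upper bound and the kmp_sch_dynamic_chunked enumerator follow this
// file's conventions, but the exact lowering is compiler-specific.
//
//   __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked,
//                          /*lb=*/0, /*ub=*/n - 1, /*st=*/1, /*chunk=*/4);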
2261 See @ref __kmpc_dispatch_init_4
2263 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2264 enum sched_type schedule, kmp_uint32 lb,
2265 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2266 KMP_DEBUG_ASSERT(__kmp_init_serial);
2267 #if OMPT_SUPPORT && OMPT_OPTIONAL
2268 OMPT_STORE_RETURN_ADDRESS(gtid);
2270 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2274 See @ref __kmpc_dispatch_init_4
2276 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2277 enum sched_type schedule, kmp_int64 lb,
2278 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2279 KMP_DEBUG_ASSERT(__kmp_init_serial);
2280 #if OMPT_SUPPORT && OMPT_OPTIONAL
2281 OMPT_STORE_RETURN_ADDRESS(gtid);
2283 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2287 See @ref __kmpc_dispatch_init_4
2289 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2290 enum sched_type schedule, kmp_uint64 lb,
2291 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2292 KMP_DEBUG_ASSERT(__kmp_init_serial);
2293 #if OMPT_SUPPORT && OMPT_OPTIONAL
2294 OMPT_STORE_RETURN_ADDRESS(gtid);
2296 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2300 See @ref __kmpc_dispatch_init_4
2302 These functions differ from the __kmpc_dispatch_init set in that they are
2303 called for the composite distribute parallel for construct, so the per-team
2304 iteration space must be computed before regular iteration dispatching begins.
2306 These functions are all identical apart from the types of the arguments.
2308 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2309 enum sched_type schedule, kmp_int32 *p_last,
2310 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2311 kmp_int32 chunk) {
2312 KMP_DEBUG_ASSERT(__kmp_init_serial);
2313 #if OMPT_SUPPORT && OMPT_OPTIONAL
2314 OMPT_STORE_RETURN_ADDRESS(gtid);
2316 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2317 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2320 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2321 enum sched_type schedule, kmp_int32 *p_last,
2322 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2323 kmp_int32 chunk) {
2324 KMP_DEBUG_ASSERT(__kmp_init_serial);
2325 #if OMPT_SUPPORT && OMPT_OPTIONAL
2326 OMPT_STORE_RETURN_ADDRESS(gtid);
2328 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2329 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2332 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2333 enum sched_type schedule, kmp_int32 *p_last,
2334 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2335 kmp_int64 chunk) {
2336 KMP_DEBUG_ASSERT(__kmp_init_serial);
2337 #if OMPT_SUPPORT && OMPT_OPTIONAL
2338 OMPT_STORE_RETURN_ADDRESS(gtid);
2340 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2341 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2344 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2345 enum sched_type schedule, kmp_int32 *p_last,
2346 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2347 kmp_int64 chunk) {
2348 KMP_DEBUG_ASSERT(__kmp_init_serial);
2349 #if OMPT_SUPPORT && OMPT_OPTIONAL
2350 OMPT_STORE_RETURN_ADDRESS(gtid);
2352 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2353 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
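// A minimal sketch of the call ordering for the composite construct, e.g.
//   #pragma omp distribute parallel for schedule(dynamic, 4)
// The dist entry points first narrow [lb, ub] to this team's slice via
// __kmp_dist_get_bounds and then initialize regular dispatching on that slice,
// so a plausible caller-side sequence (argument values are illustrative) is:
//
//   kmp_int32 last;
//   __kmpc_dist_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked, &last,
//                               /*lb=*/0, /*ub=*/n - 1, /*st=*/1, /*chunk=*/4);
//   // ...followed by the usual __kmpc_dispatch_next_4 loop over the team's slice.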
2357 @param loc Source code location
2358 @param gtid Global thread id
2359 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2360 otherwise
2361 @param p_lb Pointer to the lower bound for the next chunk of work
2362 @param p_ub Pointer to the upper bound for the next chunk of work
2363 @param p_st Pointer to the stride for the next chunk of work
2364 @return one if there is work to be done, zero otherwise
2366 Get the next dynamically allocated chunk of work for this thread.
2367 If there is no more work, then the lb,ub and stride need not be modified.
2369 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2370 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2371 #if OMPT_SUPPORT && OMPT_OPTIONAL
2372 OMPT_STORE_RETURN_ADDRESS(gtid);
2374 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376 ,
2377 OMPT_LOAD_RETURN_ADDRESS(gtid)
2378 #endif
2379 );
2380 }
2383 See @ref __kmpc_dispatch_next_4
2385 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2386 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2387 kmp_uint32 *p_st) {
2388 #if OMPT_SUPPORT && OMPT_OPTIONAL
2389 OMPT_STORE_RETURN_ADDRESS(gtid);
2391 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
2393 ,
2394 OMPT_LOAD_RETURN_ADDRESS(gtid)
2395 #endif
2396 );
2397 }
2400 See @ref __kmpc_dispatch_next_4
2402 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2403 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405 OMPT_STORE_RETURN_ADDRESS(gtid);
2407 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2408 #if OMPT_SUPPORT && OMPT_OPTIONAL
2409 ,
2410 OMPT_LOAD_RETURN_ADDRESS(gtid)
2411 #endif
2412 );
2413 }
2416 See @ref __kmpc_dispatch_next_4
2418 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2419 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2420 kmp_uint64 *p_st) {
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422 OMPT_STORE_RETURN_ADDRESS(gtid);
2424 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426 ,
2427 OMPT_LOAD_RETURN_ADDRESS(gtid)
2428 #endif
2429 );
2430 }
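// A rough sketch of how a compiler-generated outlined function might drive a
// dynamically scheduled loop through these entry points; the lowering details
// vary by compiler and body(i) is a placeholder for the loop body.
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked,
//                          /*lb=*/0, /*ub=*/n - 1, /*st=*/1, /*chunk=*/4);
//   while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);
//   }
//   // For loops with an ordered clause the compiler typically also emits a
//   // __kmpc_dispatch_fini_4 call per chunk; see the fini entry points below.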
2433 @param loc Source code location
2434 @param gtid Global thread id
2436 Mark the end of a dynamic loop.
2438 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2439 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2443 See @ref __kmpc_dispatch_fini_4
2445 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2446 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2450 See @ref __kmpc_dispatch_fini_4
2452 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2453 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2457 See @ref __kmpc_dispatch_fini_4
2459 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2460 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2464 //-----------------------------------------------------------------------------
2465 // Non-template routines from kmp_dispatch.cpp used in other sources
2467 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2468 return value == checker;
2471 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2472 return value != checker;
2475 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2476 return value < checker;
2479 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2480 return value >= checker;
2483 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2484 return value <= checker;
2487 kmp_uint32
2488 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2489 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2490 void *obj // Higher-level synchronization object, or NULL.
2491 ) {
2492 // note: we may not belong to a team at this point
2493 volatile kmp_uint32 *spin = spinner;
2494 kmp_uint32 check = checker;
2495 kmp_uint32 spins;
2496 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2497 kmp_uint32 r;
2499 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2500 KMP_INIT_YIELD(spins);
2501 // main wait spin loop
2502 while (!f(r = TCR_4(*spin), check)) {
2503 KMP_FSYNC_SPIN_PREPARE(obj);
2504 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2505 split. It causes problems with infinite recursion because of exit lock */
2506 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2507 __kmp_abort_thread(); */
2509 /* if we have waited a bit, or are oversubscribed, yield */
2510 /* pause is in the following code */
2511 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2512 KMP_YIELD_SPIN(spins);
2513 }
2514 KMP_FSYNC_SPIN_ACQUIRED(obj);
2515 return r;
2516 }
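// A minimal usage sketch for the spin-wait helper above, pairing it with one of
// the predicate routines defined earlier in this section; ready_flag is a
// hypothetical variable, not a runtime field.
//
//   volatile kmp_uint32 ready_flag = 0; // another thread stores 1 when ready
//   __kmp_wait_yield_4(&ready_flag, 1, __kmp_eq_4,
//                      NULL /* no higher-level synchronization object */);
//   // Returns once __kmp_eq_4(value, 1) holds for the value read from the flag,
//   // yielding while the machine is oversubscribed.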
2518 void __kmp_wait_yield_4_ptr(
2519 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2520 void *obj // Higher-level synchronization object, or NULL.
2521 ) {
2522 // note: we may not belong to a team at this point
2523 void *spin = spinner;
2524 kmp_uint32 check = checker;
2525 kmp_uint32 spins;
2526 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2528 KMP_FSYNC_SPIN_INIT(obj, spin);
2529 KMP_INIT_YIELD(spins);
2530 // main wait spin loop
2531 while (!f(spin, check)) {
2532 KMP_FSYNC_SPIN_PREPARE(obj);
2533 /* if we have waited a bit, or are oversubscribed, yield */
2534 /* pause is in the following code */
2535 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2536 KMP_YIELD_SPIN(spins);
2537 }
2538 KMP_FSYNC_SPIN_ACQUIRED(obj);
2539 }
2543 #ifdef KMP_GOMP_COMPAT
2545 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2546 enum sched_type schedule, kmp_int32 lb,
2547 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2548 int push_ws) {
2549 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2550 push_ws);
2551 }
2553 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2554 enum sched_type schedule, kmp_uint32 lb,
2555 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2556 int push_ws) {
2557 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2558 push_ws);
2559 }
2561 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2562 enum sched_type schedule, kmp_int64 lb,
2563 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2564 int push_ws) {
2565 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2566 push_ws);
2567 }
2569 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2570 enum sched_type schedule, kmp_uint64 lb,
2571 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2572 int push_ws) {
2573 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2574 push_ws);
2575 }
2577 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2578 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2581 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2582 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2585 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2586 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2589 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2590 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2593 #endif /* KMP_GOMP_COMPAT */
2595 /* ------------------------------------------------------------------------ */