2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
5 //===----------------------------------------------------------------------===//
7 // The LLVM Compiler Infrastructure
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
12 //===----------------------------------------------------------------------===//
14 /* Dynamic scheduling initialization and dispatch.
16 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
17 * it may change values between parallel regions. __kmp_max_nth
18 * is the largest value __kmp_nth may take, 1 is the smallest.
22 #include "kmp_error.h"
25 #include "kmp_stats.h"
27 #if KMP_USE_X87CONTROL
31 #include "kmp_dispatch.h"
32 #if KMP_USE_HIER_SCHED
33 #include "kmp_dispatch_hier.h"
37 #include "ompt-specific.h"
40 /* ------------------------------------------------------------------------ */
41 /* ------------------------------------------------------------------------ */
43 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
46 KMP_DEBUG_ASSERT(gtid_ref);
48 if (__kmp_env_consistency_check) {
49 th = __kmp_threads[*gtid_ref];
50 if (th->th.th_root->r.r_active &&
51 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
52 #if KMP_USE_DYNAMIC_LOCK
53 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
55 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
64 if (__kmp_env_consistency_check) {
65 th = __kmp_threads[*gtid_ref];
66 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
67 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
72 // Initialize a dispatch_private_info_template<T> buffer for a particular
73 // type of schedule,chunk. The loop description is found in lb (lower bound),
74 // ub (upper bound), and st (stride). nproc is the number of threads relevant
75 // to the scheduling (often the number of threads in a team, but not always if
76 // hierarchical scheduling is used). tid is the id of the thread calling
77 // the function within the group of nproc threads. It will have a value
78 // between 0 and nproc - 1. This is often just the thread id within a team, but
79 // is not necessarily the case when using hierarchical scheduling.
80 // loc is the source file location of the corresponding loop
81 // gtid is the global thread id
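// For example (illustrative): a loop "for (i = 0; i < 100; i += 2)" with
// schedule(dynamic, 4) typically reaches this routine with lb = 0, ub = 98
// (bounds are inclusive), st = 2 and chunk = 4, after the compiler has
// normalized the loop bounds.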
83 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
84 dispatch_private_info_template<T> *pr,
85 enum sched_type schedule, T lb, T ub,
86 typename traits_t<T>::signed_t st,
88 kmp_uint64 *cur_chunk,
90 typename traits_t<T>::signed_t chunk,
92 typedef typename traits_t<T>::unsigned_t UT;
93 typedef typename traits_t<T>::floating_t DBL;
101 typedef typename traits_t<T>::signed_t ST;
104 // create format specifiers before the debug output
105 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
106 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
107 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
108 traits_t<T>::spec, traits_t<T>::spec,
109 traits_t<ST>::spec, traits_t<ST>::spec,
110 traits_t<T>::spec, traits_t<T>::spec);
111 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
112 __kmp_str_free(&buff);
116 th = __kmp_threads[gtid];
117 team = th->th.th_team;
118 active = !team->t.t_serialized;
121 int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
122 __kmp_forkjoin_frames_mode == 3 &&
123 KMP_MASTER_GTID(gtid) &&
125 th->th.th_teams_microtask == NULL &&
127 team->t.t_active_level == 1;
129 #if (KMP_STATIC_STEAL_ENABLED)
130 if (SCHEDULE_HAS_NONMONOTONIC(schedule))
131 // AC: we now have only one implementation of stealing, so use it
132 schedule = kmp_sch_static_steal;
135 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
137 /* Pick up the nomerge/ordered bits from the scheduling type */
138 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
139 pr->flags.nomerge = TRUE;
141 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
143 pr->flags.nomerge = FALSE;
145 pr->type_size = traits_t<T>::type_size; // remember the size of variables
146 if (kmp_ord_lower & schedule) {
147 pr->flags.ordered = TRUE;
149 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
151 pr->flags.ordered = FALSE;
154 if (schedule == kmp_sch_static) {
155 schedule = __kmp_static;
157 if (schedule == kmp_sch_runtime) {
158 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
160 schedule = team->t.t_sched.r_sched_type;
161 // Detail the schedule if needed (global controls are differentiated
163 if (schedule == kmp_sch_guided_chunked) {
164 schedule = __kmp_guided;
165 } else if (schedule == kmp_sch_static) {
166 schedule = __kmp_static;
168 // Use the chunk size specified by OMP_SCHEDULE (or default if not
170 chunk = team->t.t_sched.chunk;
178 // create format specifiers before the debug output
179 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
180 "schedule:%%d chunk:%%%s\n",
182 KD_TRACE(10, (buff, gtid, schedule, chunk));
183 __kmp_str_free(&buff);
187 if (schedule == kmp_sch_guided_chunked) {
188 schedule = __kmp_guided;
191 chunk = KMP_DEFAULT_CHUNK;
195 if (schedule == kmp_sch_auto) {
196 // mapping and differentiation: in the __kmp_do_serial_initialize()
197 schedule = __kmp_auto;
201 // create format specifiers before the debug output
202 buff = __kmp_str_format(
203 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
204 "schedule:%%d chunk:%%%s\n",
206 KD_TRACE(10, (buff, gtid, schedule, chunk));
207 __kmp_str_free(&buff);
212 /* guided analytical is not safe for too many threads */
213 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
214 schedule = kmp_sch_guided_iterative_chunked;
215 KMP_WARNING(DispatchManyThreads);
218 if (schedule == kmp_sch_runtime_simd) {
219 // compiler provides simd_width in the chunk parameter
220 schedule = team->t.t_sched.r_sched_type;
221 // Detail the schedule if needed (global controls are differentiated
223 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
224 schedule == __kmp_static) {
225 schedule = kmp_sch_static_balanced_chunked;
227 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
228 schedule = kmp_sch_guided_simd;
230 chunk = team->t.t_sched.chunk * chunk;
239 // create format specifiers before the debug output
240 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
243 KD_TRACE(10, (buff, gtid, schedule, chunk));
244 __kmp_str_free(&buff);
248 #endif // OMP_45_ENABLED
249 pr->u.p.parm1 = chunk;
251 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
252 "unknown scheduling type");
256 if (__kmp_env_consistency_check) {
258 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
259 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
262 // compute trip count
263 if (st == 1) { // most common case
271 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
272 // where the division needs to be unsigned regardless of the result type
273 tc = (UT)(lb - ub) / (-st) + 1;
279 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
280 // where the division needs to be unsigned regardless of the result type
281 tc = (UT)(ub - lb) / st + 1;
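// Example: lb = 0, ub = 9, st = 3 gives tc = (9 - 0) / 3 + 1 = 4 iterations
// (0, 3, 6, 9); the mirrored form above handles negative strides the same way.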
293 pr->u.p.last_upper = ub + st;
294 #endif /* KMP_OS_WINDOWS */
296 /* NOTE: only the active parallel region(s) have active ordered sections */
299 if (pr->flags.ordered) {
300 pr->ordered_bumped = 0;
301 pr->u.p.ordered_lower = 1;
302 pr->u.p.ordered_upper = 0;
307 #if (KMP_STATIC_STEAL_ENABLED)
308 case kmp_sch_static_steal: {
312 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
315 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
316 if (nproc > 1 && ntc >= nproc) {
317 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
319 T small_chunk, extras;
321 small_chunk = ntc / nproc;
322 extras = ntc % nproc;
324 init = id * small_chunk + (id < extras ? id : extras);
325 pr->u.p.count = init;
326 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
329 // pr->pfields.parm3 = 0; // it's not used in static_steal
330 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
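// Example: tc = 100 with chunk = 10 gives ntc = 10 chunks; with nproc = 4 the
// initial ownership is chunk indices [0,3) for thread 0, [3,6) for thread 1,
// [6,8) for thread 2 and [8,10) for thread 3, and each thread's first steal
// victim (parm4) is its right-hand neighbour.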
332 if (traits_t<T>::type_size > 4) {
333 // AC: TODO: check if 16-byte CAS available and use it to
334 // improve performance (probably wait for explicit request
335 // before spending time on this).
336 // For now use dynamically allocated per-thread lock,
337 // free memory in __kmp_dispatch_next when status==0.
338 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
339 th->th.th_dispatch->th_steal_lock =
340 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
341 __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
345 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
346 "kmp_sch_static_balanced\n",
348 schedule = kmp_sch_static_balanced;
349 /* too few iterations: fall-through to kmp_sch_static_balanced */
351 /* FALL-THROUGH to static balanced */
354 case kmp_sch_static_balanced: {
359 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
369 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
371 pr->u.p.count = 1; /* means no more chunks to execute */
372 pr->u.p.parm1 = FALSE;
376 T small_chunk = tc / nproc;
377 T extras = tc % nproc;
378 init = id * small_chunk + (id < extras ? id : extras);
379 limit = init + small_chunk - (id < extras ? 0 : 1);
380 pr->u.p.parm1 = (id == nproc - 1);
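// Example: tc = 10 and nproc = 4 give small_chunk = 2, extras = 2, so threads
// 0..3 get iteration ranges [0,2], [3,5], [6,7] and [8,9] (3, 3, 2, 2
// iterations); parm1 (the lastprivate flag) is set only for thread nproc - 1.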
386 pr->u.p.parm1 = TRUE;
389 pr->u.p.count = 1; /* means no more chunks to execute */
390 pr->u.p.parm1 = FALSE;
395 // Calculate chunk for metadata report
396 if (itt_need_metadata_reporting)
398 *cur_chunk = limit - init + 1;
401 pr->u.p.lb = lb + init;
402 pr->u.p.ub = lb + limit;
404 // calculated upper bound, "ub" is user-defined upper bound
405 T ub_tmp = lb + limit * st;
406 pr->u.p.lb = lb + init * st;
407 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
410 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
412 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
415 if (pr->flags.ordered) {
416 pr->u.p.ordered_lower = init;
417 pr->u.p.ordered_upper = limit;
422 case kmp_sch_static_balanced_chunked: {
423 // similar to balanced, but chunk adjusted to multiple of simd width
425 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
426 " -> falling-through to static_greedy\n",
428 schedule = kmp_sch_static_greedy;
430 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
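// Example: tc = 100, nth = 4 and a simd width (chunk) of 8 give
// (100 + 3) / 4 = 25 iterations per thread, rounded up by the mask to
// parm1 = 32; the rounding trick relies on chunk being a power of two.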
435 case kmp_sch_guided_simd:
436 #endif // OMP_45_ENABLED
437 case kmp_sch_guided_iterative_chunked: {
440 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
445 if ((2L * chunk + 1) * nproc >= tc) {
446 /* chunk size too large, switch to dynamic */
447 schedule = kmp_sch_dynamic_chunked;
449 // when remaining iters become less than parm2 - switch to dynamic
450 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
451 *(double *)&pr->u.p.parm3 =
452 guided_flt_param / nproc; // may occupy parm3 and parm4
455 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
456 "kmp_sch_static_greedy\n",
458 schedule = kmp_sch_static_greedy;
459 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
462 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
468 case kmp_sch_guided_analytical_chunked: {
469 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
470 "kmp_sch_guided_analytical_chunked case\n",
474 if ((2L * chunk + 1) * nproc >= tc) {
475 /* chunk size too large, switch to dynamic */
476 schedule = kmp_sch_dynamic_chunked;
478 /* commonly used term: (2 nproc - 1)/(2 nproc) */
481 #if KMP_USE_X87CONTROL
482 /* Linux* OS already has 64-bit computation by default for long double,
483 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
484 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
485 instead of the default 53-bit. Even though long double doesn't work
486 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
487 expected to impact the correctness of the algorithm, but this has not
488 been mathematically proven. */
489 // save original FPCW and set precision to 64-bit, as
490 // Windows* OS on IA-32 architecture defaults to 53-bit
491 unsigned int oldFpcw = _control87(0, 0);
492 _control87(_PC_64, _MCW_PC); // 0,0x30000
494 /* value used for comparison in solver for cross-over point */
495 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
497 /* crossover point--chunk indexes equal to or greater than
498 this point switch to dynamic-style scheduling */
501 /* commonly used term: (2 nproc - 1)/(2 nproc) */
502 x = (long double)1.0 - (long double)0.5 / nproc;
505 { // test natural alignment
513 ptrdiff_t natural_alignment =
514 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
515 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
516 // long)natural_alignment );
518 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
522 /* save the term in thread private dispatch structure */
523 *(DBL *)&pr->u.p.parm3 = x;
525 /* solve for the crossover point to the nearest integer i for which C_i
531 /* estimate initial upper and lower bound */
533 /* doesn't matter what value right is as long as it is positive, but
534 it affects performance of the solver */
536 p = __kmp_pow<UT>(x, right);
541 } while (p > target && right < (1 << 27));
542 /* lower bound is previous (failed) estimate of upper bound */
548 /* bisection root-finding method */
549 while (left + 1 < right) {
550 mid = (left + right) / 2;
551 if (__kmp_pow<UT>(x, mid) > target) {
559 /* assert sanity of computed crossover point */
560 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
561 __kmp_pow<UT>(x, cross) <= target);
563 /* save the crossover point in thread private dispatch structure */
564 pr->u.p.parm2 = cross;
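// Worked example: nproc = 2, chunk = 1 and tc = 100 give x = 0.75 and
// target = 0.06; since 0.75^9 > 0.06 >= 0.75^10, the solver finds cross = 10,
// i.e. chunk indices >= 10 switch to dynamic-style scheduling.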
567 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
568 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
570 #define GUIDED_ANALYTICAL_WORKAROUND (x)
572 /* dynamic-style scheduling offset */
573 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
574 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
576 #if KMP_USE_X87CONTROL
578 _control87(oldFpcw, _MCW_PC);
582 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
583 "kmp_sch_static_greedy\n",
585 schedule = kmp_sch_static_greedy;
586 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
591 case kmp_sch_static_greedy:
594 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
596 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
598 case kmp_sch_static_chunked:
599 case kmp_sch_dynamic_chunked:
600 if (pr->u.p.parm1 <= 0) {
601 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
603 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
604 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
607 case kmp_sch_trapezoidal: {
608 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
610 T parm1, parm2, parm3, parm4;
612 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
617 /* F : size of the first cycle */
618 parm2 = (tc / (2 * nproc));
624 /* L : size of the last cycle. Make sure the last cycle is not larger
625 than the first cycle. */
628 } else if (parm1 > parm2) {
632 /* N : number of cycles */
633 parm3 = (parm2 + parm1);
634 parm3 = (2 * tc + parm3 - 1) / parm3;
640 /* sigma : decreasing increment of the trapezoid */
642 parm4 = (parm2 - parm1) / parm4;
644 // pointless check, because parm4 >= 0 always
645 // if ( parm4 < 0 ) {
649 pr->u.p.parm1 = parm1;
650 pr->u.p.parm2 = parm2;
651 pr->u.p.parm3 = parm3;
652 pr->u.p.parm4 = parm4;
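// Illustrative numbers: tc = 1000, nproc = 4 and chunk (parm1) = 1 give
// parm2 = 1000 / 8 = 125 (first chunk size) and parm3 = (2000 + 126 - 1) / 126
// = 16 chunks; the per-chunk decrement parm4 then comes out to 8 here, so
// successive chunks shrink 125, 117, 109, ... toward parm1.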
657 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
658 KMP_HNT(GetNewerLibrary), // Hint
659 __kmp_msg_null // Variadic argument list terminator
663 pr->schedule = schedule;
666 #if KMP_USE_HIER_SCHED
667 template <typename T>
668 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
669 typename traits_t<T>::signed_t st);
672 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
673 kmp_int32 ub, kmp_int32 st) {
674 __kmp_dispatch_init_hierarchy<kmp_int32>(
675 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
676 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
680 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
681 kmp_uint32 ub, kmp_int32 st) {
682 __kmp_dispatch_init_hierarchy<kmp_uint32>(
683 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
684 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
688 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
689 kmp_int64 ub, kmp_int64 st) {
690 __kmp_dispatch_init_hierarchy<kmp_int64>(
691 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
692 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
696 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
697 kmp_uint64 ub, kmp_int64 st) {
698 __kmp_dispatch_init_hierarchy<kmp_uint64>(
699 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
700 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
703 // free all the hierarchy scheduling memory associated with the team
704 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
705 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
706 for (int i = 0; i < num_disp_buff; ++i) {
707 // type does not matter here so use kmp_int32
709 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
710 &team->t.t_disp_buffer[i]);
712 sh->hier->deallocate();
713 __kmp_free(sh->hier);
719 // UT - unsigned flavor of T, ST - signed flavor of T,
720 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
721 template <typename T>
723 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
724 T ub, typename traits_t<T>::signed_t st,
725 typename traits_t<T>::signed_t chunk, int push_ws) {
726 typedef typename traits_t<T>::unsigned_t UT;
731 kmp_uint32 my_buffer_index;
732 dispatch_private_info_template<T> *pr;
733 dispatch_shared_info_template<T> volatile *sh;
735 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
736 sizeof(dispatch_private_info));
737 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
738 sizeof(dispatch_shared_info));
740 if (!TCR_4(__kmp_init_parallel))
741 __kmp_parallel_initialize();
743 #if INCLUDE_SSC_MARKS
744 SSC_MARK_DISPATCH_INIT();
747 typedef typename traits_t<T>::signed_t ST;
750 // create format specifiers before the debug output
751 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
752 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
753 traits_t<ST>::spec, traits_t<T>::spec,
754 traits_t<T>::spec, traits_t<ST>::spec);
755 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
756 __kmp_str_free(&buff);
760 th = __kmp_threads[gtid];
761 team = th->th.th_team;
762 active = !team->t.t_serialized;
763 th->th.th_ident = loc;
765 // Any half-decent optimizer will remove this test when the blocks are empty
766 // since the macros expand to nothing
767 // when statistics are disabled.
768 if (schedule == __kmp_static) {
769 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
771 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
774 #if KMP_USE_HIER_SCHED
775 // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
776 // environment variable. Hierarchical scheduling does not work with ordered,
777 // so if ordered is detected, revert to threaded scheduling.
779 enum sched_type my_sched = schedule;
780 my_buffer_index = th->th.th_dispatch->th_disp_index;
781 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
783 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
784 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
785 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
787 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
788 ordered = (kmp_ord_lower & my_sched);
789 if (pr->flags.use_hier) {
791 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
792 "Disabling hierarchical scheduling.\n",
794 pr->flags.use_hier = FALSE;
797 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
798 // Don't use hierarchical for ordered parallel loops and don't
799 // use the runtime hierarchy if one was specified in the program
800 if (!ordered && !pr->flags.use_hier)
801 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
803 #endif // KMP_USE_HIER_SCHED
806 kmp_uint64 cur_chunk = chunk;
807 int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
808 __kmp_forkjoin_frames_mode == 3 &&
809 KMP_MASTER_GTID(gtid) &&
811 th->th.th_teams_microtask == NULL &&
813 team->t.t_active_level == 1;
816 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
817 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
819 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
820 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
822 my_buffer_index = th->th.th_dispatch->th_disp_index++;
824 /* What happens when number of threads changes, need to resize buffer? */
825 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
827 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
828 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
829 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
830 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
834 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
838 chunk, (T)th->th.th_team_nproc,
839 (T)th->th.th_info.ds.ds_tid);
841 if (pr->flags.ordered == 0) {
842 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
843 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
845 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
846 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
851 /* The name of this buffer should be my_buffer_index when it's free to use
854 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
855 "sh->buffer_index:%d\n",
856 gtid, my_buffer_index, sh->buffer_index));
857 __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
858 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
859 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
860 // my_buffer_index are *always* 32-bit integers.
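// For example, a thread entering its Nth dynamically scheduled loop uses slot
// N % __kmp_dispatch_num_buffers; the wait above ensures the shared buffer
// from the loop that previously occupied this slot has been released (i.e.
// sh->buffer_index has caught up to my_buffer_index) before it is reused.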
861 KMP_MB(); /* is this necessary? */
862 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
863 "sh->buffer_index:%d\n",
864 gtid, my_buffer_index, sh->buffer_index));
866 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
867 th->th.th_dispatch->th_dispatch_sh_current =
868 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
870 if (pr->flags.ordered) {
871 __kmp_itt_ordered_init(gtid);
873 // Report loop metadata
874 if (itt_need_metadata_reporting) {
875 // Only report metadata by master of active team at level 1
876 kmp_uint64 schedtype = 0;
878 case kmp_sch_static_chunked:
879 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
881 case kmp_sch_static_greedy:
882 cur_chunk = pr->u.p.parm1;
884 case kmp_sch_dynamic_chunked:
887 case kmp_sch_guided_iterative_chunked:
888 case kmp_sch_guided_analytical_chunked:
890 case kmp_sch_guided_simd:
895 // Should we put this case under "static"?
896 // case kmp_sch_static_steal:
900 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
902 #if KMP_USE_HIER_SCHED
903 if (pr->flags.use_hier) {
905 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
907 #endif // KMP_USER_HIER_SCHED
908 #endif /* USE_ITT_BUILD */
914 // create format specifiers before the debug output
915 buff = __kmp_str_format(
916 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
918 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
919 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
920 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
921 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
922 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
923 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
924 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
925 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
926 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
927 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
928 __kmp_str_free(&buff);
931 #if (KMP_STATIC_STEAL_ENABLED)
932 // It cannot be guaranteed that after execution of a loop with some other
933 // schedule kind all the parm3 variables will contain the same value. Even if
934 // they did, a bad case could still arise, e.g. alternating between 0 and 1
935 // rather than using a program-lifetime increment. So a dedicated variable is
936 // required; 'static_steal_counter' is used for this.
937 if (schedule == kmp_sch_static_steal) {
938 // Other threads will inspect this variable when searching for a victim.
939 // This is a flag showing that other threads may steal from this thread
941 volatile T *p = &pr->u.p.static_steal_counter;
944 #endif // ( KMP_STATIC_STEAL_ENABLED )
946 #if OMPT_SUPPORT && OMPT_OPTIONAL
947 if (ompt_enabled.ompt_callback_work) {
948 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
949 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
950 ompt_callbacks.ompt_callback(ompt_callback_work)(
951 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
952 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
955 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
958 /* For ordered loops, either __kmp_dispatch_finish() should be called after
959 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
960 * every chunk of iterations. If the ordered section(s) were not executed
961 * for this iteration (or every iteration in this chunk), we need to set the
962 * ordered iteration counters so that the next thread can proceed. */
963 template <typename UT>
964 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
965 typedef typename traits_t<UT>::signed_t ST;
966 kmp_info_t *th = __kmp_threads[gtid];
968 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
969 if (!th->th.th_team->t.t_serialized) {
971 dispatch_private_info_template<UT> *pr =
972 reinterpret_cast<dispatch_private_info_template<UT> *>(
973 th->th.th_dispatch->th_dispatch_pr_current);
974 dispatch_shared_info_template<UT> volatile *sh =
975 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
976 th->th.th_dispatch->th_dispatch_sh_current);
977 KMP_DEBUG_ASSERT(pr);
978 KMP_DEBUG_ASSERT(sh);
979 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
980 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
982 if (pr->ordered_bumped) {
985 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
987 pr->ordered_bumped = 0;
989 UT lower = pr->u.p.ordered_lower;
994 // create format specifiers before the debug output
995 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
996 "ordered_iteration:%%%s lower:%%%s\n",
997 traits_t<UT>::spec, traits_t<UT>::spec);
998 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
999 __kmp_str_free(&buff);
1003 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1004 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1005 KMP_MB(); /* is this necessary? */
1009 // create format specifiers before the debug output
1010 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1011 "ordered_iteration:%%%s lower:%%%s\n",
1012 traits_t<UT>::spec, traits_t<UT>::spec);
1013 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1014 __kmp_str_free(&buff);
1018 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1021 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1024 #ifdef KMP_GOMP_COMPAT
1026 template <typename UT>
1027 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1028 typedef typename traits_t<UT>::signed_t ST;
1029 kmp_info_t *th = __kmp_threads[gtid];
1031 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1032 if (!th->th.th_team->t.t_serialized) {
1034 dispatch_private_info_template<UT> *pr =
1035 reinterpret_cast<dispatch_private_info_template<UT> *>(
1036 th->th.th_dispatch->th_dispatch_pr_current);
1037 dispatch_shared_info_template<UT> volatile *sh =
1038 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1039 th->th.th_dispatch->th_dispatch_sh_current);
1040 KMP_DEBUG_ASSERT(pr);
1041 KMP_DEBUG_ASSERT(sh);
1042 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1043 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1045 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1046 UT lower = pr->u.p.ordered_lower;
1047 UT upper = pr->u.p.ordered_upper;
1048 UT inc = upper - lower + 1;
1050 if (pr->ordered_bumped == inc) {
1053 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1055 pr->ordered_bumped = 0;
1057 inc -= pr->ordered_bumped;
1062 // create format specifiers before the debug output
1063 buff = __kmp_str_format(
1064 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1065 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1066 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1067 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1068 __kmp_str_free(&buff);
1072 __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1073 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1075 KMP_MB(); /* is this necessary? */
1076 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1077 "ordered_bumped to zero\n",
1079 pr->ordered_bumped = 0;
1080 //!!!!! TODO check if the inc should be unsigned, or signed???
1084 // create format specifiers before the debug output
1085 buff = __kmp_str_format(
1086 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1087 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1088 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1089 traits_t<UT>::spec);
1091 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1092 __kmp_str_free(&buff);
1096 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1100 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1103 #endif /* KMP_GOMP_COMPAT */
1105 template <typename T>
1106 int __kmp_dispatch_next_algorithm(int gtid,
1107 dispatch_private_info_template<T> *pr,
1108 dispatch_shared_info_template<T> volatile *sh,
1109 kmp_int32 *p_last, T *p_lb, T *p_ub,
1110 typename traits_t<T>::signed_t *p_st, T nproc,
1112 typedef typename traits_t<T>::unsigned_t UT;
1113 typedef typename traits_t<T>::signed_t ST;
1114 typedef typename traits_t<T>::floating_t DBL;
1119 UT limit, trip, init;
1120 kmp_info_t *th = __kmp_threads[gtid];
1121 kmp_team_t *team = th->th.th_team;
1123 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1124 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1125 KMP_DEBUG_ASSERT(pr);
1126 KMP_DEBUG_ASSERT(sh);
1127 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1131 // create format specifiers before the debug output
1133 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1134 "sh:%%p nproc:%%%s tid:%%%s\n",
1135 traits_t<T>::spec, traits_t<T>::spec);
1136 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1137 __kmp_str_free(&buff);
1142 if (pr->u.p.tc == 0) {
1144 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1150 switch (pr->schedule) {
1151 #if (KMP_STATIC_STEAL_ENABLED)
1152 case kmp_sch_static_steal: {
1153 T chunk = pr->u.p.parm1;
1156 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1159 trip = pr->u.p.tc - 1;
1161 if (traits_t<T>::type_size > 4) {
1162 // use lock for 8-byte and CAS for 4-byte induction
1163 // variable. TODO (optional): check and use 16-byte CAS
1164 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1165 KMP_DEBUG_ASSERT(lck != NULL);
1166 if (pr->u.p.count < (UT)pr->u.p.ub) {
1167 __kmp_acquire_lock(lck, gtid);
1168 // try to get own chunk of iterations
1169 init = (pr->u.p.count)++;
1170 status = (init < (UT)pr->u.p.ub);
1171 __kmp_release_lock(lck, gtid);
1173 status = 0; // no own chunks
1175 if (!status) { // try to steal
1176 kmp_info_t **other_threads = team->t.t_threads;
1177 int while_limit = nproc; // nproc attempts to find a victim
1178 int while_index = 0;
1179 // TODO: the algorithm for searching for a victim
1180 // should be cleaned up and measured
1181 while ((!status) && (while_limit != ++while_index)) {
1183 T victimIdx = pr->u.p.parm4;
1184 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1185 dispatch_private_info_template<T> *victim =
1186 reinterpret_cast<dispatch_private_info_template<T> *>(
1187 other_threads[victimIdx]
1188 ->th.th_dispatch->th_dispatch_pr_current);
1189 while ((victim == NULL || victim == pr ||
1190 (*(volatile T *)&victim->u.p.static_steal_counter !=
1191 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1192 oldVictimIdx != victimIdx) {
1193 victimIdx = (victimIdx + 1) % nproc;
1194 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1195 other_threads[victimIdx]
1196 ->th.th_dispatch->th_dispatch_pr_current);
1198 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1199 *(volatile T *)&pr->u.p.static_steal_counter)) {
1200 continue; // try once more (nproc attempts in total)
1201 // no victim is ready yet to participate in stealing
1202 // because all victims are still in kmp_init_dispatch
1204 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1205 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1206 continue; // not enough chunks to steal, goto next victim
1209 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1210 KMP_ASSERT(lck != NULL);
1211 __kmp_acquire_lock(lck, gtid);
1212 limit = victim->u.p.ub; // keep initial ub
1213 if (victim->u.p.count >= limit ||
1214 (remaining = limit - victim->u.p.count) < 2) {
1215 __kmp_release_lock(lck, gtid);
1216 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1217 continue; // not enough chunks to steal
1219 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1221 if (remaining > 3) {
1222 // steal 1/4 of remaining
1223 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1224 init = (victim->u.p.ub -= (remaining >> 2));
1226 // steal 1 chunk of 2 or 3 remaining
1227 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1228 init = (victim->u.p.ub -= 1);
1230 __kmp_release_lock(lck, gtid);
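// Example: if the victim still has remaining = 12 unclaimed chunks, the thief
// lowers the victim's ub by 3 (remaining >> 2) and takes those 3 chunk indices
// for itself, executing the first one immediately (count = init + 1 below).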
1232 KMP_DEBUG_ASSERT(init + 1 <= limit);
1233 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1236 // now update own count and ub with the stolen range (the init chunk is taken immediately)
1237 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1238 pr->u.p.count = init + 1;
1240 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1241 } // while (search for victim)
1242 } // if (try to find victim and steal)
1244 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1252 // All operations on 'count' or 'ub' must be combined atomically
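// Packing count and ub into a single 64-bit value means claiming the next
// chunk (count++) and a thief shrinking ub can each be done with one CAS, so
// the two fields are never observed out of sync.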
1255 union_i4 vold, vnew;
1256 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1259 while (!KMP_COMPARE_AND_STORE_ACQ64(
1260 (volatile kmp_int64 *)&pr->u.p.count,
1261 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1262 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1264 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1269 init = vnew.p.count;
1270 status = (init < (UT)vnew.p.ub);
1274 kmp_info_t **other_threads = team->t.t_threads;
1275 int while_limit = nproc; // nproc attempts to find a victim
1276 int while_index = 0;
1278 // TODO: the algorithm for searching for a victim
1279 // should be cleaned up and measured
1280 while ((!status) && (while_limit != ++while_index)) {
1281 union_i4 vold, vnew;
1282 kmp_int32 remaining;
1283 T victimIdx = pr->u.p.parm4;
1284 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1285 dispatch_private_info_template<T> *victim =
1286 reinterpret_cast<dispatch_private_info_template<T> *>(
1287 other_threads[victimIdx]
1288 ->th.th_dispatch->th_dispatch_pr_current);
1289 while ((victim == NULL || victim == pr ||
1290 (*(volatile T *)&victim->u.p.static_steal_counter !=
1291 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1292 oldVictimIdx != victimIdx) {
1293 victimIdx = (victimIdx + 1) % nproc;
1294 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1295 other_threads[victimIdx]
1296 ->th.th_dispatch->th_dispatch_pr_current);
1298 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1299 *(volatile T *)&pr->u.p.static_steal_counter)) {
1300 continue; // try once more (nproc attempts in total)
1301 // no victim is ready yet to participate in stealing
1302 // because all victims are still in kmp_init_dispatch
1304 pr->u.p.parm4 = victimIdx; // new victim found
1305 while (1) { // CAS loop if victim has enough chunks to steal
1306 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1309 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1310 if (vnew.p.count >= (UT)vnew.p.ub ||
1311 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1312 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1313 break; // not enough chunks to steal, goto next victim
1315 if (remaining > 3) {
1316 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1318 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1320 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1321 // TODO: Should this be acquire or release?
1322 if (KMP_COMPARE_AND_STORE_ACQ64(
1323 (volatile kmp_int64 *)&victim->u.p.count,
1324 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1325 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1326 // stealing succeeded
1327 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1328 vold.p.ub - vnew.p.ub);
1331 // now update own count and ub
1333 vold.p.count = init + 1;
1335 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1337 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1340 } // if (check CAS result)
1341 KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1342 } // while (try to steal from particular victim)
1343 } // while (search for victim)
1344 } // if (try to find victim and steal)
1345 } // if (4-byte induction variable)
1352 start = pr->u.p.parm2;
1354 limit = chunk + init - 1;
1356 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1358 KMP_DEBUG_ASSERT(init <= trip);
1359 if ((last = (limit >= trip)) != 0)
1365 *p_lb = start + init;
1366 *p_ub = start + limit;
1368 *p_lb = start + init * incr;
1369 *p_ub = start + limit * incr;
1372 if (pr->flags.ordered) {
1373 pr->u.p.ordered_lower = init;
1374 pr->u.p.ordered_upper = limit;
1379 #endif // ( KMP_STATIC_STEAL_ENABLED )
1380 case kmp_sch_static_balanced: {
1383 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1385 /* check if thread has any iteration to do */
1386 if ((status = !pr->u.p.count) != 0) {
1390 last = pr->u.p.parm1;
1393 } else { /* no iterations to do */
1394 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1398 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1400 case kmp_sch_static_chunked: {
1403 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1404 "kmp_sch_static_[affinity|chunked] case\n",
1406 parm1 = pr->u.p.parm1;
1408 trip = pr->u.p.tc - 1;
1409 init = parm1 * (pr->u.p.count + tid);
1411 if ((status = (init <= trip)) != 0) {
1414 limit = parm1 + init - 1;
1416 if ((last = (limit >= trip)) != 0)
1422 pr->u.p.count += nproc;
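// Example: with chunk (parm1) = 5 and nproc = 4, the thread with tid = 2 gets
// iterations 10..14 on its first call and 30..34 on its second, i.e. chunks
// are assigned round-robin with a stride of nproc chunks per thread.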
1425 *p_lb = start + init;
1426 *p_ub = start + limit;
1428 *p_lb = start + init * incr;
1429 *p_ub = start + limit * incr;
1432 if (pr->flags.ordered) {
1433 pr->u.p.ordered_lower = init;
1434 pr->u.p.ordered_upper = limit;
1440 case kmp_sch_dynamic_chunked: {
1441 T chunk = pr->u.p.parm1;
1445 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1448 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1449 trip = pr->u.p.tc - 1;
1451 if ((status = (init <= trip)) == 0) {
1458 limit = chunk + init - 1;
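// Example: with chunk = 4, a thread that bumps the shared iteration counter
// from 3 to 4 gets init = 12 and limit = 15, i.e. canonical iterations 12..15
// (clamped to trip), mapped back to user space below via start + init * incr.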
1461 if ((last = (limit >= trip)) != 0)
1468 *p_lb = start + init;
1469 *p_ub = start + limit;
1471 *p_lb = start + init * incr;
1472 *p_ub = start + limit * incr;
1475 if (pr->flags.ordered) {
1476 pr->u.p.ordered_lower = init;
1477 pr->u.p.ordered_upper = limit;
1483 case kmp_sch_guided_iterative_chunked: {
1484 T chunkspec = pr->u.p.parm1;
1485 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1489 // Start atomic part of calculations
1491 ST remaining; // signed, because can be < 0
1492 init = sh->u.s.iteration; // shared value
1493 remaining = trip - init;
1494 if (remaining <= 0) { // AC: need to compare with 0 first
1495 // nothing to do, don't try atomic op
1500 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1501 // use dynamic-style schedule
1502 // atomically increment iterations, get old value
1503 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1505 remaining = trip - init;
1506 if (remaining <= 0) {
1507 status = 0; // all iterations got by other threads
1509 // got some iterations to work on
1511 if ((T)remaining > chunkspec) {
1512 limit = init + chunkspec - 1;
1514 last = 1; // the last chunk
1515 limit = init + remaining - 1;
1521 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
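// Example with illustrative values: if parm3 holds 0.125 (guided_flt_param
// over nproc = 4), remaining = 800 makes this thread try to claim roughly 100
// iterations in one CAS; if another thread changed sh->u.s.iteration in the
// meantime, the CAS fails and the surrounding loop retries with the updated
// remaining count.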
1522 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1523 (ST)init, (ST)limit)) {
1524 // CAS was successful, chunk obtained
1535 *p_lb = start + init * incr;
1536 *p_ub = start + limit * incr;
1537 if (pr->flags.ordered) {
1538 pr->u.p.ordered_lower = init;
1539 pr->u.p.ordered_upper = limit;
1551 case kmp_sch_guided_simd: {
1552 // same as iterative but curr-chunk adjusted to be multiple of given
1554 T chunk = pr->u.p.parm1;
1556 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1559 // Start atomic part of calculations
1561 ST remaining; // signed, because can be < 0
1562 init = sh->u.s.iteration; // shared value
1563 remaining = trip - init;
1564 if (remaining <= 0) { // AC: need to compare with 0 first
1565 status = 0; // nothing to do, don't try atomic op
1568 KMP_DEBUG_ASSERT(init % chunk == 0);
1569 // compare with K*nproc*(chunk+1), K=2 by default
1570 if ((T)remaining < pr->u.p.parm2) {
1571 // use dynamic-style schedule
1572 // atomically increment iterations, get old value
1573 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1575 remaining = trip - init;
1576 if (remaining <= 0) {
1577 status = 0; // all iterations got by other threads
1579 // got some iterations to work on
1581 if ((T)remaining > chunk) {
1582 limit = init + chunk - 1;
1584 last = 1; // the last chunk
1585 limit = init + remaining - 1;
1590 // divide by K*nproc
1591 UT span = remaining * (*(double *)&pr->u.p.parm3);
1592 UT rem = span % chunk;
1593 if (rem) // adjust so that span%chunk == 0
1594 span += chunk - rem;
1595 limit = init + span;
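// Example with illustrative values: remaining = 1000 and parm3 = 0.125 give
// span = 125; with chunk = 8 the remainder 5 bumps span to 128, keeping the
// claimed range a multiple of the simd-derived chunk size.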
1596 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1597 (ST)init, (ST)limit)) {
1598 // CAS was successful, chunk obtained
1609 *p_lb = start + init * incr;
1610 *p_ub = start + limit * incr;
1611 if (pr->flags.ordered) {
1612 pr->u.p.ordered_lower = init;
1613 pr->u.p.ordered_upper = limit;
1623 #endif // OMP_45_ENABLED
1625 case kmp_sch_guided_analytical_chunked: {
1626 T chunkspec = pr->u.p.parm1;
1628 #if KMP_USE_X87CONTROL
1629 /* for storing original FPCW value for Windows* OS on
1630 IA-32 architecture 8-byte version */
1631 unsigned int oldFpcw;
1632 unsigned int fpcwSet = 0;
1634 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1635 "kmp_sch_guided_analytical_chunked case\n",
1640 KMP_DEBUG_ASSERT(nproc > 1);
1641 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1643 while (1) { /* this while loop is a safeguard against unexpected zero
1645 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1646 if (chunkIdx >= (UT)pr->u.p.parm2) {
1648 /* use dynamic-style scheduling */
1649 init = chunkIdx * chunkspec + pr->u.p.count;
1650 /* need to verify init > 0 in case of overflow in the above
1652 if ((status = (init > 0 && init <= trip)) != 0) {
1653 limit = init + chunkspec - 1;
1655 if ((last = (limit >= trip)) != 0)
1660 /* use exponential-style scheduling */
1661 /* The following check works around the lack of long double precision on
1663 It also covers the possible effect that init != 0 for chunkIdx == 0.
1665 #if KMP_USE_X87CONTROL
1666 /* If we haven't already done so, save original
1667 FPCW and set precision to 64-bit, as Windows* OS
1668 on IA-32 architecture defaults to 53-bit */
1670 oldFpcw = _control87(0, 0);
1671 _control87(_PC_64, _MCW_PC);
1676 init = __kmp_dispatch_guided_remaining<T>(
1677 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1678 KMP_DEBUG_ASSERT(init);
1682 limit = trip - __kmp_dispatch_guided_remaining<T>(
1683 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1684 KMP_ASSERT(init <= limit);
1686 KMP_DEBUG_ASSERT(limit <= trip);
1693 #if KMP_USE_X87CONTROL
1694 /* restore FPCW if necessary
1695 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1697 if (fpcwSet && (oldFpcw & fpcwSet))
1698 _control87(oldFpcw, _MCW_PC);
1705 *p_lb = start + init * incr;
1706 *p_ub = start + limit * incr;
1707 if (pr->flags.ordered) {
1708 pr->u.p.ordered_lower = init;
1709 pr->u.p.ordered_upper = limit;
1720 case kmp_sch_trapezoidal: {
1722 T parm2 = pr->u.p.parm2;
1723 T parm3 = pr->u.p.parm3;
1724 T parm4 = pr->u.p.parm4;
1726 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1729 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1731 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1732 trip = pr->u.p.tc - 1;
1734 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1741 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1744 if ((last = (limit >= trip)) != 0)
1751 *p_lb = start + init;
1752 *p_ub = start + limit;
1754 *p_lb = start + init * incr;
1755 *p_ub = start + limit * incr;
1758 if (pr->flags.ordered) {
1759 pr->u.p.ordered_lower = init;
1760 pr->u.p.ordered_upper = limit;
1766 status = 0; // to avoid complaints on uninitialized variable use
1767 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1768 KMP_HNT(GetNewerLibrary), // Hint
1769 __kmp_msg_null // Variadic argument list terminator
1776 if (pr->flags.ordered) {
1778 // create format specifiers before the debug output
1779 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1780 "ordered_lower:%%%s ordered_upper:%%%s\n",
1781 traits_t<UT>::spec, traits_t<UT>::spec);
1782 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1783 __kmp_str_free(&buff);
1787 // create format specifiers before the debug output
1788 buff = __kmp_str_format(
1789 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1790 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1791 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1792 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1793 __kmp_str_free(&buff);
1799 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1800 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1802 #if OMPT_SUPPORT && OMPT_OPTIONAL
1803 #define OMPT_LOOP_END \
1804 if (status == 0) { \
1805 if (ompt_enabled.ompt_callback_work) { \
1806 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1807 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1808 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1809 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1810 &(task_info->task_data), 0, codeptr); \
1813 // TODO: implement count
1815 #define OMPT_LOOP_END // no-op
1818 #if KMP_STATS_ENABLED
1819 #define KMP_STATS_LOOP_END \
1821 kmp_int64 u, l, t, i; \
1822 l = (kmp_int64)(*p_lb); \
1823 u = (kmp_int64)(*p_ub); \
1824 i = (kmp_int64)(pr->u.p.st); \
1825 if (status == 0) { \
1827 KMP_POP_PARTITIONED_TIMER(); \
1828 } else if (i == 1) { \
1833 } else if (i < 0) { \
1835 t = (l - u) / (-i) + 1; \
1840 t = (u - l) / i + 1; \
1844 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1847 #define KMP_STATS_LOOP_END /* Nothing */
1850 template <typename T>
1851 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1853 typename traits_t<T>::signed_t *p_st
1854 #if OMPT_SUPPORT && OMPT_OPTIONAL
1860 typedef typename traits_t<T>::unsigned_t UT;
1861 typedef typename traits_t<T>::signed_t ST;
1862 // This is potentially slightly misleading: schedule(runtime) will appear here
1863 // even if the actual runtime schedule is static. (Which points out a
1864 // disadvantage of schedule(runtime): even when static scheduling is used, it
1865 // costs more than a compile-time choice to use static scheduling would.)
1866 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1869 dispatch_private_info_template<T> *pr;
1870 kmp_info_t *th = __kmp_threads[gtid];
1871 kmp_team_t *team = th->th.th_team;
1873 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1876 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1877 gtid, p_lb, p_ub, p_st, p_last));
1879 if (team->t.t_serialized) {
1880 /* NOTE: serialize this dispatch because we are not at the active level */
1881 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1882 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1883 KMP_DEBUG_ASSERT(pr);
1885 if ((status = (pr->u.p.tc != 0)) == 0) {
1888 // if ( p_last != NULL )
1892 if (__kmp_env_consistency_check) {
1893 if (pr->pushed_ws != ct_none) {
1894 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1897 } else if (pr->flags.nomerge) {
1900 UT limit, trip, init;
1902 T chunk = pr->u.p.parm1;
1904 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1907 init = chunk * pr->u.p.count++;
1908 trip = pr->u.p.tc - 1;
1910 if ((status = (init <= trip)) == 0) {
1913 // if ( p_last != NULL )
1917 if (__kmp_env_consistency_check) {
1918 if (pr->pushed_ws != ct_none) {
1919 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1924 limit = chunk + init - 1;
1927 if ((last = (limit >= trip)) != 0) {
1930 pr->u.p.last_upper = pr->u.p.ub;
1931 #endif /* KMP_OS_WINDOWS */
1938 *p_lb = start + init;
1939 *p_ub = start + limit;
1941 *p_lb = start + init * incr;
1942 *p_ub = start + limit * incr;
1945 if (pr->flags.ordered) {
1946 pr->u.p.ordered_lower = init;
1947 pr->u.p.ordered_upper = limit;
1951 // create format specifiers before the debug output
1952 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1953 "ordered_lower:%%%s ordered_upper:%%%s\n",
1954 traits_t<UT>::spec, traits_t<UT>::spec);
1955 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1956 pr->u.p.ordered_upper));
1957 __kmp_str_free(&buff);
1967 pr->u.p.last_upper = *p_ub;
1968 #endif /* KMP_OS_WINDOWS */
1977 // create format specifiers before the debug output
1978 buff = __kmp_str_format(
1979 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1980 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1981 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1982 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1983 __kmp_str_free(&buff);
1986 #if INCLUDE_SSC_MARKS
1987 SSC_MARK_DISPATCH_NEXT();
1994 dispatch_shared_info_template<T> volatile *sh;
1996 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1997 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1999 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2000 th->th.th_dispatch->th_dispatch_pr_current);
2001 KMP_DEBUG_ASSERT(pr);
2002 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2003 th->th.th_dispatch->th_dispatch_sh_current);
2004 KMP_DEBUG_ASSERT(sh);
2006 #if KMP_USE_HIER_SCHED
2007 if (pr->flags.use_hier)
2008 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2010 #endif // KMP_USE_HIER_SCHED
2011 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2012 p_st, th->th.th_team_nproc,
2013 th->th.th_info.ds.ds_tid);
2014 // status == 0: no more iterations to execute
2018 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2022 // create format specifiers before the debug output
2023 buff = __kmp_str_format(
2024 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2025 traits_t<UT>::spec);
2026 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2027 __kmp_str_free(&buff);
2031 #if KMP_USE_HIER_SCHED
2032 pr->flags.use_hier = FALSE;
2034 if ((ST)num_done == th->th.th_team_nproc - 1) {
2035 #if (KMP_STATIC_STEAL_ENABLED)
2036 if (pr->schedule == kmp_sch_static_steal &&
2037 traits_t<T>::type_size > 4) {
2039 kmp_info_t **other_threads = team->t.t_threads;
2040 // loop complete, safe to destroy locks used for stealing
2041 for (i = 0; i < th->th.th_team_nproc; ++i) {
2042 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2043 KMP_ASSERT(lck != NULL);
2044 __kmp_destroy_lock(lck);
2046 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2050 /* NOTE: release this buffer to be reused */
2052 KMP_MB(); /* Flush all pending memory write invalidates. */
2054 sh->u.s.num_done = 0;
2055 sh->u.s.iteration = 0;
2057 /* TODO replace with general release procedure? */
2058 if (pr->flags.ordered) {
2059 sh->u.s.ordered_iteration = 0;
2062 KMP_MB(); /* Flush all pending memory write invalidates. */
2064 sh->buffer_index += __kmp_dispatch_num_buffers;
2065 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2066 gtid, sh->buffer_index));
2068 KMP_MB(); /* Flush all pending memory write invalidates. */
2071 if (__kmp_env_consistency_check) {
2072 if (pr->pushed_ws != ct_none) {
2073 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2077 th->th.th_dispatch->th_deo_fcn = NULL;
2078 th->th.th_dispatch->th_dxo_fcn = NULL;
2079 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2080 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2081 } // if (status == 0)
2084 pr->u.p.last_upper = pr->u.p.ub;
2086 #endif /* KMP_OS_WINDOWS */
2087 if (p_last != NULL && status != 0)
2094 // create format specifiers before the debug output
2095 buff = __kmp_str_format(
2096 "__kmp_dispatch_next: T#%%d normal case: "
2097 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2098 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2099 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2100 (p_last ? *p_last : 0), status));
2101 __kmp_str_free(&buff);
2104 #if INCLUDE_SSC_MARKS
2105 SSC_MARK_DISPATCH_NEXT();
2112 template <typename T>
2113 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2114 kmp_int32 *plastiter, T *plower, T *pupper,
2115 typename traits_t<T>::signed_t incr) {
2116 typedef typename traits_t<T>::unsigned_t UT;
2123 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2124 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2126 typedef typename traits_t<T>::signed_t ST;
2129 // create format specifiers before the debug output
2130 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2131 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2132 traits_t<T>::spec, traits_t<T>::spec,
2133 traits_t<ST>::spec, traits_t<T>::spec);
2134 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2135 __kmp_str_free(&buff);
2139 if (__kmp_env_consistency_check) {
2141 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2144 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2145 // The loop is illegal.
2146 // Some zero-trip loops maintained by compiler, e.g.:
2147 // for(i=10;i<0;++i) // lower >= upper - run-time check
2148 // for(i=0;i>10;--i) // lower <= upper - run-time check
2149 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2150 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2151 // Compiler does not check the following illegal loops:
2152 // for(i=0;i<10;i+=incr) // where incr<0
2153 // for(i=10;i>0;i-=incr) // where incr<0
2154 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2157 th = __kmp_threads[gtid];
2158 team = th->th.th_team;
2160 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2161 nteams = th->th.th_teams_size.nteams;
2163 team_id = team->t.t_master_tid;
2164 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2166 // compute global trip count
2168 trip_count = *pupper - *plower + 1;
2169 } else if (incr == -1) {
2170 trip_count = *plower - *pupper + 1;
2171 } else if (incr > 0) {
2172 // upper-lower can exceed the limit of signed type
2173 trip_count = (UT)(*pupper - *plower) / incr + 1;
2175 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2178 if (trip_count <= nteams) {
2179 KMP_DEBUG_ASSERT(
2180 __kmp_static == kmp_sch_static_greedy ||
2181 __kmp_static ==
2182 kmp_sch_static_balanced); // Unknown static scheduling type.
2183 // only some teams get single iteration, others get nothing
2184 if (team_id < trip_count) {
2185 *pupper = *plower = *plower + team_id * incr;
2186 } else {
2187 *plower = *pupper + incr; // zero-trip loop
2188 }
2189 if (plastiter != NULL)
2190 *plastiter = (team_id == trip_count - 1);
2191 } else {
2192 if (__kmp_static == kmp_sch_static_balanced) {
2193 UT chunk = trip_count / nteams;
2194 UT extras = trip_count % nteams;
2195 *plower +=
2196 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2197 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2198 if (plastiter != NULL)
2199 *plastiter = (team_id == nteams - 1);
2200 } else {
2201 T chunk_inc_count =
2202 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2203 T upper = *pupper;
2204 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2205 // Unknown static scheduling type.
2206 *plower += team_id * chunk_inc_count;
2207 *pupper = *plower + chunk_inc_count - incr;
2208 // Check/correct bounds if needed
2209 if (incr > 0) {
2210 if (*pupper < *plower)
2211 *pupper = traits_t<T>::max_value;
2212 if (plastiter != NULL)
2213 *plastiter = *plower <= upper && *pupper > upper - incr;
2214 if (*pupper > upper)
2215 *pupper = upper; // tracker C73258
2216 } else {
2217 if (*pupper > *plower)
2218 *pupper = traits_t<T>::min_value;
2219 if (plastiter != NULL)
2220 *plastiter = *plower >= upper && *pupper < upper - incr;
2221 if (*pupper < upper)
2222 *pupper = upper; // tracker C73258
2228 //-----------------------------------------------------------------------------
2229 // Dispatch routines
2230 // Transfer call to template< type T >
2231 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2232 // T lb, T ub, ST st, ST chunk )
2236 @ingroup WORK_SHARING
2238 @param loc Source location
2239 @param gtid Global thread id
2240 @param schedule Schedule type
2241 @param lb Lower bound
2242 @param ub Upper bound
2243 @param st Step (or increment if you prefer)
2244 @param chunk The chunk size to block with
2246 This function prepares the runtime to start a dynamically scheduled for loop,
2247 saving the loop arguments.
2248 These functions are all identical apart from the types of the arguments.
2251 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2252 enum sched_type schedule, kmp_int32 lb,
2253 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2254 KMP_DEBUG_ASSERT(__kmp_init_serial);
2255 #if OMPT_SUPPORT && OMPT_OPTIONAL
2256 OMPT_STORE_RETURN_ADDRESS(gtid);
2258 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
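// A minimal sketch, assuming a loop such as
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = 0; i < n; ++i) ...
// One plausible initialization call a compiler could emit is shown below; the
// inclusive upper bound and the kmp_sch_dynamic_chunked enumerator follow this
// file's conventions, but the exact lowering is compiler-specific.
//
//   __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked,
//                          /*lb=*/0, /*ub=*/n - 1, /*st=*/1, /*chunk=*/4);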
2261 See @ref __kmpc_dispatch_init_4
2263 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2264 enum sched_type schedule, kmp_uint32 lb,
2265 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2266 KMP_DEBUG_ASSERT(__kmp_init_serial);
2267 #if OMPT_SUPPORT && OMPT_OPTIONAL
2268 OMPT_STORE_RETURN_ADDRESS(gtid);
2270 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2274 See @ref __kmpc_dispatch_init_4
2276 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2277 enum sched_type schedule, kmp_int64 lb,
2278 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2279 KMP_DEBUG_ASSERT(__kmp_init_serial);
2280 #if OMPT_SUPPORT && OMPT_OPTIONAL
2281 OMPT_STORE_RETURN_ADDRESS(gtid);
2283 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2287 See @ref __kmpc_dispatch_init_4
2289 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2290 enum sched_type schedule, kmp_uint64 lb,
2291 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2292 KMP_DEBUG_ASSERT(__kmp_init_serial);
2293 #if OMPT_SUPPORT && OMPT_OPTIONAL
2294 OMPT_STORE_RETURN_ADDRESS(gtid);
2296 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2300 See @ref __kmpc_dispatch_init_4
2302 These functions differ from the __kmpc_dispatch_init set in that they are
2303 called for the composite distribute parallel for construct, so the per-team
2304 iteration space must be computed before regular iteration dispatching begins.
2306 These functions are all identical apart from the types of the arguments.
2308 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2309 enum sched_type schedule, kmp_int32 *p_last,
2310 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2311 kmp_int32 chunk) {
2312 KMP_DEBUG_ASSERT(__kmp_init_serial);
2313 #if OMPT_SUPPORT && OMPT_OPTIONAL
2314 OMPT_STORE_RETURN_ADDRESS(gtid);
2316 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2317 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2320 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2321 enum sched_type schedule, kmp_int32 *p_last,
2322 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2323 kmp_int32 chunk) {
2324 KMP_DEBUG_ASSERT(__kmp_init_serial);
2325 #if OMPT_SUPPORT && OMPT_OPTIONAL
2326 OMPT_STORE_RETURN_ADDRESS(gtid);
2328 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2329 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2332 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2333 enum sched_type schedule, kmp_int32 *p_last,
2334 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2335 kmp_int64 chunk) {
2336 KMP_DEBUG_ASSERT(__kmp_init_serial);
2337 #if OMPT_SUPPORT && OMPT_OPTIONAL
2338 OMPT_STORE_RETURN_ADDRESS(gtid);
2340 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2341 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2344 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2345 enum sched_type schedule, kmp_int32 *p_last,
2346 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2347 kmp_int64 chunk) {
2348 KMP_DEBUG_ASSERT(__kmp_init_serial);
2349 #if OMPT_SUPPORT && OMPT_OPTIONAL
2350 OMPT_STORE_RETURN_ADDRESS(gtid);
2352 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2353 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
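// A minimal sketch of the call ordering for the composite construct, e.g.
//   #pragma omp distribute parallel for schedule(dynamic, 4)
// The dist entry points first narrow [lb, ub] to this team's slice via
// __kmp_dist_get_bounds and then initialize regular dispatching on that slice,
// so a plausible caller-side sequence (argument values are illustrative) is:
//
//   kmp_int32 last;
//   __kmpc_dist_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked, &last,
//                               /*lb=*/0, /*ub=*/n - 1, /*st=*/1, /*chunk=*/4);
//   // ...followed by the usual __kmpc_dispatch_next_4 loop over the team's slice.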
2357 @param loc Source code location
2358 @param gtid Global thread id
2359 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2360 otherwise
2361 @param p_lb Pointer to the lower bound for the next chunk of work
2362 @param p_ub Pointer to the upper bound for the next chunk of work
2363 @param p_st Pointer to the stride for the next chunk of work
2364 @return one if there is work to be done, zero otherwise
2366 Get the next dynamically allocated chunk of work for this thread.
2367 If there is no more work, then the lb,ub and stride need not be modified.
2369 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2370 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2371 #if OMPT_SUPPORT && OMPT_OPTIONAL
2372 OMPT_STORE_RETURN_ADDRESS(gtid);
2374 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376 ,
2377 OMPT_LOAD_RETURN_ADDRESS(gtid)
2378 #endif
2379 );
2380 }
2383 See @ref __kmpc_dispatch_next_4
2385 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2386 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2387 kmp_uint32 *p_st) {
2388 #if OMPT_SUPPORT && OMPT_OPTIONAL
2389 OMPT_STORE_RETURN_ADDRESS(gtid);
2391 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
2393 ,
2394 OMPT_LOAD_RETURN_ADDRESS(gtid)
2395 #endif
2396 );
2397 }
2400 See @ref __kmpc_dispatch_next_4
2402 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2403 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405 OMPT_STORE_RETURN_ADDRESS(gtid);
2407 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2408 #if OMPT_SUPPORT && OMPT_OPTIONAL
2409 ,
2410 OMPT_LOAD_RETURN_ADDRESS(gtid)
2411 #endif
2412 );
2413 }
2416 See @ref __kmpc_dispatch_next_4
2418 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2419 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2420 kmp_uint64 *p_st) {
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422 OMPT_STORE_RETURN_ADDRESS(gtid);
2424 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426 ,
2427 OMPT_LOAD_RETURN_ADDRESS(gtid)
2428 #endif
2429 );
2430 }
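// A rough sketch of how a compiler-generated outlined function might drive a
// dynamically scheduled loop through these entry points; the lowering details
// vary by compiler and body(i) is a placeholder for the loop body.
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked,
//                          /*lb=*/0, /*ub=*/n - 1, /*st=*/1, /*chunk=*/4);
//   while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);
//   }
//   // For loops with an ordered clause the compiler typically also emits a
//   // __kmpc_dispatch_fini_4 call per chunk; see the fini entry points below.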
2433 @param loc Source code location
2434 @param gtid Global thread id
2436 Mark the end of a dynamic loop.
2438 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2439 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2443 See @ref __kmpc_dispatch_fini_4
2445 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2446 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2450 See @ref __kmpc_dispatch_fini_4
2452 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2453 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2457 See @ref __kmpc_dispatch_fini_4
2459 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2460 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2464 //-----------------------------------------------------------------------------
2465 // Non-template routines from kmp_dispatch.cpp used in other sources
2467 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2468 return value == checker;
2471 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2472 return value != checker;
2475 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2476 return value < checker;
2479 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2480 return value >= checker;
2483 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2484 return value <= checker;
2487 kmp_uint32
2488 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2489 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2490 void *obj // Higher-level synchronization object, or NULL.
2491 ) {
2492 // note: we may not belong to a team at this point
2493 volatile kmp_uint32 *spin = spinner;
2494 kmp_uint32 check = checker;
2495 kmp_uint32 spins;
2496 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2497 kmp_uint32 r;
2499 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2500 KMP_INIT_YIELD(spins);
2501 // main wait spin loop
2502 while (!f(r = TCR_4(*spin), check)) {
2503 KMP_FSYNC_SPIN_PREPARE(obj);
2504 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2505 split. It causes problems with infinite recursion because of exit lock */
2506 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2507 __kmp_abort_thread(); */
2509 /* if we have waited a bit, or are oversubscribed, yield */
2510 /* pause is in the following code */
2511 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2512 KMP_YIELD_SPIN(spins);
2513 }
2514 KMP_FSYNC_SPIN_ACQUIRED(obj);
2515 return r;
2516 }
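// A minimal usage sketch for the spin-wait helper above, pairing it with one of
// the predicate routines defined earlier in this section; ready_flag is a
// hypothetical variable, not a runtime field.
//
//   volatile kmp_uint32 ready_flag = 0; // another thread stores 1 when ready
//   __kmp_wait_yield_4(&ready_flag, 1, __kmp_eq_4,
//                      NULL /* no higher-level synchronization object */);
//   // Returns once __kmp_eq_4(value, 1) holds for the value read from the flag,
//   // yielding while the machine is oversubscribed.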
2518 void __kmp_wait_yield_4_ptr(
2519 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2520 void *obj // Higher-level synchronization object, or NULL.
2521 ) {
2522 // note: we may not belong to a team at this point
2523 void *spin = spinner;
2524 kmp_uint32 check = checker;
2525 kmp_uint32 spins;
2526 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2528 KMP_FSYNC_SPIN_INIT(obj, spin);
2529 KMP_INIT_YIELD(spins);
2530 // main wait spin loop
2531 while (!f(spin, check)) {
2532 KMP_FSYNC_SPIN_PREPARE(obj);
2533 /* if we have waited a bit, or are oversubscribed, yield */
2534 /* pause is in the following code */
2535 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2536 KMP_YIELD_SPIN(spins);
2537 }
2538 KMP_FSYNC_SPIN_ACQUIRED(obj);
2539 }
2543 #ifdef KMP_GOMP_COMPAT
2545 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2546 enum sched_type schedule, kmp_int32 lb,
2547 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2548 int push_ws) {
2549 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2550 push_ws);
2551 }
2553 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2554 enum sched_type schedule, kmp_uint32 lb,
2555 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2556 int push_ws) {
2557 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2558 push_ws);
2559 }
2561 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2562 enum sched_type schedule, kmp_int64 lb,
2563 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2564 int push_ws) {
2565 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2566 push_ws);
2567 }
2569 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2570 enum sched_type schedule, kmp_uint64 lb,
2571 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2572 int push_ws) {
2573 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2574 push_ws);
2575 }
2577 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2578 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2581 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2582 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2585 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2586 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2589 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2590 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2593 #endif /* KMP_GOMP_COMPAT */
2595 /* ------------------------------------------------------------------------ */