2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
13 /* Dynamic scheduling initialization and dispatch.
15 * NOTE: __kmp_nth is constant inside any given dispatch loop, but
16 * its value may change between parallel regions. __kmp_max_nth
17 * is the largest value __kmp_nth may take; 1 is the smallest.
21 #include "kmp_error.h"
24 #include "kmp_stats.h"
26 #if KMP_USE_X87CONTROL
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
36 #include "ompt-specific.h"
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
45 KMP_DEBUG_ASSERT(gtid_ref);
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
76 // default to monotonic
77 monotonicity = SCHEDULE_MONOTONIC;
78 if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79 monotonicity = SCHEDULE_NONMONOTONIC;
80 else if (SCHEDULE_HAS_MONOTONIC(schedule))
81 monotonicity = SCHEDULE_MONOTONIC;
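// For example, a loop annotated schedule(nonmonotonic: dynamic, 4) carries the
// nonmonotonic bit and yields SCHEDULE_NONMONOTONIC here, while a plain
// schedule(dynamic, 4) has neither modifier bit set and keeps the monotonic
// default chosen above.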
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule and chunk. The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride). nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used). tid is the id of the thread calling
90 // the function within the group of nproc threads. It will have a value
91 // between 0 and nproc - 1. This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop
94 // gtid is the global thread id
96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97 dispatch_private_info_template<T> *pr,
98 enum sched_type schedule, T lb, T ub,
99 typename traits_t<T>::signed_t st,
101 kmp_uint64 *cur_chunk,
103 typename traits_t<T>::signed_t chunk,
105 typedef typename traits_t<T>::unsigned_t UT;
106 typedef typename traits_t<T>::floating_t DBL;
116 typedef typename traits_t<T>::signed_t ST;
119 // create format specifiers before the debug output
120 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123 traits_t<T>::spec, traits_t<T>::spec,
124 traits_t<ST>::spec, traits_t<ST>::spec,
125 traits_t<T>::spec, traits_t<T>::spec);
126 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127 __kmp_str_free(&buff);
131 th = __kmp_threads[gtid];
132 team = th->th.th_team;
133 active = !team->t.t_serialized;
136 int itt_need_metadata_reporting =
137 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139 team->t.t_active_level == 1;
142 #if KMP_USE_HIER_SCHED
143 use_hier = pr->flags.use_hier;
148 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
152 /* Pick up the nomerge/ordered bits from the scheduling type */
153 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154 pr->flags.nomerge = TRUE;
156 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
158 pr->flags.nomerge = FALSE;
160 pr->type_size = traits_t<T>::type_size; // remember the size of variables
161 if (kmp_ord_lower & schedule) {
162 pr->flags.ordered = TRUE;
164 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
166 pr->flags.ordered = FALSE;
168 // Ordered overrides nonmonotonic
169 if (pr->flags.ordered) {
170 monotonicity = SCHEDULE_MONOTONIC;
173 if (schedule == kmp_sch_static) {
174 schedule = __kmp_static;
176 if (schedule == kmp_sch_runtime) {
177 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
179 schedule = team->t.t_sched.r_sched_type;
180 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182 // Detail the schedule if needed (global controls are differentiated appropriately)
184 if (schedule == kmp_sch_guided_chunked) {
185 schedule = __kmp_guided;
186 } else if (schedule == kmp_sch_static) {
187 schedule = __kmp_static;
189 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
191 chunk = team->t.t_sched.chunk;
199 // create format specifiers before the debug output
200 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201 "schedule:%%d chunk:%%%s\n",
203 KD_TRACE(10, (buff, gtid, schedule, chunk));
204 __kmp_str_free(&buff);
208 if (schedule == kmp_sch_guided_chunked) {
209 schedule = __kmp_guided;
212 chunk = KMP_DEFAULT_CHUNK;
216 if (schedule == kmp_sch_auto) {
217 // mapping and differentiation: in the __kmp_do_serial_initialize()
218 schedule = __kmp_auto;
222 // create format specifiers before the debug output
223 buff = __kmp_str_format(
224 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225 "schedule:%%d chunk:%%%s\n",
227 KD_TRACE(10, (buff, gtid, schedule, chunk));
228 __kmp_str_free(&buff);
232 #if KMP_STATIC_STEAL_ENABLED
233 // map nonmonotonic:dynamic to static steal
234 if (schedule == kmp_sch_dynamic_chunked) {
235 if (monotonicity == SCHEDULE_NONMONOTONIC)
236 schedule = kmp_sch_static_steal;
239 /* guided analytical not safe for too many threads */
240 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241 schedule = kmp_sch_guided_iterative_chunked;
242 KMP_WARNING(DispatchManyThreads);
244 if (schedule == kmp_sch_runtime_simd) {
245 // compiler provides simd_width in the chunk parameter
246 schedule = team->t.t_sched.r_sched_type;
247 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249 // Detail the schedule if needed (global controls are differentiated appropriately)
251 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252 schedule == __kmp_static) {
253 schedule = kmp_sch_static_balanced_chunked;
255 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256 schedule = kmp_sch_guided_simd;
258 chunk = team->t.t_sched.chunk * chunk;
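// For example, if OMP_SCHEDULE supplies a chunk of 8 and the compiler passed a
// simd width of 4 in the incoming chunk argument, the resulting chunk here is
// 8 * 4 = 32 iterations, so every dispatched chunk stays a whole multiple of
// the simd width.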
267 // create format specifiers before the debug output
268 buff = __kmp_str_format(
269 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
272 KD_TRACE(10, (buff, gtid, schedule, chunk));
273 __kmp_str_free(&buff);
277 pr->u.p.parm1 = chunk;
279 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280 "unknown scheduling type");
284 if (__kmp_env_consistency_check) {
286 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
290 // compute trip count
291 if (st == 1) { // most common case
299 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300 // where the division needs to be unsigned regardless of the result type
301 tc = (UT)(lb - ub) / (-st) + 1;
307 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308 // where the division needs to be unsigned regardless of the result type
309 tc = (UT)(ub - lb) / st + 1;
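// Worked example for the positive-stride branch: lb = 0, ub = 9, st = 2 gives
// tc = (9 - 0) / 2 + 1 = 5 iterations {0, 2, 4, 6, 8}; the negative-stride
// branch above mirrors this with the operands swapped.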
315 #if KMP_STATS_ENABLED
316 if (KMP_MASTER_GTID(gtid)) {
317 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
327 pr->u.p.last_upper = ub + st;
328 #endif /* KMP_OS_WINDOWS */
330 /* NOTE: only the active parallel region(s) has active ordered sections */
333 if (pr->flags.ordered) {
334 pr->ordered_bumped = 0;
335 pr->u.p.ordered_lower = 1;
336 pr->u.p.ordered_upper = 0;
341 #if (KMP_STATIC_STEAL_ENABLED)
342 case kmp_sch_static_steal: {
346 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
349 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350 if (nproc > 1 && ntc >= nproc) {
351 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
353 T small_chunk, extras;
355 small_chunk = ntc / nproc;
356 extras = ntc % nproc;
358 init = id * small_chunk + (id < extras ? id : extras);
359 pr->u.p.count = init;
360 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
363 // parm3 is the number of times to attempt stealing, which is
364 // proportional to the number of chunks per thread, capped at
365 // a maximum of nproc.
366 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
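// Rough illustration of the initial partition: with tc = 25, chunk = 2 and
// nproc = 4, ntc = 13 chunks, small_chunk = 3 and extras = 1, so thread 0 owns
// chunk indices [0, 4) and threads 1..3 own [4, 7), [7, 10) and [10, 13);
// 'count' is the next chunk index to execute and 'ub' is one past the last
// owned chunk. Here parm3 = min(3 + 1, 4) = 4 steal attempts.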
369 if (traits_t<T>::type_size > 4) {
370 // AC: TODO: check if 16-byte CAS available and use it to
371 // improve performance (probably wait for explicit request
372 // before spending time on this).
373 // For now use dynamically allocated per-thread lock,
374 // free memory in __kmp_dispatch_next when status==0.
375 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
376 th->th.th_dispatch->th_steal_lock =
377 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378 __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
382 /* too few chunks: switching to kmp_sch_dynamic_chunked */
383 schedule = kmp_sch_dynamic_chunked;
384 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385 "kmp_sch_dynamic_chunked\n",
387 if (pr->u.p.parm1 <= 0)
388 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
393 case kmp_sch_static_balanced: {
398 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
408 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
410 pr->u.p.count = 1; /* means no more chunks to execute */
411 pr->u.p.parm1 = FALSE;
415 T small_chunk = tc / nproc;
416 T extras = tc % nproc;
417 init = id * small_chunk + (id < extras ? id : extras);
418 limit = init + small_chunk - (id < extras ? 0 : 1);
419 pr->u.p.parm1 = (id == nproc - 1);
425 pr->u.p.parm1 = TRUE;
428 pr->u.p.count = 1; /* means no more chunks to execute */
429 pr->u.p.parm1 = FALSE;
434 // Calculate chunk for metadata report
435 if (itt_need_metadata_reporting)
437 *cur_chunk = limit - init + 1;
440 pr->u.p.lb = lb + init;
441 pr->u.p.ub = lb + limit;
443 // "ub_tmp" below is the calculated upper bound, "ub" is the user-defined upper bound
444 T ub_tmp = lb + limit * st;
445 pr->u.p.lb = lb + init * st;
446 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
449 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
451 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
454 if (pr->flags.ordered) {
455 pr->u.p.ordered_lower = init;
456 pr->u.p.ordered_upper = limit;
460 case kmp_sch_static_balanced_chunked: {
461 // similar to balanced, but chunk adjusted to multiple of simd width
463 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464 " -> falling-through to static_greedy\n",
466 schedule = kmp_sch_static_greedy;
468 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
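// Example of the rounding above (the mask trick assumes chunk, i.e. the simd
// width, is a power of two): tc = 100, nth = 8, chunk = 8 gives a per-thread
// share of ceil(100 / 8) = 13, which is then rounded up to parm1 = 16, the
// next multiple of the simd width.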
473 case kmp_sch_guided_simd:
474 case kmp_sch_guided_iterative_chunked: {
477 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
482 if ((2L * chunk + 1) * nproc >= tc) {
483 /* chunk size too large, switch to dynamic */
484 schedule = kmp_sch_dynamic_chunked;
486 // when remaining iters become less than parm2 - switch to dynamic
487 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488 *(double *)&pr->u.p.parm3 =
489 guided_flt_param / nproc; // may occupy parm3 and parm4
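// Sketch of the guided-iterative parameters, assuming the usual defaults of
// guided_int_param = 2 and guided_flt_param = 0.5: with nproc = 4 and
// chunk = 3, parm2 = 2 * 4 * 4 = 32, so the schedule degenerates to plain
// dynamic once fewer than 32 iterations remain; before that each grab is
// roughly remaining * 0.5 / 4, i.e. about 1/8 of the remaining iterations.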
492 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493 "kmp_sch_static_greedy\n",
495 schedule = kmp_sch_static_greedy;
496 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
499 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
505 case kmp_sch_guided_analytical_chunked: {
506 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507 "kmp_sch_guided_analytical_chunked case\n",
511 if ((2L * chunk + 1) * nproc >= tc) {
512 /* chunk size too large, switch to dynamic */
513 schedule = kmp_sch_dynamic_chunked;
515 /* commonly used term: (2 nproc - 1)/(2 nproc) */
518 #if KMP_USE_X87CONTROL
519 /* Linux* OS already has 64-bit computation by default for long double,
520 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522 instead of the default 53-bit. Even though long double doesn't work
523 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524 expected to impact the correctness of the algorithm, but this has not
525 been mathematically proven. */
526 // save original FPCW and set precision to 64-bit, as
527 // Windows* OS on IA-32 architecture defaults to 53-bit
528 unsigned int oldFpcw = _control87(0, 0);
529 _control87(_PC_64, _MCW_PC); // 0,0x30000
531 /* value used for comparison in solver for cross-over point */
532 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
534 /* crossover point--chunk indexes equal to or greater than
535 this point switch to dynamic-style scheduling */
538 /* commonly used term: (2 nproc - 1)/(2 nproc) */
539 x = (long double)1.0 - (long double)0.5 / nproc;
542 { // test natural alignment
550 ptrdiff_t natural_alignment =
551 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553 // long)natural_alignment );
555 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
559 /* save the term in thread private dispatch structure */
560 *(DBL *)&pr->u.p.parm3 = x;
562 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
568 /* estimate initial upper and lower bound */
570 /* doesn't matter what value right is as long as it is positive, but
571 it affects performance of the solver */
573 p = __kmp_pow<UT>(x, right);
578 } while (p > target && right < (1 << 27));
579 /* lower bound is previous (failed) estimate of upper bound */
585 /* bisection root-finding method */
586 while (left + 1 < right) {
587 mid = (left + right) / 2;
588 if (__kmp_pow<UT>(x, mid) > target) {
596 /* assert sanity of computed crossover point */
597 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598 __kmp_pow<UT>(x, cross) <= target);
600 /* save the crossover point in thread private dispatch structure */
601 pr->u.p.parm2 = cross;
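// Informally, 'cross' is the smallest chunk index at which the geometrically
// shrinking guided chunk would drop to about chunk iterations, so chunks with
// index >= cross are handed out dynamic-style. E.g. with nproc = 2, chunk = 1
// and tc = 100, x = 0.75 and target = 0.06; 0.75^9 is about 0.075 and 0.75^10
// about 0.056, so the bisection yields cross = 10.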
604 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
607 #define GUIDED_ANALYTICAL_WORKAROUND (x)
609 /* dynamic-style scheduling offset */
610 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
613 #if KMP_USE_X87CONTROL
615 _control87(oldFpcw, _MCW_PC);
619 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620 "kmp_sch_static_greedy\n",
622 schedule = kmp_sch_static_greedy;
623 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
628 case kmp_sch_static_greedy:
631 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
633 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
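// E.g. tc = 10 and nproc = 4 gives parm1 = ceil(10 / 4) = 3, so each thread
// receives a single contiguous block of at most 3 iterations.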
635 case kmp_sch_static_chunked:
636 case kmp_sch_dynamic_chunked:
637 if (pr->u.p.parm1 <= 0) {
638 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
640 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
644 case kmp_sch_trapezoidal: {
645 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
647 T parm1, parm2, parm3, parm4;
649 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
654 /* F : size of the first cycle */
655 parm2 = (tc / (2 * nproc));
661 /* L : size of the last cycle. Make sure the last cycle is not larger
662 than the first cycle. */
665 } else if (parm1 > parm2) {
669 /* N : number of cycles */
670 parm3 = (parm2 + parm1);
671 parm3 = (2 * tc + parm3 - 1) / parm3;
677 /* sigma : decreasing incr of the trapezoid */
679 parm4 = (parm2 - parm1) / parm4;
681 // pointless check, because parm4 >= 0 always
682 // if ( parm4 < 0 ) {
686 pr->u.p.parm1 = parm1;
687 pr->u.p.parm2 = parm2;
688 pr->u.p.parm3 = parm3;
689 pr->u.p.parm4 = parm4;
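// Rough worked example of the trapezoid parameters: tc = 128, nproc = 2 and a
// minimum chunk of 4 give parm2 = 128 / 4 = 32 (first chunk), parm3 =
// ceil(2 * 128 / 36) = 8 cycles, and the per-cycle decrement works out to
// parm4 = (32 - 4) / (8 - 1) = 4, so chunk sizes run 32, 28, 24, ..., 4,
// covering (32 + 4) * 8 / 2 = 144 >= 128 iterations.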
694 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695 KMP_HNT(GetNewerLibrary), // Hint
696 __kmp_msg_null // Variadic argument list terminator
700 pr->schedule = schedule;
703 #if KMP_USE_HIER_SCHED
704 template <typename T>
705 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706 typename traits_t<T>::signed_t st);
709 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710 kmp_int32 ub, kmp_int32 st) {
711 __kmp_dispatch_init_hierarchy<kmp_int32>(
712 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
717 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718 kmp_uint32 ub, kmp_int32 st) {
719 __kmp_dispatch_init_hierarchy<kmp_uint32>(
720 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
725 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726 kmp_int64 ub, kmp_int64 st) {
727 __kmp_dispatch_init_hierarchy<kmp_int64>(
728 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
733 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734 kmp_uint64 ub, kmp_int64 st) {
735 __kmp_dispatch_init_hierarchy<kmp_uint64>(
736 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
740 // free all the hierarchy scheduling memory associated with the team
741 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743 for (int i = 0; i < num_disp_buff; ++i) {
744 // type does not matter here so use kmp_int32
746 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747 &team->t.t_disp_buffer[i]);
749 sh->hier->deallocate();
750 __kmp_free(sh->hier);
756 // UT - unsigned flavor of T, ST - signed flavor of T,
757 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758 template <typename T>
760 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761 T ub, typename traits_t<T>::signed_t st,
762 typename traits_t<T>::signed_t chunk, int push_ws) {
763 typedef typename traits_t<T>::unsigned_t UT;
768 kmp_uint32 my_buffer_index;
769 dispatch_private_info_template<T> *pr;
770 dispatch_shared_info_template<T> volatile *sh;
772 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773 sizeof(dispatch_private_info));
774 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775 sizeof(dispatch_shared_info));
777 if (!TCR_4(__kmp_init_parallel))
778 __kmp_parallel_initialize();
780 __kmp_resume_if_soft_paused();
782 #if INCLUDE_SSC_MARKS
783 SSC_MARK_DISPATCH_INIT();
786 typedef typename traits_t<T>::signed_t ST;
789 // create format specifiers before the debug output
790 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
791 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
792 traits_t<ST>::spec, traits_t<T>::spec,
793 traits_t<T>::spec, traits_t<ST>::spec);
794 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
795 __kmp_str_free(&buff);
799 th = __kmp_threads[gtid];
800 team = th->th.th_team;
801 active = !team->t.t_serialized;
802 th->th.th_ident = loc;
804 // Any half-decent optimizer will remove this test when the blocks are empty
805 // since the macros expand to nothing
806 // when statistics are disabled.
807 if (schedule == __kmp_static) {
808 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
810 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
813 #if KMP_USE_HIER_SCHED
814 // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE environment variable
815 // Hierarchical scheduling does not work with ordered, so if ordered is
816 // detected, then revert to threaded scheduling.
818 enum sched_type my_sched = schedule;
819 my_buffer_index = th->th.th_dispatch->th_disp_index;
820 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
822 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
823 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
824 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
826 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
827 ordered = (kmp_ord_lower & my_sched);
828 if (pr->flags.use_hier) {
830 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
831 "Disabling hierarchical scheduling.\n",
833 pr->flags.use_hier = FALSE;
836 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
837 // Don't use hierarchical for ordered parallel loops and don't
838 // use the runtime hierarchy if one was specified in the program
839 if (!ordered && !pr->flags.use_hier)
840 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
842 #endif // KMP_USE_HIER_SCHED
845 kmp_uint64 cur_chunk = chunk;
846 int itt_need_metadata_reporting =
847 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
848 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
849 team->t.t_active_level == 1;
852 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
853 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
855 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
856 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
858 my_buffer_index = th->th.th_dispatch->th_disp_index++;
860 /* What happens when number of threads changes, need to resize buffer? */
861 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
863 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
865 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
866 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
870 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
874 chunk, (T)th->th.th_team_nproc,
875 (T)th->th.th_info.ds.ds_tid);
877 if (pr->flags.ordered == 0) {
878 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
879 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
881 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
882 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
887 /* The name of this buffer should be my_buffer_index when it's free to use it */
890 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
891 "sh->buffer_index:%d\n",
892 gtid, my_buffer_index, sh->buffer_index));
893 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
894 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
895 // Note: KMP_WAIT() cannot be used here: buffer index and
896 // my_buffer_index are *always* 32-bit integers.
897 KMP_MB(); /* is this necessary? */
898 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
899 "sh->buffer_index:%d\n",
900 gtid, my_buffer_index, sh->buffer_index));
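// In effect the wait above bounds how far a thread can run ahead: at most
// __kmp_dispatch_num_buffers dynamically scheduled loops can be in flight per
// team, since sh->buffer_index is only advanced (in __kmp_dispatch_next) once
// every thread has finished with the shared buffer.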
902 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
903 th->th.th_dispatch->th_dispatch_sh_current =
904 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
906 if (pr->flags.ordered) {
907 __kmp_itt_ordered_init(gtid);
909 // Report loop metadata
910 if (itt_need_metadata_reporting) {
911 // Only report metadata by master of active team at level 1
912 kmp_uint64 schedtype = 0;
914 case kmp_sch_static_chunked:
915 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
917 case kmp_sch_static_greedy:
918 cur_chunk = pr->u.p.parm1;
920 case kmp_sch_dynamic_chunked:
923 case kmp_sch_guided_iterative_chunked:
924 case kmp_sch_guided_analytical_chunked:
925 case kmp_sch_guided_simd:
929 // Should we put this case under "static"?
930 // case kmp_sch_static_steal:
934 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
936 #if KMP_USE_HIER_SCHED
937 if (pr->flags.use_hier) {
939 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
941 #endif // KMP_USE_HIER_SCHED
942 #endif /* USE_ITT_BUILD */
948 // create format specifiers before the debug output
949 buff = __kmp_str_format(
950 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
952 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
953 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
954 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
955 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
956 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
957 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
958 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
959 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
960 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
961 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
962 __kmp_str_free(&buff);
965 #if (KMP_STATIC_STEAL_ENABLED)
966 // It cannot be guaranteed that after execution of a loop with some other
967 // schedule kind all the parm3 variables will contain the same value. Even if
968 // they all did, a bad case could still occur, e.g. if the values 0 and 1 were
969 // reused rather than a counter incremented over the program's lifetime. So a
970 // dedicated variable is required; 'static_steal_counter' is used for this.
971 if (schedule == kmp_sch_static_steal) {
972 // Other threads will inspect this variable when searching for a victim.
973 // This is a flag showing that other threads may steal from this thread
975 volatile T *p = &pr->u.p.static_steal_counter;
978 #endif // ( KMP_STATIC_STEAL_ENABLED )
980 #if OMPT_SUPPORT && OMPT_OPTIONAL
981 if (ompt_enabled.ompt_callback_work) {
982 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
983 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
984 ompt_callbacks.ompt_callback(ompt_callback_work)(
985 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
986 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
989 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
992 /* For ordered loops, either __kmp_dispatch_finish() should be called after
993 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
994 * every chunk of iterations. If the ordered section(s) were not executed
995 * for this iteration (or every iteration in this chunk), we need to set the
996 * ordered iteration counters so that the next thread can proceed. */
997 template <typename UT>
998 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
999 typedef typename traits_t<UT>::signed_t ST;
1000 kmp_info_t *th = __kmp_threads[gtid];
1002 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1003 if (!th->th.th_team->t.t_serialized) {
1005 dispatch_private_info_template<UT> *pr =
1006 reinterpret_cast<dispatch_private_info_template<UT> *>(
1007 th->th.th_dispatch->th_dispatch_pr_current);
1008 dispatch_shared_info_template<UT> volatile *sh =
1009 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1010 th->th.th_dispatch->th_dispatch_sh_current);
1011 KMP_DEBUG_ASSERT(pr);
1012 KMP_DEBUG_ASSERT(sh);
1013 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1014 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1016 if (pr->ordered_bumped) {
1019 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1021 pr->ordered_bumped = 0;
1023 UT lower = pr->u.p.ordered_lower;
1028 // create format specifiers before the debug output
1029 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1030 "ordered_iteration:%%%s lower:%%%s\n",
1031 traits_t<UT>::spec, traits_t<UT>::spec);
1032 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1033 __kmp_str_free(&buff);
1037 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1038 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
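// E.g. a thread whose chunk starts at ordered_lower = 5 spins here until
// sh->u.s.ordered_iteration reaches 5 (all earlier iterations have passed
// their ordered section); the increment further below then releases the
// owner of iteration 6.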
1039 KMP_MB(); /* is this necessary? */
1043 // create format specifiers before the debug output
1044 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1045 "ordered_iteration:%%%s lower:%%%s\n",
1046 traits_t<UT>::spec, traits_t<UT>::spec);
1047 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1048 __kmp_str_free(&buff);
1052 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1055 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1058 #ifdef KMP_GOMP_COMPAT
1060 template <typename UT>
1061 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1062 typedef typename traits_t<UT>::signed_t ST;
1063 kmp_info_t *th = __kmp_threads[gtid];
1065 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1066 if (!th->th.th_team->t.t_serialized) {
1068 dispatch_private_info_template<UT> *pr =
1069 reinterpret_cast<dispatch_private_info_template<UT> *>(
1070 th->th.th_dispatch->th_dispatch_pr_current);
1071 dispatch_shared_info_template<UT> volatile *sh =
1072 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1073 th->th.th_dispatch->th_dispatch_sh_current);
1074 KMP_DEBUG_ASSERT(pr);
1075 KMP_DEBUG_ASSERT(sh);
1076 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1077 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1079 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1080 UT lower = pr->u.p.ordered_lower;
1081 UT upper = pr->u.p.ordered_upper;
1082 UT inc = upper - lower + 1;
1084 if (pr->ordered_bumped == inc) {
1087 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1089 pr->ordered_bumped = 0;
1091 inc -= pr->ordered_bumped;
1096 // create format specifiers before the debug output
1097 buff = __kmp_str_format(
1098 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1099 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1100 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1101 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1102 __kmp_str_free(&buff);
1106 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1107 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1109 KMP_MB(); /* is this necessary? */
1110 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1111 "ordered_bumped to zero\n",
1113 pr->ordered_bumped = 0;
1114 //!!!!! TODO check if the inc should be unsigned, or signed???
1118 // create format specifiers before the debug output
1119 buff = __kmp_str_format(
1120 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1121 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1122 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1123 traits_t<UT>::spec);
1125 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1126 __kmp_str_free(&buff);
1130 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1134 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1137 #endif /* KMP_GOMP_COMPAT */
1139 template <typename T>
1140 int __kmp_dispatch_next_algorithm(int gtid,
1141 dispatch_private_info_template<T> *pr,
1142 dispatch_shared_info_template<T> volatile *sh,
1143 kmp_int32 *p_last, T *p_lb, T *p_ub,
1144 typename traits_t<T>::signed_t *p_st, T nproc,
1146 typedef typename traits_t<T>::unsigned_t UT;
1147 typedef typename traits_t<T>::signed_t ST;
1148 typedef typename traits_t<T>::floating_t DBL;
1153 UT limit, trip, init;
1154 kmp_info_t *th = __kmp_threads[gtid];
1155 kmp_team_t *team = th->th.th_team;
1157 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1158 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1159 KMP_DEBUG_ASSERT(pr);
1160 KMP_DEBUG_ASSERT(sh);
1161 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1165 // create format specifiers before the debug output
1167 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1168 "sh:%%p nproc:%%%s tid:%%%s\n",
1169 traits_t<T>::spec, traits_t<T>::spec);
1170 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1171 __kmp_str_free(&buff);
1176 if (pr->u.p.tc == 0) {
1178 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1184 switch (pr->schedule) {
1185 #if (KMP_STATIC_STEAL_ENABLED)
1186 case kmp_sch_static_steal: {
1187 T chunk = pr->u.p.parm1;
1190 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1193 trip = pr->u.p.tc - 1;
1195 if (traits_t<T>::type_size > 4) {
1196 // use lock for 8-byte and CAS for 4-byte induction
1197 // variable. TODO (optional): check and use 16-byte CAS
1198 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1199 KMP_DEBUG_ASSERT(lck != NULL);
1200 if (pr->u.p.count < (UT)pr->u.p.ub) {
1201 __kmp_acquire_lock(lck, gtid);
1202 // try to get own chunk of iterations
1203 init = (pr->u.p.count)++;
1204 status = (init < (UT)pr->u.p.ub);
1205 __kmp_release_lock(lck, gtid);
1207 status = 0; // no own chunks
1209 if (!status) { // try to steal
1210 kmp_info_t **other_threads = team->t.t_threads;
1211 int while_limit = pr->u.p.parm3;
1212 int while_index = 0;
1213 // TODO: the victim-search algorithm
1214 // should be cleaned up and measured
1215 while ((!status) && (while_limit != ++while_index)) {
1217 T victimIdx = pr->u.p.parm4;
1218 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1219 dispatch_private_info_template<T> *victim =
1220 reinterpret_cast<dispatch_private_info_template<T> *>(
1221 other_threads[victimIdx]
1222 ->th.th_dispatch->th_dispatch_pr_current);
1223 while ((victim == NULL || victim == pr ||
1224 (*(volatile T *)&victim->u.p.static_steal_counter !=
1225 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1226 oldVictimIdx != victimIdx) {
1227 victimIdx = (victimIdx + 1) % nproc;
1228 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1229 other_threads[victimIdx]
1230 ->th.th_dispatch->th_dispatch_pr_current);
1232 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1233 *(volatile T *)&pr->u.p.static_steal_counter)) {
1234 continue; // try once more (nproc attempts in total)
1235 // no victim is ready yet to participate in stealing
1236 // because all victims are still in kmp_init_dispatch
1238 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1239 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1240 continue; // not enough chunks to steal, goto next victim
1243 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1244 KMP_ASSERT(lck != NULL);
1245 __kmp_acquire_lock(lck, gtid);
1246 limit = victim->u.p.ub; // keep initial ub
1247 if (victim->u.p.count >= limit ||
1248 (remaining = limit - victim->u.p.count) < 2) {
1249 __kmp_release_lock(lck, gtid);
1250 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1251 continue; // not enough chunks to steal
1253 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or by 1
1255 if (remaining > 3) {
1256 // steal 1/4 of remaining
1257 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1258 init = (victim->u.p.ub -= (remaining >> 2));
1260 // steal 1 chunk of 2 or 3 remaining
1261 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1262 init = (victim->u.p.ub -= 1);
1264 __kmp_release_lock(lck, gtid);
1266 KMP_DEBUG_ASSERT(init + 1 <= limit);
1267 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1270 // now update own count and ub with the stolen range, excluding the init chunk
1271 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1272 pr->u.p.count = init + 1;
1274 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
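// Illustration: if the victim had count = 4 and ub = 20, 16 chunks remained,
// so the thief lowered victim->u.p.ub to 16 and took init = 16; it executes
// chunk 16 right away and keeps chunks 17..19 for later (count = 17 and its
// own ub set to the victim's original ub, 20).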
1275 } // while (search for victim)
1276 } // if (try to find victim and steal)
1278 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1286 // All operations on 'count' or 'ub' must be combined atomically
1289 union_i4 vold, vnew;
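// The union packs the 32-bit (count, ub) pair into a single 64-bit word so
// that one 64-bit CAS can claim a chunk, or adjust both fields during a
// steal, atomically: vold holds the value observed and vnew the proposed
// replacement.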
1290 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1293 while (!KMP_COMPARE_AND_STORE_ACQ64(
1294 (volatile kmp_int64 *)&pr->u.p.count,
1295 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1296 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1298 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1303 init = vnew.p.count;
1304 status = (init < (UT)vnew.p.ub);
1308 kmp_info_t **other_threads = team->t.t_threads;
1309 int while_limit = pr->u.p.parm3;
1310 int while_index = 0;
1312 // TODO: the victim-search algorithm
1313 // should be cleaned up and measured
1314 while ((!status) && (while_limit != ++while_index)) {
1315 union_i4 vold, vnew;
1316 kmp_int32 remaining;
1317 T victimIdx = pr->u.p.parm4;
1318 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1319 dispatch_private_info_template<T> *victim =
1320 reinterpret_cast<dispatch_private_info_template<T> *>(
1321 other_threads[victimIdx]
1322 ->th.th_dispatch->th_dispatch_pr_current);
1323 while ((victim == NULL || victim == pr ||
1324 (*(volatile T *)&victim->u.p.static_steal_counter !=
1325 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1326 oldVictimIdx != victimIdx) {
1327 victimIdx = (victimIdx + 1) % nproc;
1328 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1329 other_threads[victimIdx]
1330 ->th.th_dispatch->th_dispatch_pr_current);
1332 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1333 *(volatile T *)&pr->u.p.static_steal_counter)) {
1334 continue; // try once more (nproc attempts in total)
1335 // no victim is ready yet to participate in stealing
1336 // because all victims are still in kmp_init_dispatch
1338 pr->u.p.parm4 = victimIdx; // new victim found
1339 while (1) { // CAS loop if victim has enough chunks to steal
1340 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1343 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1344 if (vnew.p.count >= (UT)vnew.p.ub ||
1345 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1346 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1347 break; // not enough chunks to steal, goto next victim
1349 if (remaining > 3) {
1350 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1352 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1354 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1355 // TODO: Should this be acquire or release?
1356 if (KMP_COMPARE_AND_STORE_ACQ64(
1357 (volatile kmp_int64 *)&victim->u.p.count,
1358 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1359 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1360 // stealing succeeded
1361 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1362 vold.p.ub - vnew.p.ub);
1365 // now update own count and ub
1367 vold.p.count = init + 1;
1369 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1371 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1374 } // if (check CAS result)
1375 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1376 } // while (try to steal from particular victim)
1377 } // while (search for victim)
1378 } // if (try to find victim and steal)
1379 } // if (4-byte induction variable)
1386 start = pr->u.p.parm2;
1388 limit = chunk + init - 1;
1390 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1392 KMP_DEBUG_ASSERT(init <= trip);
1393 if ((last = (limit >= trip)) != 0)
1399 *p_lb = start + init;
1400 *p_ub = start + limit;
1402 *p_lb = start + init * incr;
1403 *p_ub = start + limit * incr;
1406 if (pr->flags.ordered) {
1407 pr->u.p.ordered_lower = init;
1408 pr->u.p.ordered_upper = limit;
1413 #endif // ( KMP_STATIC_STEAL_ENABLED )
1414 case kmp_sch_static_balanced: {
1417 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1419 /* check if thread has any iteration to do */
1420 if ((status = !pr->u.p.count) != 0) {
1424 last = pr->u.p.parm1;
1427 } else { /* no iterations to do */
1428 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1432 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1434 case kmp_sch_static_chunked: {
1437 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1438 "kmp_sch_static_[affinity|chunked] case\n",
1440 parm1 = pr->u.p.parm1;
1442 trip = pr->u.p.tc - 1;
1443 init = parm1 * (pr->u.p.count + tid);
1445 if ((status = (init <= trip)) != 0) {
1448 limit = parm1 + init - 1;
1450 if ((last = (limit >= trip)) != 0)
1456 pr->u.p.count += nproc;
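// E.g. with parm1 = 5 (chunk), nproc = 4 and tid = 1, the first chunk starts
// at iteration 5 * (0 + 1) = 5 and covers 5..9; after count += nproc the next
// one starts at 5 * (4 + 1) = 25, so each thread takes every nproc-th chunk
// of parm1 iterations.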
1459 *p_lb = start + init;
1460 *p_ub = start + limit;
1462 *p_lb = start + init * incr;
1463 *p_ub = start + limit * incr;
1466 if (pr->flags.ordered) {
1467 pr->u.p.ordered_lower = init;
1468 pr->u.p.ordered_upper = limit;
1474 case kmp_sch_dynamic_chunked: {
1475 T chunk = pr->u.p.parm1;
1479 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1482 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1483 trip = pr->u.p.tc - 1;
1485 if ((status = (init <= trip)) == 0) {
1492 limit = chunk + init - 1;
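// E.g. with chunk = 4, a thread whose atomic fetch-and-increment above
// returned 3 gets init = 12 and limit = 15, i.e. logical iterations 12..15
// (scaled by lb/st when the bounds are materialized below).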
1495 if ((last = (limit >= trip)) != 0)
1502 *p_lb = start + init;
1503 *p_ub = start + limit;
1505 *p_lb = start + init * incr;
1506 *p_ub = start + limit * incr;
1509 if (pr->flags.ordered) {
1510 pr->u.p.ordered_lower = init;
1511 pr->u.p.ordered_upper = limit;
1517 case kmp_sch_guided_iterative_chunked: {
1518 T chunkspec = pr->u.p.parm1;
1519 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1523 // Start atomic part of calculations
1525 ST remaining; // signed, because can be < 0
1526 init = sh->u.s.iteration; // shared value
1527 remaining = trip - init;
1528 if (remaining <= 0) { // AC: need to compare with 0 first
1529 // nothing to do, don't try atomic op
1534 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1535 // use dynamic-style schedule
1536 // atomically increment iterations, get old value
1537 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1539 remaining = trip - init;
1540 if (remaining <= 0) {
1541 status = 0; // all iterations got by other threads
1543 // got some iterations to work on
1545 if ((T)remaining > chunkspec) {
1546 limit = init + chunkspec - 1;
1548 last = 1; // the last chunk
1549 limit = init + remaining - 1;
1555 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1556 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1557 (ST)init, (ST)limit)) {
1558 // CAS was successful, chunk obtained
1569 *p_lb = start + init * incr;
1570 *p_ub = start + limit * incr;
1571 if (pr->flags.ordered) {
1572 pr->u.p.ordered_lower = init;
1573 pr->u.p.ordered_upper = limit;
1584 case kmp_sch_guided_simd: {
1585 // same as iterative but curr-chunk adjusted to be a multiple of the given chunk
1587 T chunk = pr->u.p.parm1;
1589 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1592 // Start atomic part of calculations
1594 ST remaining; // signed, because can be < 0
1595 init = sh->u.s.iteration; // shared value
1596 remaining = trip - init;
1597 if (remaining <= 0) { // AC: need to compare with 0 first
1598 status = 0; // nothing to do, don't try atomic op
1601 KMP_DEBUG_ASSERT(init % chunk == 0);
1602 // compare with K*nproc*(chunk+1), K=2 by default
1603 if ((T)remaining < pr->u.p.parm2) {
1604 // use dynamic-style schedule
1605 // atomically increment iterations, get old value
1606 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1608 remaining = trip - init;
1609 if (remaining <= 0) {
1610 status = 0; // all iterations got by other threads
1612 // got some iterations to work on
1614 if ((T)remaining > chunk) {
1615 limit = init + chunk - 1;
1617 last = 1; // the last chunk
1618 limit = init + remaining - 1;
1623 // divide by K*nproc
1624 UT span = remaining * (*(double *)&pr->u.p.parm3);
1625 UT rem = span % chunk;
1626 if (rem) // adjust so that span%chunk == 0
1627 span += chunk - rem;
1628 limit = init + span;
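// Sketch of the rounding, assuming parm3 holds guided_flt_param / nproc
// (e.g. 0.5 / 4 = 0.125): with remaining = 37 and chunk = 8, span starts as
// (UT)(37 * 0.125) = 4, is rounded up to 8, and the grabbed block stays a
// whole multiple of the simd chunk.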
1629 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1630 (ST)init, (ST)limit)) {
1631 // CAS was successful, chunk obtained
1642 *p_lb = start + init * incr;
1643 *p_ub = start + limit * incr;
1644 if (pr->flags.ordered) {
1645 pr->u.p.ordered_lower = init;
1646 pr->u.p.ordered_upper = limit;
1657 case kmp_sch_guided_analytical_chunked: {
1658 T chunkspec = pr->u.p.parm1;
1660 #if KMP_USE_X87CONTROL
1661 /* for storing original FPCW value for Windows* OS on
1662 IA-32 architecture 8-byte version */
1663 unsigned int oldFpcw;
1664 unsigned int fpcwSet = 0;
1666 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1667 "kmp_sch_guided_analytical_chunked case\n",
1672 KMP_DEBUG_ASSERT(nproc > 1);
1673 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1675 while (1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1677 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1678 if (chunkIdx >= (UT)pr->u.p.parm2) {
1680 /* use dynamic-style scheduling */
1681 init = chunkIdx * chunkspec + pr->u.p.count;
1682 /* need to verify init > 0 in case of overflow in the above calculation */
1684 if ((status = (init > 0 && init <= trip)) != 0) {
1685 limit = init + chunkspec - 1;
1687 if ((last = (limit >= trip)) != 0)
1692 /* use exponential-style scheduling */
1693 /* The following check works around the lack of long double precision on
1695 Windows* OS, and the possible resulting effect that init != 0 for chunkIdx == 0. */
1697 #if KMP_USE_X87CONTROL
1698 /* If we haven't already done so, save original
1699 FPCW and set precision to 64-bit, as Windows* OS
1700 on IA-32 architecture defaults to 53-bit */
1702 oldFpcw = _control87(0, 0);
1703 _control87(_PC_64, _MCW_PC);
1708 init = __kmp_dispatch_guided_remaining<T>(
1709 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1710 KMP_DEBUG_ASSERT(init);
1714 limit = trip - __kmp_dispatch_guided_remaining<T>(
1715 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1716 KMP_ASSERT(init <= limit);
1718 KMP_DEBUG_ASSERT(limit <= trip);
1725 #if KMP_USE_X87CONTROL
1726 /* restore FPCW if necessary
1727 AC: check fpcwSet flag first because oldFpcw can be uninitialized here */
1729 if (fpcwSet && (oldFpcw & fpcwSet))
1730 _control87(oldFpcw, _MCW_PC);
1737 *p_lb = start + init * incr;
1738 *p_ub = start + limit * incr;
1739 if (pr->flags.ordered) {
1740 pr->u.p.ordered_lower = init;
1741 pr->u.p.ordered_upper = limit;
1752 case kmp_sch_trapezoidal: {
1754 T parm2 = pr->u.p.parm2;
1755 T parm3 = pr->u.p.parm3;
1756 T parm4 = pr->u.p.parm4;
1758 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1761 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1763 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1764 trip = pr->u.p.tc - 1;
1766 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1773 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1776 if ((last = (limit >= trip)) != 0)
1783 *p_lb = start + init;
1784 *p_ub = start + limit;
1786 *p_lb = start + init * incr;
1787 *p_ub = start + limit * incr;
1790 if (pr->flags.ordered) {
1791 pr->u.p.ordered_lower = init;
1792 pr->u.p.ordered_upper = limit;
1798 status = 0; // to avoid complaints on uninitialized variable use
1799 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1800 KMP_HNT(GetNewerLibrary), // Hint
1801 __kmp_msg_null // Variadic argument list terminator
1808 if (pr->flags.ordered) {
1810 // create format specifiers before the debug output
1811 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1812 "ordered_lower:%%%s ordered_upper:%%%s\n",
1813 traits_t<UT>::spec, traits_t<UT>::spec);
1814 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1815 __kmp_str_free(&buff);
1819 // create format specifiers before the debug output
1820 buff = __kmp_str_format(
1821 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1822 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1823 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1824 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1825 __kmp_str_free(&buff);
1831 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1832 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini() is not called. */
1834 #if OMPT_SUPPORT && OMPT_OPTIONAL
1835 #define OMPT_LOOP_END \
1836 if (status == 0) { \
1837 if (ompt_enabled.ompt_callback_work) { \
1838 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1839 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1840 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1841 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1842 &(task_info->task_data), 0, codeptr); \
1845 // TODO: implement count
1847 #define OMPT_LOOP_END // no-op
1850 #if KMP_STATS_ENABLED
1851 #define KMP_STATS_LOOP_END \
1853 kmp_int64 u, l, t, i; \
1854 l = (kmp_int64)(*p_lb); \
1855 u = (kmp_int64)(*p_ub); \
1856 i = (kmp_int64)(pr->u.p.st); \
1857 if (status == 0) { \
1859 KMP_POP_PARTITIONED_TIMER(); \
1860 } else if (i == 1) { \
1865 } else if (i < 0) { \
1867 t = (l - u) / (-i) + 1; \
1872 t = (u - l) / i + 1; \
1876 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1879 #define KMP_STATS_LOOP_END /* Nothing */
1882 template <typename T>
1883 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1885 typename traits_t<T>::signed_t *p_st
1886 #if OMPT_SUPPORT && OMPT_OPTIONAL
1892 typedef typename traits_t<T>::unsigned_t UT;
1893 typedef typename traits_t<T>::signed_t ST;
1894 // This is potentially slightly misleading: schedule(runtime) will appear here
1895 // even if the actual runtime schedule is static. (Which points out a
1896 // disadvantage of schedule(runtime): even when static scheduling is used, it
1897 // costs more than a compile-time choice of static scheduling would.)
1898 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1901 dispatch_private_info_template<T> *pr;
1902 kmp_info_t *th = __kmp_threads[gtid];
1903 kmp_team_t *team = th->th.th_team;
1905 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1908 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1909 gtid, p_lb, p_ub, p_st, p_last));
1911 if (team->t.t_serialized) {
1912 /* NOTE: serialize this dispatch because we are not at the active level */
1913 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1914 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1915 KMP_DEBUG_ASSERT(pr);
1917 if ((status = (pr->u.p.tc != 0)) == 0) {
1920 // if ( p_last != NULL )
1924 if (__kmp_env_consistency_check) {
1925 if (pr->pushed_ws != ct_none) {
1926 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1929 } else if (pr->flags.nomerge) {
1932 UT limit, trip, init;
1934 T chunk = pr->u.p.parm1;
1936 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1939 init = chunk * pr->u.p.count++;
1940 trip = pr->u.p.tc - 1;
1942 if ((status = (init <= trip)) == 0) {
1945 // if ( p_last != NULL )
1949 if (__kmp_env_consistency_check) {
1950 if (pr->pushed_ws != ct_none) {
1951 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1956 limit = chunk + init - 1;
1959 if ((last = (limit >= trip)) != 0) {
1962 pr->u.p.last_upper = pr->u.p.ub;
1963 #endif /* KMP_OS_WINDOWS */
1970 *p_lb = start + init;
1971 *p_ub = start + limit;
1973 *p_lb = start + init * incr;
1974 *p_ub = start + limit * incr;
1977 if (pr->flags.ordered) {
1978 pr->u.p.ordered_lower = init;
1979 pr->u.p.ordered_upper = limit;
1983 // create format specifiers before the debug output
1984 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1985 "ordered_lower:%%%s ordered_upper:%%%s\n",
1986 traits_t<UT>::spec, traits_t<UT>::spec);
1987 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1988 pr->u.p.ordered_upper));
1989 __kmp_str_free(&buff);
1999 pr->u.p.last_upper = *p_ub;
2000 #endif /* KMP_OS_WINDOWS */
2009 // create format specifiers before the debug output
2010 buff = __kmp_str_format(
2011 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2012 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2013 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2014 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2015 __kmp_str_free(&buff);
2018 #if INCLUDE_SSC_MARKS
2019 SSC_MARK_DISPATCH_NEXT();
2026 dispatch_shared_info_template<T> volatile *sh;
2028 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2029 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2031 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2032 th->th.th_dispatch->th_dispatch_pr_current);
2033 KMP_DEBUG_ASSERT(pr);
2034 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2035 th->th.th_dispatch->th_dispatch_sh_current);
2036 KMP_DEBUG_ASSERT(sh);
2038 #if KMP_USE_HIER_SCHED
2039 if (pr->flags.use_hier)
2040 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2042 #endif // KMP_USE_HIER_SCHED
2043 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2044 p_st, th->th.th_team_nproc,
2045 th->th.th_info.ds.ds_tid);
2046 // status == 0: no more iterations to execute
2050 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2054 // create format specifiers before the debug output
2055 buff = __kmp_str_format(
2056 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2057 traits_t<UT>::spec);
2058 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2059 __kmp_str_free(&buff);
2063 #if KMP_USE_HIER_SCHED
2064 pr->flags.use_hier = FALSE;
2066 if ((ST)num_done == th->th.th_team_nproc - 1) {
2067 #if (KMP_STATIC_STEAL_ENABLED)
2068 if (pr->schedule == kmp_sch_static_steal &&
2069 traits_t<T>::type_size > 4) {
2071 kmp_info_t **other_threads = team->t.t_threads;
2072 // loop complete, safe to destroy locks used for stealing
2073 for (i = 0; i < th->th.th_team_nproc; ++i) {
2074 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2075 KMP_ASSERT(lck != NULL);
2076 __kmp_destroy_lock(lck);
2078 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2082 /* NOTE: release this buffer to be reused */
2084 KMP_MB(); /* Flush all pending memory write invalidates. */
2086 sh->u.s.num_done = 0;
2087 sh->u.s.iteration = 0;
2089 /* TODO replace with general release procedure? */
2090 if (pr->flags.ordered) {
2091 sh->u.s.ordered_iteration = 0;
2094 KMP_MB(); /* Flush all pending memory write invalidates. */
2096 sh->buffer_index += __kmp_dispatch_num_buffers;
2097 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2098 gtid, sh->buffer_index));
2100 KMP_MB(); /* Flush all pending memory write invalidates. */
2103 if (__kmp_env_consistency_check) {
2104 if (pr->pushed_ws != ct_none) {
2105 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2109 th->th.th_dispatch->th_deo_fcn = NULL;
2110 th->th.th_dispatch->th_dxo_fcn = NULL;
2111 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2112 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2113 } // if (status == 0)
2116 pr->u.p.last_upper = pr->u.p.ub;
2118 #endif /* KMP_OS_WINDOWS */
2119 if (p_last != NULL && status != 0)
2126 // create format specifiers before the debug output
2127 buff = __kmp_str_format(
2128 "__kmp_dispatch_next: T#%%d normal case: "
2129 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2130 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2131 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2132 (p_last ? *p_last : 0), status));
2133 __kmp_str_free(&buff);
2136 #if INCLUDE_SSC_MARKS
2137 SSC_MARK_DISPATCH_NEXT();
2144 template <typename T>
2145 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2146 kmp_int32 *plastiter, T *plower, T *pupper,
2147 typename traits_t<T>::signed_t incr) {
2148 typedef typename traits_t<T>::unsigned_t UT;
2155 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2156 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2158 typedef typename traits_t<T>::signed_t ST;
2161 // create format specifiers before the debug output
2162 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2163 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2164 traits_t<T>::spec, traits_t<T>::spec,
2165 traits_t<ST>::spec, traits_t<T>::spec);
2166 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2167 __kmp_str_free(&buff);
2171 if (__kmp_env_consistency_check) {
2172 if (incr == 0) {
2173 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2174 loc);
2175 }
2176 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2177 // The loop is illegal.
2178 // Some zero-trip loops are kept by the compiler, e.g.:
2179 // for(i=10;i<0;++i) // lower >= upper - run-time check
2180 // for(i=0;i>10;--i) // lower <= upper - run-time check
2181 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2182 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2183 // Compiler does not check the following illegal loops:
2184 // for(i=0;i<10;i+=incr) // where incr<0
2185 // for(i=10;i>0;i-=incr) // where incr<0
2186 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2187 }
2188 }
2189 th = __kmp_threads[gtid];
2190 team = th->th.th_team;
2191 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2192 nteams = th->th.th_teams_size.nteams;
2193 team_id = team->t.t_master_tid;
2194 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2196 // compute global trip count
2197 if (incr == 1) {
2198 trip_count = *pupper - *plower + 1;
2199 } else if (incr == -1) {
2200 trip_count = *plower - *pupper + 1;
2201 } else if (incr > 0) {
2202 // upper-lower can exceed the limit of signed type
2203 trip_count = (UT)(*pupper - *plower) / incr + 1;
2204 } else {
2205 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2206 }
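// Example: lb = 0, ub = 9, incr = 2 gives trip_count = (9 - 0) / 2 + 1 = 5
// (iterations 0, 2, 4, 6, 8).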
2208 if (trip_count <= nteams) {
2209 KMP_DEBUG_ASSERT(
2210 __kmp_static == kmp_sch_static_greedy ||
2211 __kmp_static ==
2212 kmp_sch_static_balanced); // Unknown static scheduling type.
2213 // only some teams get a single iteration, others get nothing
2214 if (team_id < trip_count) {
2215 *pupper = *plower = *plower + team_id * incr;
2216 } else {
2217 *plower = *pupper + incr; // zero-trip loop
2218 }
2219 if (plastiter != NULL)
2220 *plastiter = (team_id == trip_count - 1);
2221 } else {
2222 if (__kmp_static == kmp_sch_static_balanced) {
2223 UT chunk = trip_count / nteams;
2224 UT extras = trip_count % nteams;
2225 *plower +=
2226 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2227 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2228 if (plastiter != NULL)
2229 *plastiter = (team_id == nteams - 1);
2230 } else {
2231 T chunk_inc_count =
2232 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2233 T upper = *pupper;
2234 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2235 // Unknown static scheduling type.
2236 *plower += team_id * chunk_inc_count;
2237 *pupper = *plower + chunk_inc_count - incr;
2238 // Check/correct bounds if needed
2239 if (incr > 0) {
2240 if (*pupper < *plower)
2241 *pupper = traits_t<T>::max_value;
2242 if (plastiter != NULL)
2243 *plastiter = *plower <= upper && *pupper > upper - incr;
2244 if (*pupper > upper)
2245 *pupper = upper; // tracker C73258
2246 } else {
2247 if (*pupper > *plower)
2248 *pupper = traits_t<T>::min_value;
2249 if (plastiter != NULL)
2250 *plastiter = *plower >= upper && *pupper < upper - incr;
2251 if (*pupper < upper)
2252 *pupper = upper; // tracker C73258
2253 }
2254 }
2255 }
2256 }
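/* Editorial example of the split above: trip_count = 10 distributed over
   nteams = 4 with __kmp_static == kmp_sch_static_balanced gives chunk = 2 and
   extras = 2, so teams 0 and 1 each receive 3 iterations and teams 2 and 3
   each receive 2. */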
2258 //-----------------------------------------------------------------------------
2259 // Dispatch routines
2260 // Transfer call to template< type T >
2261 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2262 // T lb, T ub, ST st, ST chunk )
2263 //-----------------------------------------------------------------------------
2265 /*!
2266 @ingroup WORK_SHARING
2267 @{
2268 @param loc Source location
2269 @param gtid Global thread id
2270 @param schedule Schedule type
2271 @param lb Lower bound
2272 @param ub Upper bound
2273 @param st Step (or increment if you prefer)
2274 @param chunk The chunk size to block with
2276 This function prepares the runtime to start a dynamically scheduled for loop,
2277 saving the loop arguments.
2278 These functions are all identical apart from the types of the arguments.
2280 */
2281 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2282 enum sched_type schedule, kmp_int32 lb,
2283 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2284 KMP_DEBUG_ASSERT(__kmp_init_serial);
2285 #if OMPT_SUPPORT && OMPT_OPTIONAL
2286 OMPT_STORE_RETURN_ADDRESS(gtid);
2287 #endif
2288 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2289 }
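/* Editorial sketch (not part of the runtime): a compiler would typically lower
   "#pragma omp for schedule(dynamic, chunk)" with a positive increment into a
   call sequence along these lines; all variable names are hypothetical:

     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st,
                            chunk);
     kmp_int32 lower, upper, stride, last;
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lower, &upper, &stride)) {
       for (kmp_int32 i = lower; i <= upper; i += stride) {
         // ... loop body ...
       }
     }

   For loops with an ordered clause the compiler additionally emits
   __kmpc_dispatch_fini_4() at the end of each chunk. */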
2291 See @ref __kmpc_dispatch_init_4
2293 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2294 enum sched_type schedule, kmp_uint32 lb,
2295 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2296 KMP_DEBUG_ASSERT(__kmp_init_serial);
2297 #if OMPT_SUPPORT && OMPT_OPTIONAL
2298 OMPT_STORE_RETURN_ADDRESS(gtid);
2300 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2304 See @ref __kmpc_dispatch_init_4
2306 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2307 enum sched_type schedule, kmp_int64 lb,
2308 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2309 KMP_DEBUG_ASSERT(__kmp_init_serial);
2310 #if OMPT_SUPPORT && OMPT_OPTIONAL
2311 OMPT_STORE_RETURN_ADDRESS(gtid);
2313 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2317 See @ref __kmpc_dispatch_init_4
2319 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2320 enum sched_type schedule, kmp_uint64 lb,
2321 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2322 KMP_DEBUG_ASSERT(__kmp_init_serial);
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324 OMPT_STORE_RETURN_ADDRESS(gtid);
2326 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2330 See @ref __kmpc_dispatch_init_4
2332 These differ from the __kmpc_dispatch_init family in that they are called
2333 for the composite "distribute parallel for" construct, so the per-team
2334 iteration space must be computed before the regular iteration dispatching.
2336 These functions are all identical apart from the types of the arguments.
2337 */
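/* Editorial sketch: for "#pragma omp distribute parallel for schedule(dynamic)"
   inside a teams region, a compiler typically calls one of these entry points
   instead of plain __kmpc_dispatch_init_*; __kmp_dist_get_bounds() first
   narrows (lb, ub) to the calling team's share of the iteration space, and the
   narrowed range is then dispatched dynamically within the team as usual. */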
2338 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2339 enum sched_type schedule, kmp_int32 *p_last,
2340 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2341 kmp_int32 chunk) {
2342 KMP_DEBUG_ASSERT(__kmp_init_serial);
2343 #if OMPT_SUPPORT && OMPT_OPTIONAL
2344 OMPT_STORE_RETURN_ADDRESS(gtid);
2346 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2347 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2350 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2351 enum sched_type schedule, kmp_int32 *p_last,
2352 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2353 kmp_int32 chunk) {
2354 KMP_DEBUG_ASSERT(__kmp_init_serial);
2355 #if OMPT_SUPPORT && OMPT_OPTIONAL
2356 OMPT_STORE_RETURN_ADDRESS(gtid);
2358 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2359 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2362 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2363 enum sched_type schedule, kmp_int32 *p_last,
2364 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2365 kmp_int64 chunk) {
2366 KMP_DEBUG_ASSERT(__kmp_init_serial);
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368 OMPT_STORE_RETURN_ADDRESS(gtid);
2370 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2371 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2374 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2375 enum sched_type schedule, kmp_int32 *p_last,
2376 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2377 kmp_int64 chunk) {
2378 KMP_DEBUG_ASSERT(__kmp_init_serial);
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380 OMPT_STORE_RETURN_ADDRESS(gtid);
2382 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2383 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2387 @param loc Source code location
2388 @param gtid Global thread id
2389 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2390 otherwise
2391 @param p_lb Pointer to the lower bound for the next chunk of work
2392 @param p_ub Pointer to the upper bound for the next chunk of work
2393 @param p_st Pointer to the stride for the next chunk of work
2394 @return one if there is work to be done, zero otherwise
2396 Get the next dynamically allocated chunk of work for this thread.
2397 If there is no more work, then lb, ub and stride need not be modified.
2398 */
2399 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2400 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2401 #if OMPT_SUPPORT && OMPT_OPTIONAL
2402 OMPT_STORE_RETURN_ADDRESS(gtid);
2404 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2405 #if OMPT_SUPPORT && OMPT_OPTIONAL
2407 OMPT_LOAD_RETURN_ADDRESS(gtid)
2413 See @ref __kmpc_dispatch_next_4
2415 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2416 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2417 kmp_uint32 *p_st) {
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
2419 OMPT_STORE_RETURN_ADDRESS(gtid);
2421 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2422 #if OMPT_SUPPORT && OMPT_OPTIONAL
2424 OMPT_LOAD_RETURN_ADDRESS(gtid)
2430 See @ref __kmpc_dispatch_next_4
2432 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2433 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435 OMPT_STORE_RETURN_ADDRESS(gtid);
2437 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2438 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440 OMPT_LOAD_RETURN_ADDRESS(gtid)
2446 See @ref __kmpc_dispatch_next_4
2448 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2449 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2450 kmp_uint64 *p_st) {
2451 #if OMPT_SUPPORT && OMPT_OPTIONAL
2452 OMPT_STORE_RETURN_ADDRESS(gtid);
2454 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2455 #if OMPT_SUPPORT && OMPT_OPTIONAL
2457 OMPT_LOAD_RETURN_ADDRESS(gtid)
2463 @param loc Source code location
2464 @param gtid Global thread id
2466 Mark the end of a dynamic loop.
2467 */
2468 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2469 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2470 }
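// Editorial note: the __kmpc_dispatch_fini_* entry points all forward to the
// templated __kmp_dispatch_finish<>(), which performs the per-chunk completion
// bookkeeping used when the loop has an ordered clause.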
2473 See @ref __kmpc_dispatch_fini_4
2475 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2476 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2480 See @ref __kmpc_dispatch_fini_4
2482 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2483 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2487 See @ref __kmpc_dispatch_fini_4
2489 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2490 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2494 //-----------------------------------------------------------------------------
2495 // Non-template routines from kmp_dispatch.cpp used in other sources
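// The 32-bit comparison predicates below have the signature expected by
// __kmp_wait_4() and are also used as wait predicates elsewhere in the
// runtime.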
2497 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2498 return value == checker;
2501 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2502 return value != checker;
2505 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2506 return value < checker;
2509 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2510 return value >= checker;
2513 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2514 return value <= checker;
2518 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2519 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2520 void *obj // Higher-level synchronization object, or NULL.
2522 // note: we may not belong to a team at this point
2523 volatile kmp_uint32 *spin = spinner;
2524 kmp_uint32 check = checker;
2526 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2529 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2530 KMP_INIT_YIELD(spins);
2531 // main wait spin loop
2532 while (!f(r = TCR_4(*spin), check)) {
2533 KMP_FSYNC_SPIN_PREPARE(obj);
2534 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2535 split. It causes problems with infinite recursion because of exit lock */
2536 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2537 __kmp_abort_thread(); */
2538 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2540 KMP_FSYNC_SPIN_ACQUIRED(obj);
2541 return r;
2542 }
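/* Editorial usage sketch (hypothetical variable): spin until a 32-bit flag
   written by another thread reaches the value 1, using one of the predicates
   above:

     kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);
*/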
2544 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2545 kmp_uint32 (*pred)(void *, kmp_uint32),
2546 void *obj // Higher-level synchronization object, or NULL.
2548 // note: we may not belong to a team at this point
2549 void *spin = spinner;
2550 kmp_uint32 check = checker;
2552 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2554 KMP_FSYNC_SPIN_INIT(obj, spin);
2555 KMP_INIT_YIELD(spins);
2556 // main wait spin loop
2557 while (!f(spin, check)) {
2558 KMP_FSYNC_SPIN_PREPARE(obj);
2559 /* if we have waited a bit, or are oversubscribed, yield */
2560 /* pause is in the following code */
2561 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2563 KMP_FSYNC_SPIN_ACQUIRED(obj);
2568 #ifdef KMP_GOMP_COMPAT
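// Thin wrappers used by the GNU OpenMP (libgomp) compatibility layer; each
// forwards to the templated __kmp_dispatch_init<> or
// __kmp_dispatch_finish_chunk<> implementation defined earlier in this file.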
2570 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2571 enum sched_type schedule, kmp_int32 lb,
2572 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2573 int push_ws) {
2574 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2575 push_ws);
2576 }
2578 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2579 enum sched_type schedule, kmp_uint32 lb,
2580 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2581 int push_ws) {
2582 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2583 push_ws);
2584 }
2586 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2587 enum sched_type schedule, kmp_int64 lb,
2588 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2589 int push_ws) {
2590 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2591 push_ws);
2592 }
2594 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2595 enum sched_type schedule, kmp_uint64 lb,
2596 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2597 int push_ws) {
2598 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2599 push_ws);
2600 }
2602 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2603 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2606 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2607 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2610 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2611 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2614 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2615 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2618 #endif /* KMP_GOMP_COMPAT */
2620 /* ------------------------------------------------------------------------ */