// kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
5 //===----------------------------------------------------------------------===//
7 // The LLVM Compiler Infrastructure
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
12 //===----------------------------------------------------------------------===//
14 #ifndef KMP_DISPATCH_H
15 #define KMP_DISPATCH_H
17 /* ------------------------------------------------------------------------ */
18 /* ------------------------------------------------------------------------ */
21 #include "kmp_error.h"
24 #include "kmp_stats.h"
26 #if KMP_OS_WINDOWS && KMP_ARCH_X86
31 #include "ompt-internal.h"
32 #include "ompt-specific.h"
35 /* ------------------------------------------------------------------------ */
36 /* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

// Forward declarations of the templated dispatch buffer types defined below.
template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;
// Initialize the per-thread dispatch record 'pr' for a dynamically scheduled
// loop: index type T, bounds lb/ub, stride st, chunk size 'chunk', scheduling
// kind 'schedule'. cur_chunk is presumably an out-param for the computed
// chunk size (stats/profiling) -- TODO confirm against the definition.
// NOTE(review): the 'template <typename T>' header and trailing parameters of
// this declaration fall outside this excerpt.
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
dispatch_private_info_template<T> *pr,
enum sched_type schedule, T lb, T ub,
typename traits_t<T>::signed_t st,
kmp_uint64 *cur_chunk,
typename traits_t<T>::signed_t chunk,

// Fetch the next chunk of iterations for thread 'gtid' from the private
// record 'pr' and team-shared state 'sh'. Writes the chunk bounds to
// *p_lb/*p_ub and the stride to *p_st; *p_last is set when this is the last
// chunk. nproc/unit_id parameterize the per-team partitioning.
// Return value semantics (non-zero while work remains) -- confirm in the
// definition; only the declaration is visible here.
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

// Error-reporting variants of the ordered enter/exit hooks (names suggest
// they are installed when the dispatch buffer is in an invalid state --
// verify against kmp_dispatch.cpp).
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
#if KMP_STATIC_STEAL_ENABLED

// Per-thread (private) dispatch bookkeeping, templated on the loop index type.
// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
// This variant carries the extra counter used by the static_steal schedule.
// NOTE(review): interior members (count/ub/lb/st/tc, parm1-4, closing braces)
// fall outside this excerpt.
template <typename T> struct dispatch_private_infoXX_template {
typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
typedef typename traits_t<T>::signed_t ST; // signed flavor of T
/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
T static_steal_counter; // for static_steal only; maybe better to put after ub
/* parm[1-4] are used in different ways by different scheduling algorithms */
// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
UT ordered_lower; // unsigned; lower bound for ordered-section sequencing
UT ordered_upper; // unsigned; upper bound for ordered-section sequencing
#endif /* KMP_OS_WINDOWS */
#else /* KMP_STATIC_STEAL_ENABLED */

// Per-thread (private) dispatch bookkeeping, templated on the loop index type.
// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
// Variant without static-steal support.
// NOTE(review): interior members (lb/ub/st/tc, parm1-4, closing braces) fall
// outside this excerpt.
template <typename T> struct dispatch_private_infoXX_template {
typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
typedef typename traits_t<T>::signed_t ST; // signed flavor of T
UT count; // unsigned
UT ordered_lower; // unsigned; lower bound for ordered-section sequencing
UT ordered_upper; // unsigned; upper bound for ordered-section sequencing
#endif /* KMP_OS_WINDOWS */

#endif /* KMP_STATIC_STEAL_ENABLED */
// Per-thread dispatch record: the type-templated private info above plus
// schedule metadata. Templated replacement for dispatch_private_info /
// dispatch_private_info_t.
// NOTE(review): some members and closing braces fall outside this excerpt.
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
// duplicate alignment here, otherwise size of structure is not correct in our
union KMP_ALIGN_CACHE private_info_tmpl {
dispatch_private_infoXX_template<T> p; // typed view
dispatch_private_info64_t p64; // raw view; keeps the union size fixed
enum sched_type schedule; /* scheduling algorithm */
kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
kmp_uint32 ordered_bumped; // incremented on ordered exit (__kmp_dispatch_dxo)
// to retain the structure size after making order
kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
dispatch_private_info *next; /* stack of buffers for nest of serial regions */
kmp_uint32 type_size; // presumably sizeof the index type used -- TODO confirm
#if KMP_USE_HIER_SCHED
kmp_hier_top_unit_t<T> *hier_parent; // parent unit in scheduling hierarchy
// Accessors for hierarchical scheduling state.
kmp_int32 get_hier_id() const { return hier_id; }
kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
enum cons_type pushed_ws;
// Team-shared dispatch state, templated on the (unsigned) index type.
// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
typedef typename traits_t<T>::unsigned_t UT;
/* chunk index under dynamic, number of idle threads under static-steal;
iteration index otherwise */
volatile UT iteration;
volatile UT num_done; // presumably # of threads finished with this loop -- TODO confirm
volatile UT ordered_iteration; // ticket bumped by __kmp_dispatch_dxo, awaited by _deo
// to retain the structure size making ordered_iteration scalar
UT ordered_dummy[KMP_MAX_ORDERED - 3];
// replaces dispatch_shared_info structure and dispatch_shared_info_t type
// NOTE(review): some members (e.g. hier pointer, padding array) and closing
// braces fall outside this excerpt.
template <typename T> struct dispatch_shared_info_template {
typedef typename traits_t<T>::unsigned_t UT;
// we need union here to keep the structure size
union shared_info_tmpl {
dispatch_shared_infoXX_template<UT> s; // typed view
dispatch_shared_info64_t s64; // raw view; keeps the union size fixed
volatile kmp_uint32 buffer_index; // which cyclic dispatch buffer is current
volatile kmp_int32 doacross_buf_idx; // teamwise index
kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
// When linking with libhwloc, the ORDERED EPCC test slows down on big
// machines (> 48 cores). Performance analysis showed that a cache thrash
// was occurring and this padding helps alleviate the problem.
191 /* ------------------------------------------------------------------------ */
192 /* ------------------------------------------------------------------------ */
194 #undef USE_TEST_LOCKS
// test_then_add template (general template should NOT be used)
// Atomic fetch-and-add: presumably returns the value of *p prior to adding d
// (KMP_TEST_THEN_ADD{32,64}) -- confirm against the macro definitions in kmp.h.
// NOTE(review): the 'template <>' headers, parameter tails, local declaration
// of 'r' and the returns fall outside this excerpt.
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
r = KMP_TEST_THEN_ADD32(p, d);

__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
r = KMP_TEST_THEN_ADD64(p, d);
// test_then_inc_acq template (general template should NOT be used)
// Atomic increment with acquire ordering; presumably returns the prior value
// (KMP_TEST_THEN_INC_ACQ{32,64}) -- confirm against the macro definitions.
// NOTE(review): 'template <>' headers and function tails fall outside this
// excerpt.
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
r = KMP_TEST_THEN_INC_ACQ32(p);

__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
r = KMP_TEST_THEN_INC_ACQ64(p);
// test_then_inc template (general template should NOT be used)
// Atomic increment (relaxed counterpart of test_then_inc_acq above); used by
// __kmp_dispatch_dxo to bump the shared ordered_iteration ticket.
// NOTE(review): 'template <>' headers and function tails fall outside this
// excerpt.
template <typename T> static __forceinline T test_then_inc(volatile T *p);

__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
r = KMP_TEST_THEN_INC32(p);

__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
r = KMP_TEST_THEN_INC64(p);
// compare_and_swap template (general template should NOT be used)
// Presumably returns non-zero iff *p equaled c and was replaced by s, with
// release ordering (KMP_COMPARE_AND_STORE_REL{32,64}) -- confirm against the
// macro definitions in kmp.h.
// NOTE(review): 'template <>' headers and closing braces fall outside this
// excerpt. Note the second specialization is for kmp_int64 despite taking
// the <kmp_int64> argument on a kmp_int32-returning function -- intentional
// in the original source.
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
kmp_int32 c, kmp_int32 s) {
return KMP_COMPARE_AND_STORE_REL32(p, c, s);

__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
kmp_int64 c, kmp_int64 s) {
return KMP_COMPARE_AND_STORE_REL64(p, c, s);
265 template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
266 return value >= checker;
268 template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
269 return value == checker;
273 Spin wait loop that first does pause, then yield.
274 Waits until function returns non-zero when called with *spinner and check.
275 Does NOT put threads to sleep.
277 UT is unsigned 4- or 8-byte type
278 spinner - memory location to check value
279 checker - value which spinner is >, <, ==, etc.
280 pred - predicate function to perform binary comparison of some sort
282 obj -- is higher-level synchronization object to report to ittnotify. It
283 is used to report locks consistently. For example, if lock is acquired
284 immediately, its address is reported to ittnotify via
KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
287 same address, not an address of low-level spinner.
288 #endif // USE_ITT_BUILD
289 TODO: make inline function (move to header file for icl)
// Spin-wait until pred(*spinner, checker) returns non-zero, yielding the CPU
// while spinning (never puts the thread to sleep); see the comment block
// above for parameter semantics. Presumably returns the last value read from
// *spinner -- the return statement falls outside this excerpt.
// NOTE(review): declarations of r/check/spins and the closing braces also
// fall outside this excerpt.
template <typename UT>
static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
kmp_uint32 (*pred)(UT, UT)
USE_ITT_BUILD_ARG(void *obj)) {
// note: we may not belong to a team at this point
volatile UT *spin = spinner;
kmp_uint32 (*f)(UT, UT) = pred;
KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
KMP_INIT_YIELD(spins);
// main wait spin loop
while (!f(r = *spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
/* GEH - remove this since it was accidentally introduced when kmp_wait was
It causes problems with infinite recursion because of exit lock */
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
// if we are oversubscribed,
// or have waited a bit (and KMP_LIBRARY=throughput, then yield
// pause is in the following code
KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
KMP_YIELD_SPIN(spins);
KMP_FSYNC_SPIN_ACQUIRED(obj);
323 /* ------------------------------------------------------------------------ */
324 /* ------------------------------------------------------------------------ */
// Ordered-section ENTRY hook for dynamically scheduled loops ("deo" =
// dispatch enter ordered). Spin-waits until the team-shared ordered ticket
// (sh->u.s.ordered_iteration) reaches this thread's chunk lower bound, so
// ordered sections execute in iteration order.
// NOTE(review): declarations of 'lower' and 'buff', several #else/#endif
// lines and closing braces fall outside this excerpt.
template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
dispatch_private_info_template<UT> *pr;
int gtid = *gtid_ref;
// int cid = *cid_ref;
kmp_info_t *th = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(th->th.th_dispatch);
KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
if (__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
__kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
__kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
// Nothing to synchronize with when the team is serialized.
if (!th->th.th_team->t.t_serialized) {
dispatch_shared_info_template<UT> *sh =
reinterpret_cast<dispatch_shared_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_sh_current);
if (!__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
lower = pr->u.p.ordered_lower;
#if !defined(KMP_GOMP_COMPAT)
if (__kmp_env_consistency_check) {
// Re-entering the ordered section for the same chunk is a user error.
if (pr->ordered_bumped) {
struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
__kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
ct_ordered_in_pdo, loc_ref,
&p->stack_data[p->w_top]);
#endif /* !defined(KMP_GOMP_COMPAT) */
// create format specifiers before the debug output
buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
"ordered_iter:%%%s lower:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
__kmp_str_free(&buff);
// Wait until the shared ordered ticket catches up to our lower bound.
__kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
KMP_MB(); /* is this necessary? */
// create format specifiers before the debug output
buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
"ordered_iter:%%%s lower:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
__kmp_str_free(&buff);
KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
// Ordered-section EXIT hook ("dxo" = dispatch exit ordered). Records that
// this chunk's ordered section completed (pr->ordered_bumped) and atomically
// bumps the team-shared ticket (sh->u.s.ordered_iteration) to release the
// next thread waiting in __kmp_dispatch_deo.
// NOTE(review): several #else/#endif lines, a KD_TRACE header and closing
// braces fall outside this excerpt.
template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
typedef typename traits_t<UT>::signed_t ST;
dispatch_private_info_template<UT> *pr;
int gtid = *gtid_ref;
// int cid = *cid_ref;
kmp_info_t *th = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(th->th.th_dispatch);
KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
if (__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
if (pr->pushed_ws != ct_none) {
__kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
// Nothing to release when the team is serialized.
if (!th->th.th_team->t.t_serialized) {
dispatch_shared_info_template<UT> *sh =
reinterpret_cast<dispatch_shared_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_sh_current);
if (!__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
if (__kmp_env_consistency_check) {
// Exiting an ordered section that was already exited is a user error.
if (pr->ordered_bumped != 0) {
struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
/* How to test it? - OM */
__kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
ct_ordered_in_pdo, loc_ref,
&p->stack_data[p->w_top]);
#endif /* !defined(KMP_GOMP_COMPAT) */
KMP_MB(); /* Flush all pending memory write invalidates. */
pr->ordered_bumped += 1;
("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
gtid, pr->ordered_bumped));
KMP_MB(); /* Flush all pending memory write invalidates. */
/* TODO use general release procedure? */
test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
KMP_MB(); /* Flush all pending memory write invalidates. */
KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
/* Computes and returns x to the power of y, where y must be a non-negative
   integer. */
// NOTE(review): the loop body and return fall outside this excerpt.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
long double s = 1.0L; // running product
// Only bases in (0, 1) are expected (guided-schedule decay factors).
KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
// KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
/* Computes and returns the number of unassigned iterations after idx chunks
   (the total number of unassigned iterations in chunks with index greater than
   or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function,
   the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
// NOTE(review): the tail of the function (rounding/return) falls outside this
// excerpt.
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
typename traits_t<T>::unsigned_t idx) {
/* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
least for ICL 8.1, long double arithmetic may not really have
long double precision, even with /Qlong_double. Currently, we
workaround that in the caller code, by manipulating the FPCW for
Windows* OS on IA-32 architecture. The lack of precision is not
expected to be a correctness issue, though. */
typedef typename traits_t<T>::unsigned_t UT;
// remaining = tc * base^idx: each guided chunk leaves a 'base' fraction of
// the previous remainder unassigned.
long double x = tc * __kmp_pow<UT>(base, idx);
// Parameters of the guided-iterative algorithm:
// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunks distribution will be more
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
static const int guided_int_param = 2; // the 'n' referenced above
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
514 #endif // KMP_DISPATCH_H