/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub
  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */
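
// Usage sketch (illustrative only, not part of the runtime): these templates
// are instantiated for the loop index types, relying on the runtime's
// traits_t<T> to supply the matching unsigned/signed companion types, e.g.
//
//   dispatch_private_infoXX_template<kmp_int32> p32; // UT = kmp_uint32, ST = kmp_int32
//   dispatch_private_infoXX_template<kmp_int64> p64; // UT = kmp_uint64, ST = kmp_int64
//   p64.lb = 0; p64.ub = 999; p64.st = 1; p64.tc = 1000; // a 1000-iteration loop
//
// For T = kmp_int64 the KMP_ALIGN(32) block above is exactly 4 * 8 = 32 bytes
// and 32-byte aligned, so parm1-parm4 can never straddle a 64-byte cache line,
// which is the property the comment in the struct is describing.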
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  kmp_uint32 ordered_bumped;
  // to retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};
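
// Usage sketch (illustrative only, heavily simplified from the real algorithms
// in kmp_dispatch.cpp): each thread owns a dispatch_private_info_template<T>
// describing its view of the loop, while the team shares a set of
// dispatch_shared_info_template<T> buffers selected by buffer_index. Under a
// plain dynamic schedule a thread could claim its next chunk roughly like
// this, using the atomic helpers defined below:
//
//   dispatch_private_info_template<kmp_int32> *pr = /* this thread's private descriptor */;
//   dispatch_shared_info_template<kmp_int32> *sh = /* the team's shared buffer */;
//   kmp_int32 chunk_idx = test_then_inc<kmp_int32>(
//       (volatile kmp_int32 *)&sh->u.s.iteration);   // claim a chunk index
//   kmp_int32 chunk = pr->u.p.parm1;                 // chunk size for dynamic
//   kmp_int32 lb = pr->u.p.lb + chunk_idx * chunk * pr->u.p.st;
//   kmp_int32 ub = lb + (chunk - 1) * pr->u.p.st;
//
// The real __kmp_dispatch_next_algorithm additionally handles trip-count
// exhaustion, ordered bookkeeping, and the other schedule kinds.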
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}
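
// Usage sketch (illustrative only): test_then_add/test_then_inc return the
// value the target held *before* the update, while compare_and_swap returns a
// non-zero flag on success. A hypothetical claim of `batch` iterations from a
// shared counter, capped at `limit`, could be written as:
//
//   volatile kmp_int64 next = 0;                  // shared position
//   kmp_int64 limit = 1000, batch = 8;
//   kmp_int64 old_v = next;
//   while (old_v < limit &&
//          !compare_and_swap<kmp_int64>(&next, old_v, old_v + batch)) {
//     old_v = next;                               // lost the race; retry
//   }
//   // on success this thread owns iterations [old_v, min(old_v + batch, limit))
//
// __kmp_ge / __kmp_eq are the comparison predicates passed to __kmp_wait()
// (defined below) when spinning until a shared index reaches a given value.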
/* Spin wait loop that pauses between checks.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
   Arguments:
       UT is unsigned 4- or 8-byte type
       spinner - memory location to check value
       checker - value which spinner is >, <, ==, etc.
       pred - predicate function to perform binary comparison of some sort
#if USE_ITT_BUILD
       obj -- is higher-level synchronization object to report to ittnotify.
       It is used to report locks consistently. For example, if the lock is
       acquired immediately, its address is reported to ittnotify via
       KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
       and the lock routine calls KMP_WAIT(), the latter should report the same
       address, not an address of the low-level spinner.
#endif // USE_ITT_BUILD
   TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock. */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
         __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int  cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer. */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken, so
   that if we __forceinline this function the behavior is wrong (one of the
   unit tests, sch_guided_analytical_basic.cpp, fails). */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we workaround that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
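
// Worked example (illustrative only): for a loop of tc = 1000 iterations on
// nproc = 4 threads, the guided-analytical setup in kmp_dispatch.cpp derives a
// base on the order of 1 - 1/(2 * nproc) = 0.875 (the exact expression lives
// in the caller). Then
//
//   __kmp_dispatch_guided_remaining<kmp_int32>(1000, 0.875, 3)
//     == ceil(1000 * 0.875^3) == ceil(669.921875) == 670,
//
// i.e. roughly 670 iterations remain unassigned after the first three chunks.
// __kmp_pow evaluates 0.875^3 by square-and-multiply rather than powl(),
// since idx is an integer chunk index.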
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunks distribution will be
// more flat. With n = 1 the first chunk is the same as for a static schedule,
// e.g. trip / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
#endif // KMP_DISPATCH_H