contrib/llvm-project/openmp/runtime/src/kmp_dispatch.h

   1 /*
   2  * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #ifndef KMP_DISPATCH_H
  14 #define KMP_DISPATCH_H
  15
  16 /* ------------------------------------------------------------------------ */
  17 /* ------------------------------------------------------------------------ */
  18
  19 #include "kmp.h"
  20 #include "kmp_error.h"
  21 #include "kmp_i18n.h"
  22 #include "kmp_itt.h"
  23 #include "kmp_stats.h"
  24 #include "kmp_str.h"
  25 #if KMP_OS_WINDOWS && KMP_ARCH_X86
  26 #include <float.h>
  27 #endif
  28
  29 #if OMPT_SUPPORT
  30 #include "ompt-internal.h"
  31 #include "ompt-specific.h"
  32 #endif
  33
  34 /* ------------------------------------------------------------------------ */
  35 /* ------------------------------------------------------------------------ */
  36 #if KMP_USE_HIER_SCHED
  37 // Forward declarations of some hierarchical scheduling data structures
  38 template <typename T> struct kmp_hier_t;
  39 template <typename T> struct kmp_hier_top_unit_t;
  40 #endif // KMP_USE_HIER_SCHED
  41
  42 template <typename T> struct dispatch_shared_info_template;
  43 template <typename T> struct dispatch_private_info_template;
  44
  45 template <typename T>
  46 extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
  47                                           dispatch_private_info_template<T> *pr,
  48                                           enum sched_type schedule, T lb, T ub,
  49                                           typename traits_t<T>::signed_t st,
  50 #if USE_ITT_BUILD
  51                                           kmp_uint64 *cur_chunk,
  52 #endif
  53                                           typename traits_t<T>::signed_t chunk,
  54                                           T nproc, T unit_id);
  55 template <typename T>
  56 extern int __kmp_dispatch_next_algorithm(
  57     int gtid, dispatch_private_info_template<T> *pr,
  58     dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
  59     T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
  60
  61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
  62 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
  63
  64 #if KMP_STATIC_STEAL_ENABLED
  65
  66 // replaces dispatch_private_info{32,64} structures and
  67 // dispatch_private_info{32,64}_t types
  68 template <typename T> struct dispatch_private_infoXX_template {
  69   typedef typename traits_t<T>::unsigned_t UT;
  70   typedef typename traits_t<T>::signed_t ST;
  71   UT count; // unsigned
  72   T ub;
  73   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  74   T lb;
  75   ST st; // signed
  76   UT tc; // unsigned
  77   T static_steal_counter; // for static_steal only; maybe better to put after ub
  78
  79   /* parm[1-4] are used in different ways by different scheduling algorithms */
  80
  81   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  82   //    a) parm3 is properly aligned and
  83   //    b) all parm1-4 are in the same cache line.
  84   // Because of parm1-4 are used together, performance seems to be better
  85   // if they are in the same line (not measured though).
  86
  87   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
  88     T parm1;
  89     T parm2;
  90     T parm3;
  91     T parm4;
  92   };
  93
  94   UT ordered_lower; // unsigned
  95   UT ordered_upper; // unsigned
  96 #if KMP_OS_WINDOWS
  97   T last_upper;
  98 #endif /* KMP_OS_WINDOWS */
  99 };
 100
 101 #else /* KMP_STATIC_STEAL_ENABLED */
 102
 103 // replaces dispatch_private_info{32,64} structures and
 104 // dispatch_private_info{32,64}_t types
 105 template <typename T> struct dispatch_private_infoXX_template {
 106   typedef typename traits_t<T>::unsigned_t UT;
 107   typedef typename traits_t<T>::signed_t ST;
 108   T lb;
 109   T ub;
 110   ST st; // signed
 111   UT tc; // unsigned
 112
 113   T parm1;
 114   T parm2;
 115   T parm3;
 116   T parm4;
 117
 118   UT count; // unsigned
 119
 120   UT ordered_lower; // unsigned
 121   UT ordered_upper; // unsigned
 122 #if KMP_OS_WINDOWS
 123   T last_upper;
 124 #endif /* KMP_OS_WINDOWS */
 125 };
 126 #endif /* KMP_STATIC_STEAL_ENABLED */
 127
 128 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
 129   // duplicate alignment here, otherwise size of structure is not correct in our
 130   // compiler
 131   union KMP_ALIGN_CACHE private_info_tmpl {
 132     dispatch_private_infoXX_template<T> p;
 133     dispatch_private_info64_t p64;
 134   } u;
 135   enum sched_type schedule; /* scheduling algorithm */
 136   kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
 137   kmp_uint32 ordered_bumped;
 138   // to retain the structure size after making order
 139   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
 140   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
 141   kmp_uint32 type_size;
 142 #if KMP_USE_HIER_SCHED
 143   kmp_int32 hier_id;
 144   kmp_hier_top_unit_t<T> *hier_parent;
 145   // member functions
 146   kmp_int32 get_hier_id() const { return hier_id; }
 147   kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
 148 #endif
 149   enum cons_type pushed_ws;
 150 };
 151
 152 // replaces dispatch_shared_info{32,64} structures and
 153 // dispatch_shared_info{32,64}_t types
 154 template <typename T> struct dispatch_shared_infoXX_template {
 155   typedef typename traits_t<T>::unsigned_t UT;
 156   /* chunk index under dynamic, number of idle threads under static-steal;
 157      iteration index otherwise */
 158   volatile UT iteration;
 159   volatile UT num_done;
 160   volatile UT ordered_iteration;
 161   // to retain the structure size making ordered_iteration scalar
 162   UT ordered_dummy[KMP_MAX_ORDERED - 3];
 163 };
 164
 165 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
 166 template <typename T> struct dispatch_shared_info_template {
 167   typedef typename traits_t<T>::unsigned_t UT;
 168   // we need union here to keep the structure size
 169   union shared_info_tmpl {
 170     dispatch_shared_infoXX_template<UT> s;
 171     dispatch_shared_info64_t s64;
 172   } u;
 173   volatile kmp_uint32 buffer_index;
 174   volatile kmp_int32 doacross_buf_idx; // teamwise index
 175   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
 176   kmp_int32 doacross_num_done; // count finished threads
 177 #if KMP_USE_HIER_SCHED
 178   kmp_hier_t<T> *hier;
 179 #endif
 180 #if KMP_USE_HWLOC
 181   // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
 182   // machines (> 48 cores). Performance analysis showed that a cache thrash
 183   // was occurring and this padding helps alleviate the problem.
 184   char padding[64];
 185 #endif
 186 };
 187
 188 /* ------------------------------------------------------------------------ */
 189 /* ------------------------------------------------------------------------ */
 190
 191 #undef USE_TEST_LOCKS
 192
 193 // test_then_add template (general template should NOT be used)
 194 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
 195
 196 template <>
 197 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
 198                                                  kmp_int32 d) {
 199   kmp_int32 r;
 200   r = KMP_TEST_THEN_ADD32(p, d);
 201   return r;
 202 }
 203
 204 template <>
 205 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
 206                                                  kmp_int64 d) {
 207   kmp_int64 r;
 208   r = KMP_TEST_THEN_ADD64(p, d);
 209   return r;
 210 }
 211
 212 // test_then_inc_acq template (general template should NOT be used)
 213 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
 214
 215 template <>
 216 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
 217   kmp_int32 r;
 218   r = KMP_TEST_THEN_INC_ACQ32(p);
 219   return r;
 220 }
 221
 222 template <>
 223 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
 224   kmp_int64 r;
 225   r = KMP_TEST_THEN_INC_ACQ64(p);
 226   return r;
 227 }
 228
 229 // test_then_inc template (general template should NOT be used)
 230 template <typename T> static __forceinline T test_then_inc(volatile T *p);
 231
 232 template <>
 233 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
 234   kmp_int32 r;
 235   r = KMP_TEST_THEN_INC32(p);
 236   return r;
 237 }
 238
 239 template <>
 240 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
 241   kmp_int64 r;
 242   r = KMP_TEST_THEN_INC64(p);
 243   return r;
 244 }
 245
 246 // compare_and_swap template (general template should NOT be used)
 247 template <typename T>
 248 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
 249
 250 template <>
 251 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
 252                                                     kmp_int32 c, kmp_int32 s) {
 253   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
 254 }
 255
 256 template <>
 257 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
 258                                                     kmp_int64 c, kmp_int64 s) {
 259   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
 260 }
 261
 262 template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
 263   return value >= checker;
 264 }
 265 template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
 266   return value == checker;
 267 }
 268
 269 /*
 270     Spin wait loop that pauses between checks.
 271     Waits until function returns non-zero when called with *spinner and check.
 272     Does NOT put threads to sleep.
 273     Arguments:
 274         UT is unsigned 4- or 8-byte type
 275         spinner - memory location to check value
 276         checker - value which spinner is >, <, ==, etc.
 277         pred - predicate function to perform binary comparison of some sort
 278 #if USE_ITT_BUILD
 279         obj -- is higher-level synchronization object to report to ittnotify. It
 280         is used to report locks consistently. For example, if lock is acquired
 281         immediately, its address is reported to ittnotify via
 282         KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
 283         and lock routine calls to KMP_WAIT(), the later should report the
 284         same address, not an address of low-level spinner.
 285 #endif // USE_ITT_BUILD
 286     TODO: make inline function (move to header file for icl)
 287 */
 288 template <typename UT>
 289 static UT __kmp_wait(volatile UT *spinner, UT checker,
 290                      kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
 291   // note: we may not belong to a team at this point
 292   volatile UT *spin = spinner;
 293   UT check = checker;
 294   kmp_uint32 spins;
 295   kmp_uint32 (*f)(UT, UT) = pred;
 296   UT r;
 297
 298   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
 299   KMP_INIT_YIELD(spins);
 300   // main wait spin loop
 301   while (!f(r = *spin, check)) {
 302     KMP_FSYNC_SPIN_PREPARE(obj);
 303     /* GEH - remove this since it was accidentally introduced when kmp_wait was
 304        split.
 305        It causes problems with infinite recursion because of exit lock */
 306     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
 307         __kmp_abort_thread(); */
 308     // If oversubscribed, or have waited a bit then yield.
 309     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
 310   }
 311   KMP_FSYNC_SPIN_ACQUIRED(obj);
 312   return r;
 313 }
 314
 315 /* ------------------------------------------------------------------------ */
 316 /* ------------------------------------------------------------------------ */
 317
 318 template <typename UT>
 319 void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 320   dispatch_private_info_template<UT> *pr;
 321
 322   int gtid = *gtid_ref;
 323   //    int  cid = *cid_ref;
 324   kmp_info_t *th = __kmp_threads[gtid];
 325   KMP_DEBUG_ASSERT(th->th.th_dispatch);
 326
 327   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
 328   if (__kmp_env_consistency_check) {
 329     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 330         th->th.th_dispatch->th_dispatch_pr_current);
 331     if (pr->pushed_ws != ct_none) {
 332 #if KMP_USE_DYNAMIC_LOCK
 333       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
 334 #else
 335       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
 336 #endif
 337     }
 338   }
 339
 340   if (!th->th.th_team->t.t_serialized) {
 341     dispatch_shared_info_template<UT> *sh =
 342         reinterpret_cast<dispatch_shared_info_template<UT> *>(
 343             th->th.th_dispatch->th_dispatch_sh_current);
 344     UT lower;
 345
 346     if (!__kmp_env_consistency_check) {
 347       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 348           th->th.th_dispatch->th_dispatch_pr_current);
 349     }
 350     lower = pr->u.p.ordered_lower;
 351
 352 #if !defined(KMP_GOMP_COMPAT)
 353     if (__kmp_env_consistency_check) {
 354       if (pr->ordered_bumped) {
 355         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
 356         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
 357                                ct_ordered_in_pdo, loc_ref,
 358                                &p->stack_data[p->w_top]);
 359       }
 360     }
 361 #endif /* !defined(KMP_GOMP_COMPAT) */
 362
 363     KMP_MB();
 364 #ifdef KMP_DEBUG
 365     {
 366       char *buff;
 367       // create format specifiers before the debug output
 368       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
 369                               "ordered_iter:%%%s lower:%%%s\n",
 370                               traits_t<UT>::spec, traits_t<UT>::spec);
 371       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
 372       __kmp_str_free(&buff);
 373     }
 374 #endif
 375     __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
 376                    __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
 377     KMP_MB(); /* is this necessary? */
 378 #ifdef KMP_DEBUG
 379     {
 380       char *buff;
 381       // create format specifiers before the debug output
 382       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
 383                               "ordered_iter:%%%s lower:%%%s\n",
 384                               traits_t<UT>::spec, traits_t<UT>::spec);
 385       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
 386       __kmp_str_free(&buff);
 387     }
 388 #endif
 389   }
 390   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
 391 }
 392
 393 template <typename UT>
 394 void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
 395   typedef typename traits_t<UT>::signed_t ST;
 396   dispatch_private_info_template<UT> *pr;
 397
 398   int gtid = *gtid_ref;
 399   //    int  cid = *cid_ref;
 400   kmp_info_t *th = __kmp_threads[gtid];
 401   KMP_DEBUG_ASSERT(th->th.th_dispatch);
 402
 403   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
 404   if (__kmp_env_consistency_check) {
 405     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 406         th->th.th_dispatch->th_dispatch_pr_current);
 407     if (pr->pushed_ws != ct_none) {
 408       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
 409     }
 410   }
 411
 412   if (!th->th.th_team->t.t_serialized) {
 413     dispatch_shared_info_template<UT> *sh =
 414         reinterpret_cast<dispatch_shared_info_template<UT> *>(
 415             th->th.th_dispatch->th_dispatch_sh_current);
 416
 417     if (!__kmp_env_consistency_check) {
 418       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
 419           th->th.th_dispatch->th_dispatch_pr_current);
 420     }
 421
 422     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
 423 #if !defined(KMP_GOMP_COMPAT)
 424     if (__kmp_env_consistency_check) {
 425       if (pr->ordered_bumped != 0) {
 426         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
 427         /* How to test it? - OM */
 428         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
 429                                ct_ordered_in_pdo, loc_ref,
 430                                &p->stack_data[p->w_top]);
 431       }
 432     }
 433 #endif /* !defined(KMP_GOMP_COMPAT) */
 434
 435     KMP_MB(); /* Flush all pending memory write invalidates.  */
 436
 437     pr->ordered_bumped += 1;
 438
 439     KD_TRACE(1000,
 440              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
 441               gtid, pr->ordered_bumped));
 442
 443     KMP_MB(); /* Flush all pending memory write invalidates.  */
 444
 445     /* TODO use general release procedure? */
 446     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
 447
 448     KMP_MB(); /* Flush all pending memory write invalidates.  */
 449   }
 450   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
 451 }
 452
 453 /* Computes and returns x to the power of y, where y must a non-negative integer
 454  */
 455 template <typename UT>
 456 static __forceinline long double __kmp_pow(long double x, UT y) {
 457   long double s = 1.0L;
 458
 459   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
 460   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
 461   while (y) {
 462     if (y & 1)
 463       s *= x;
 464     x *= x;
 465     y >>= 1;
 466   }
 467   return s;
 468 }
 469
 470 /* Computes and returns the number of unassigned iterations after idx chunks
 471    have been assigned
 472    (the total number of unassigned iterations in chunks with index greater than
 473    or equal to idx).
 474    __forceinline seems to be broken so that if we __forceinline this function,
 475    the behavior is wrong
 476    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
 477 */
 478 template <typename T>
 479 static __inline typename traits_t<T>::unsigned_t
 480 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
 481                                 typename traits_t<T>::unsigned_t idx) {
 482   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
 483      least for ICL 8.1, long double arithmetic may not really have
 484      long double precision, even with /Qlong_double.  Currently, we
 485      workaround that in the caller code, by manipulating the FPCW for
 486      Windows* OS on IA-32 architecture.  The lack of precision is not
 487      expected to be a correctness issue, though.
 488   */
 489   typedef typename traits_t<T>::unsigned_t UT;
 490
 491   long double x = tc * __kmp_pow<UT>(base, idx);
 492   UT r = (UT)x;
 493   if (x == r)
 494     return r;
 495   return r + 1;
 496 }
 497
 498 // Parameters of the guided-iterative algorithm:
 499 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
 500 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
 501 // by default n = 2. For example with n = 3 the chunks distribution will be more
 502 // flat.
 503 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
 504 static const int guided_int_param = 2;
 505 static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
 506 #endif // KMP_DISPATCH_H