// kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
5 //===----------------------------------------------------------------------===//
7 // The LLVM Compiler Infrastructure
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
12 //===----------------------------------------------------------------------===//
14 #ifndef KMP_DISPATCH_H
15 #define KMP_DISPATCH_H
17 /* ------------------------------------------------------------------------ */
18 /* ------------------------------------------------------------------------ */
21 #include "kmp_error.h"
24 #include "kmp_stats.h"
26 #if KMP_OS_WINDOWS && KMP_ARCH_X86
31 #include "ompt-internal.h"
32 #include "ompt-specific.h"
35 /* ------------------------------------------------------------------------ */
36 /* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

// Forward declarations of the templated dispatch buffer types defined below.
template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;
// Initialize the per-thread dispatch record 'pr' for a dynamically scheduled
// loop: index type T, bounds lb/ub, stride st, chunk size 'chunk', scheduling
// kind 'schedule'. cur_chunk is presumably an out-param for the computed
// chunk size (stats/profiling) -- TODO confirm against the definition.
// NOTE(review): the 'template <typename T>' header and trailing parameters of
// this declaration fall outside this excerpt.
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
dispatch_private_info_template<T> *pr,
enum sched_type schedule, T lb, T ub,
typename traits_t<T>::signed_t st,
kmp_uint64 *cur_chunk,
typename traits_t<T>::signed_t chunk,

// Fetch the next chunk of iterations for thread 'gtid' from the private
// record 'pr' and team-shared state 'sh'. Writes the chunk bounds to
// *p_lb/*p_ub and the stride to *p_st; *p_last is set when this is the last
// chunk. nproc/unit_id parameterize the per-team partitioning.
// Return value semantics (non-zero while work remains) -- confirm in the
// definition; only the declaration is visible here.
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

// Error-reporting variants of the ordered enter/exit hooks (names suggest
// they are installed when the dispatch buffer is in an invalid state --
// verify against kmp_dispatch.cpp).
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
#if KMP_STATIC_STEAL_ENABLED

// Per-thread (private) dispatch bookkeeping, templated on the loop index type.
// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
// This variant carries the extra counter used by the static_steal schedule.
// NOTE(review): interior members (count/ub/lb/st/tc, parm1-4, closing braces)
// fall outside this excerpt.
template <typename T> struct dispatch_private_infoXX_template {
typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
typedef typename traits_t<T>::signed_t ST; // signed flavor of T
/* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
T static_steal_counter; // for static_steal only; maybe better to put after ub
/* parm[1-4] are used in different ways by different scheduling algorithms */
// KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
UT ordered_lower; // unsigned; lower bound for ordered-section sequencing
UT ordered_upper; // unsigned; upper bound for ordered-section sequencing
#endif /* KMP_OS_WINDOWS */
#else /* KMP_STATIC_STEAL_ENABLED */

// Per-thread (private) dispatch bookkeeping, templated on the loop index type.
// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
// Variant without static-steal support.
// NOTE(review): interior members (lb/ub/st/tc, parm1-4, closing braces) fall
// outside this excerpt.
template <typename T> struct dispatch_private_infoXX_template {
typedef typename traits_t<T>::unsigned_t UT; // unsigned flavor of T
typedef typename traits_t<T>::signed_t ST; // signed flavor of T
UT count; // unsigned
UT ordered_lower; // unsigned; lower bound for ordered-section sequencing
UT ordered_upper; // unsigned; upper bound for ordered-section sequencing
#endif /* KMP_OS_WINDOWS */

#endif /* KMP_STATIC_STEAL_ENABLED */
// Per-thread dispatch record: the type-templated private info above plus
// schedule metadata. Templated replacement for dispatch_private_info /
// dispatch_private_info_t.
// NOTE(review): some members and closing braces fall outside this excerpt.
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
// duplicate alignment here, otherwise size of structure is not correct in our
union KMP_ALIGN_CACHE private_info_tmpl {
dispatch_private_infoXX_template<T> p; // typed view
dispatch_private_info64_t p64; // raw view; keeps the union size fixed
enum sched_type schedule; /* scheduling algorithm */
kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
kmp_uint32 ordered_bumped; // incremented on ordered exit (__kmp_dispatch_dxo)
// to retain the structure size after making order
kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
dispatch_private_info *next; /* stack of buffers for nest of serial regions */
kmp_uint32 type_size; // presumably sizeof the index type used -- TODO confirm
#if KMP_USE_HIER_SCHED
kmp_hier_top_unit_t<T> *hier_parent; // parent unit in scheduling hierarchy
// Accessors for hierarchical scheduling state.
kmp_int32 get_hier_id() const { return hier_id; }
kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
enum cons_type pushed_ws;
// Team-shared dispatch state, templated on the (unsigned) index type.
// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
typedef typename traits_t<T>::unsigned_t UT;
/* chunk index under dynamic, number of idle threads under static-steal;
iteration index otherwise */
volatile UT iteration;
volatile UT num_done; // presumably # of threads finished with this loop -- TODO confirm
volatile UT ordered_iteration; // ticket bumped by __kmp_dispatch_dxo, awaited by _deo
// to retain the structure size making ordered_iteration scalar
UT ordered_dummy[KMP_MAX_ORDERED - 3];
// replaces dispatch_shared_info structure and dispatch_shared_info_t type
// NOTE(review): some members (e.g. hier pointer, padding array) and closing
// braces fall outside this excerpt.
template <typename T> struct dispatch_shared_info_template {
typedef typename traits_t<T>::unsigned_t UT;
// we need union here to keep the structure size
union shared_info_tmpl {
dispatch_shared_infoXX_template<UT> s; // typed view
dispatch_shared_info64_t s64; // raw view; keeps the union size fixed
volatile kmp_uint32 buffer_index; // which cyclic dispatch buffer is current
volatile kmp_int32 doacross_buf_idx; // teamwise index
kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
// When linking with libhwloc, the ORDERED EPCC test slows down on big
// machines (> 48 cores). Performance analysis showed that a cache thrash
// was occurring and this padding helps alleviate the problem.
191 /* ------------------------------------------------------------------------ */
192 /* ------------------------------------------------------------------------ */
194 #undef USE_TEST_LOCKS
// test_then_add template (general template should NOT be used)
// Atomic fetch-and-add: presumably returns the value of *p prior to adding d
// (KMP_TEST_THEN_ADD{32,64}) -- confirm against the macro definitions in kmp.h.
// NOTE(review): the 'template <>' headers, parameter tails, local declaration
// of 'r' and the returns fall outside this excerpt.
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
r = KMP_TEST_THEN_ADD32(p, d);

__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
r = KMP_TEST_THEN_ADD64(p, d);
// test_then_inc_acq template (general template should NOT be used)
// Atomic increment with acquire ordering; presumably returns the prior value
// (KMP_TEST_THEN_INC_ACQ{32,64}) -- confirm against the macro definitions.
// NOTE(review): 'template <>' headers and function tails fall outside this
// excerpt.
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
r = KMP_TEST_THEN_INC_ACQ32(p);

__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
r = KMP_TEST_THEN_INC_ACQ64(p);
// test_then_inc template (general template should NOT be used)
// Atomic increment (relaxed counterpart of test_then_inc_acq above); used by
// __kmp_dispatch_dxo to bump the shared ordered_iteration ticket.
// NOTE(review): 'template <>' headers and function tails fall outside this
// excerpt.
template <typename T> static __forceinline T test_then_inc(volatile T *p);

__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
r = KMP_TEST_THEN_INC32(p);

__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
r = KMP_TEST_THEN_INC64(p);
// compare_and_swap template (general template should NOT be used)
// Presumably returns non-zero iff *p equaled c and was replaced by s, with
// release ordering (KMP_COMPARE_AND_STORE_REL{32,64}) -- confirm against the
// macro definitions in kmp.h.
// NOTE(review): 'template <>' headers and closing braces fall outside this
// excerpt. Note the second specialization is for kmp_int64 despite taking
// the <kmp_int64> argument on a kmp_int32-returning function -- intentional
// in the original source.
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
kmp_int32 c, kmp_int32 s) {
return KMP_COMPARE_AND_STORE_REL32(p, c, s);

__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
kmp_int64 c, kmp_int64 s) {
return KMP_COMPARE_AND_STORE_REL64(p, c, s);
265 template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
266 return value >= checker;
268 template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
269 return value == checker;
273 Spin wait loop that first does pause, then yield.
274 Waits until function returns non-zero when called with *spinner and check.
275 Does NOT put threads to sleep.
277 UT is unsigned 4- or 8-byte type
278 spinner - memory location to check value
279 checker - value which spinner is >, <, ==, etc.
280 pred - predicate function to perform binary comparison of some sort
282 obj -- is higher-level synchronization object to report to ittnotify. It
283 is used to report locks consistently. For example, if lock is acquired
284 immediately, its address is reported to ittnotify via
KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
287 same address, not an address of low-level spinner.
288 #endif // USE_ITT_BUILD
289 TODO: make inline function (move to header file for icl)
// Spin-wait until pred(*spinner, checker) returns non-zero, yielding the CPU
// while spinning (never puts the thread to sleep); see the comment block
// above for parameter semantics. Presumably returns the last value read from
// *spinner -- the return statement falls outside this excerpt.
// NOTE(review): declarations of r/check/spins and the closing braces also
// fall outside this excerpt.
template <typename UT>
static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
kmp_uint32 (*pred)(UT, UT)
USE_ITT_BUILD_ARG(void *obj)) {
// note: we may not belong to a team at this point
volatile UT *spin = spinner;
kmp_uint32 (*f)(UT, UT) = pred;
KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
KMP_INIT_YIELD(spins);
// main wait spin loop
while (!f(r = *spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
/* GEH - remove this since it was accidentally introduced when kmp_wait was
It causes problems with infinite recursion because of exit lock */
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
// if we are oversubscribed,
// or have waited a bit (and KMP_LIBRARY=throughput, then yield
// pause is in the following code
KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
KMP_YIELD_SPIN(spins);
KMP_FSYNC_SPIN_ACQUIRED(obj);
323 /* ------------------------------------------------------------------------ */
324 /* ------------------------------------------------------------------------ */
// Ordered-section ENTRY hook for dynamically scheduled loops ("deo" =
// dispatch enter ordered). Spin-waits until the team-shared ordered ticket
// (sh->u.s.ordered_iteration) reaches this thread's chunk lower bound, so
// ordered sections execute in iteration order.
// NOTE(review): declarations of 'lower' and 'buff', several #else/#endif
// lines and closing braces fall outside this excerpt.
template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
dispatch_private_info_template<UT> *pr;
int gtid = *gtid_ref;
// int cid = *cid_ref;
kmp_info_t *th = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(th->th.th_dispatch);
KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
if (__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
__kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
__kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
// Nothing to synchronize with when the team is serialized.
if (!th->th.th_team->t.t_serialized) {
dispatch_shared_info_template<UT> *sh =
reinterpret_cast<dispatch_shared_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_sh_current);
if (!__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
lower = pr->u.p.ordered_lower;
#if !defined(KMP_GOMP_COMPAT)
if (__kmp_env_consistency_check) {
// Re-entering the ordered section for the same chunk is a user error.
if (pr->ordered_bumped) {
struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
__kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
ct_ordered_in_pdo, loc_ref,
&p->stack_data[p->w_top]);
#endif /* !defined(KMP_GOMP_COMPAT) */
// create format specifiers before the debug output
buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
"ordered_iter:%%%s lower:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
__kmp_str_free(&buff);
// Wait until the shared ordered ticket catches up to our lower bound.
__kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
__kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
KMP_MB(); /* is this necessary? */
// create format specifiers before the debug output
buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
"ordered_iter:%%%s lower:%%%s\n",
traits_t<UT>::spec, traits_t<UT>::spec);
KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
__kmp_str_free(&buff);
KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
// Ordered-section EXIT hook ("dxo" = dispatch exit ordered). Records that
// this chunk's ordered section completed (pr->ordered_bumped) and atomically
// bumps the team-shared ticket (sh->u.s.ordered_iteration) to release the
// next thread waiting in __kmp_dispatch_deo.
// NOTE(review): several #else/#endif lines, a KD_TRACE header and closing
// braces fall outside this excerpt.
template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
typedef typename traits_t<UT>::signed_t ST;
dispatch_private_info_template<UT> *pr;
int gtid = *gtid_ref;
// int cid = *cid_ref;
kmp_info_t *th = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(th->th.th_dispatch);
KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
if (__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
if (pr->pushed_ws != ct_none) {
__kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
// Nothing to release when the team is serialized.
if (!th->th.th_team->t.t_serialized) {
dispatch_shared_info_template<UT> *sh =
reinterpret_cast<dispatch_shared_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_sh_current);
if (!__kmp_env_consistency_check) {
pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
th->th.th_dispatch->th_dispatch_pr_current);
KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
if (__kmp_env_consistency_check) {
// Exiting an ordered section that was already exited is a user error.
if (pr->ordered_bumped != 0) {
struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
/* How to test it? - OM */
__kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
ct_ordered_in_pdo, loc_ref,
&p->stack_data[p->w_top]);
#endif /* !defined(KMP_GOMP_COMPAT) */
KMP_MB(); /* Flush all pending memory write invalidates. */
pr->ordered_bumped += 1;
("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
gtid, pr->ordered_bumped));
KMP_MB(); /* Flush all pending memory write invalidates. */
/* TODO use general release procedure? */
test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
KMP_MB(); /* Flush all pending memory write invalidates. */
KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
/* Computes and returns x to the power of y, where y must be a non-negative
   integer. */
// NOTE(review): the loop body and return fall outside this excerpt.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
long double s = 1.0L; // running product
// Only bases in (0, 1) are expected (guided-schedule decay factors).
KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
// KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
/* Computes and returns the number of unassigned iterations after idx chunks
   (the total number of unassigned iterations in chunks with index greater than
   or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function,
   the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
// NOTE(review): the tail of the function (rounding/return) falls outside this
// excerpt.
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
typename traits_t<T>::unsigned_t idx) {
/* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
least for ICL 8.1, long double arithmetic may not really have
long double precision, even with /Qlong_double. Currently, we
workaround that in the caller code, by manipulating the FPCW for
Windows* OS on IA-32 architecture. The lack of precision is not
expected to be a correctness issue, though. */
typedef typename traits_t<T>::unsigned_t UT;
// remaining = tc * base^idx: each guided chunk leaves a 'base' fraction of
// the previous remainder unassigned.
long double x = tc * __kmp_pow<UT>(base, idx);
// Parameters of the guided-iterative algorithm:
// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
// by default n = 2. For example with n = 3 the chunks distribution will be more
// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
static const int guided_int_param = 2; // the 'n' referenced above
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
514 #endif // KMP_DISPATCH_H