contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp

   1 /*
   2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   8 // See https://llvm.org/LICENSE.txt for license information.
   9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #define __KMP_IMP
  14 #include "omp.h" /* extern "C" declarations of user-visible routines */
  15 #include "kmp.h"
  16 #include "kmp_error.h"
  17 #include "kmp_i18n.h"
  18 #include "kmp_itt.h"
  19 #include "kmp_lock.h"
  20 #include "kmp_stats.h"
  21
  22 #if OMPT_SUPPORT
  23 #include "ompt-specific.h"
  24 #endif
  25
  26 #define MAX_MESSAGE 512
  27
  28 // flags will be used in future, e.g. to implement openmp_strict library
  29 // restrictions
  30
  31 /*!
  32  * @ingroup STARTUP_SHUTDOWN
  33  * @param loc   in   source location information
  34  * @param flags in   for future use (currently ignored)
  35  *
  36  * Initialize the runtime library. This call is optional; if it is not made then
  37  * it will be implicitly called by attempts to use other library functions.
  38  */
  39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  40   // By default __kmpc_begin() is no-op.
  41   char *env;
  42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
  43       __kmp_str_match_true(env)) {
  44     __kmp_middle_initialize();
  45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  46   } else if (__kmp_ignore_mppbeg() == FALSE) {
  47     // By default __kmp_ignore_mppbeg() returns TRUE.
  48     __kmp_internal_begin();
  49     KC_TRACE(10, ("__kmpc_begin: called\n"));
  50   }
  51 }
  52
  53 /*!
  54  * @ingroup STARTUP_SHUTDOWN
  55  * @param loc source location information
  56  *
  57  * Shutdown the runtime library. This is also optional, and even if called will
  58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
  59  * zero.
  60  */
  61 void __kmpc_end(ident_t *loc) {
  62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
  66   // library shut down).
  67   if (__kmp_ignore_mppend() == FALSE) {
  68     KC_TRACE(10, ("__kmpc_end: called\n"));
  69     KA_TRACE(30, ("__kmpc_end\n"));
  70
  71     __kmp_internal_end_thread(-1);
  72   }
  73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
  74   // Normal exit process on Windows does not allow worker threads of the final
  75   // parallel region to finish reporting their events, so shutting down the
  76   // library here fixes the issue at least for the cases where __kmpc_end() is
  77   // placed properly.
  78   if (ompt_enabled.enabled)
  79     __kmp_internal_end_library(__kmp_gtid_get_specific());
  80 #endif
  81 }
  82
  83 /*!
  84 @ingroup THREAD_STATES
  85 @param loc Source location information.
  86 @return The global thread index of the active thread.
  87
  88 This function can be called in any context.
  89
  90 If the runtime has ony been entered at the outermost level from a
  91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
  92 that which would be returned by omp_get_thread_num() in the outermost
  93 active parallel construct. (Or zero if there is no active parallel
  94 construct, since the master thread is necessarily thread zero).
  95
  96 If multiple non-OpenMP threads all enter an OpenMP construct then this
  97 will be a unique thread identifier among all the threads created by
  98 the OpenMP runtime (but the value cannote be defined in terms of
  99 OpenMP thread ids returned by omp_get_thread_num()).
 100 */
 101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
 102   kmp_int32 gtid = __kmp_entry_gtid();
 103
 104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
 105
 106   return gtid;
 107 }
 108
 109 /*!
 110 @ingroup THREAD_STATES
 111 @param loc Source location information.
 112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
 113
 114 This function can be called in any context.
 115 It returns the total number of threads under the control of the OpenMP runtime.
 116 That is not a number that can be determined by any OpenMP standard calls, since
 117 the library may be called from more than one non-OpenMP thread, and this
 118 reflects the total over all such calls. Similarly the runtime maintains
 119 underlying threads even when they are not active (since the cost of creating
 120 and destroying OS threads is high), this call counts all such threads even if
 121 they are not waiting for work.
 122 */
 123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
 124   KC_TRACE(10,
 125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
 126
 127   return TCR_4(__kmp_all_nth);
 128 }
 129
 130 /*!
 131 @ingroup THREAD_STATES
 132 @param loc Source location information.
 133 @return The thread number of the calling thread in the innermost active parallel
 134 construct.
 135 */
 136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
 137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
 138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
 139 }
 140
 141 /*!
 142 @ingroup THREAD_STATES
 143 @param loc Source location information.
 144 @return The number of threads in the innermost active parallel construct.
 145 */
 146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
 147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
 148
 149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
 150 }
 151
 152 /*!
 153  * @ingroup DEPRECATED
 154  * @param loc location description
 155  *
 156  * This function need not be called. It always returns TRUE.
 157  */
 158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
 159 #ifndef KMP_DEBUG
 160
 161   return TRUE;
 162
 163 #else
 164
 165   const char *semi2;
 166   const char *semi3;
 167   int line_no;
 168
 169   if (__kmp_par_range == 0) {
 170     return TRUE;
 171   }
 172   semi2 = loc->psource;
 173   if (semi2 == NULL) {
 174     return TRUE;
 175   }
 176   semi2 = strchr(semi2, ';');
 177   if (semi2 == NULL) {
 178     return TRUE;
 179   }
 180   semi2 = strchr(semi2 + 1, ';');
 181   if (semi2 == NULL) {
 182     return TRUE;
 183   }
 184   if (__kmp_par_range_filename[0]) {
 185     const char *name = semi2 - 1;
 186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
 187       name--;
 188     }
 189     if ((*name == '/') || (*name == ';')) {
 190       name++;
 191     }
 192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
 193       return __kmp_par_range < 0;
 194     }
 195   }
 196   semi3 = strchr(semi2 + 1, ';');
 197   if (__kmp_par_range_routine[0]) {
 198     if ((semi3 != NULL) && (semi3 > semi2) &&
 199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
 200       return __kmp_par_range < 0;
 201     }
 202   }
 203   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
 204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
 205       return __kmp_par_range > 0;
 206     }
 207     return __kmp_par_range < 0;
 208   }
 209   return TRUE;
 210
 211 #endif /* KMP_DEBUG */
 212 }
 213
 214 /*!
 215 @ingroup THREAD_STATES
 216 @param loc Source location information.
 217 @return 1 if this thread is executing inside an active parallel region, zero if
 218 not.
 219 */
 220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
 221   return __kmp_entry_thread()->th.th_root->r.r_active;
 222 }
 223
 224 /*!
 225 @ingroup PARALLEL
 226 @param loc source location information
 227 @param global_tid global thread number
 228 @param num_threads number of threads requested for this parallel construct
 229
 230 Set the number of threads to be used by the next fork spawned by this thread.
 231 This call is only required if the parallel construct has a `num_threads` clause.
 232 */
 233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
 234                              kmp_int32 num_threads) {
 235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
 236                 global_tid, num_threads));
 237
 238   __kmp_push_num_threads(loc, global_tid, num_threads);
 239 }
 240
 241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
 242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
 243
 244   /* the num_threads are automatically popped */
 245 }
 246
 247 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
 248                            kmp_int32 proc_bind) {
 249   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
 250                 proc_bind));
 251
 252   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
 253 }
 254
 255 /*!
 256 @ingroup PARALLEL
 257 @param loc  source location information
 258 @param argc  total number of arguments in the ellipsis
 259 @param microtask  pointer to callback routine consisting of outlined parallel
 260 construct
 261 @param ...  pointers to shared variables that aren't global
 262
 263 Do the actual fork and call the microtask in the relevant number of threads.
 264 */
 265 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
 266   int gtid = __kmp_entry_gtid();
 267
 268 #if (KMP_STATS_ENABLED)
 269   // If we were in a serial region, then stop the serial timer, record
 270   // the event, and start parallel region timer
 271   stats_state_e previous_state = KMP_GET_THREAD_STATE();
 272   if (previous_state == stats_state_e::SERIAL_REGION) {
 273     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
 274   } else {
 275     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
 276   }
 277   int inParallel = __kmpc_in_parallel(loc);
 278   if (inParallel) {
 279     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
 280   } else {
 281     KMP_COUNT_BLOCK(OMP_PARALLEL);
 282   }
 283 #endif
 284
 285   // maybe to save thr_state is enough here
 286   {
 287     va_list ap;
 288     va_start(ap, microtask);
 289
 290 #if OMPT_SUPPORT
 291     ompt_frame_t *ompt_frame;
 292     if (ompt_enabled.enabled) {
 293       kmp_info_t *master_th = __kmp_threads[gtid];
 294       kmp_team_t *parent_team = master_th->th.th_team;
 295       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
 296       if (lwt)
 297         ompt_frame = &(lwt->ompt_task_info.frame);
 298       else {
 299         int tid = __kmp_tid_from_gtid(gtid);
 300         ompt_frame = &(
 301             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
 302       }
 303       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
 304       OMPT_STORE_RETURN_ADDRESS(gtid);
 305     }
 306 #endif
 307
 308 #if INCLUDE_SSC_MARKS
 309     SSC_MARK_FORKING();
 310 #endif
 311     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
 312                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
 313                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
 314 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
 315 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
 316                     &ap
 317 #else
 318                     ap
 319 #endif
 320                     );
 321 #if INCLUDE_SSC_MARKS
 322     SSC_MARK_JOINING();
 323 #endif
 324     __kmp_join_call(loc, gtid
 325 #if OMPT_SUPPORT
 326                     ,
 327                     fork_context_intel
 328 #endif
 329                     );
 330
 331     va_end(ap);
 332   }
 333
 334 #if KMP_STATS_ENABLED
 335   if (previous_state == stats_state_e::SERIAL_REGION) {
 336     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
 337   } else {
 338     KMP_POP_PARTITIONED_TIMER();
 339   }
 340 #endif // KMP_STATS_ENABLED
 341 }
 342
 343 /*!
 344 @ingroup PARALLEL
 345 @param loc source location information
 346 @param global_tid global thread number
 347 @param num_teams number of teams requested for the teams construct
 348 @param num_threads number of threads per team requested for the teams construct
 349
 350 Set the number of teams to be used by the teams construct.
 351 This call is only required if the teams construct has a `num_teams` clause
 352 or a `thread_limit` clause (or both).
 353 */
 354 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
 355                            kmp_int32 num_teams, kmp_int32 num_threads) {
 356   KA_TRACE(20,
 357            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
 358             global_tid, num_teams, num_threads));
 359
 360   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
 361 }
 362
 363 /*!
 364 @ingroup PARALLEL
 365 @param loc  source location information
 366 @param argc  total number of arguments in the ellipsis
 367 @param microtask  pointer to callback routine consisting of outlined teams
 368 construct
 369 @param ...  pointers to shared variables that aren't global
 370
 371 Do the actual fork and call the microtask in the relevant number of threads.
 372 */
 373 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
 374                        ...) {
 375   int gtid = __kmp_entry_gtid();
 376   kmp_info_t *this_thr = __kmp_threads[gtid];
 377   va_list ap;
 378   va_start(ap, microtask);
 379
 380 #if KMP_STATS_ENABLED
 381   KMP_COUNT_BLOCK(OMP_TEAMS);
 382   stats_state_e previous_state = KMP_GET_THREAD_STATE();
 383   if (previous_state == stats_state_e::SERIAL_REGION) {
 384     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
 385   } else {
 386     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
 387   }
 388 #endif
 389
 390   // remember teams entry point and nesting level
 391   this_thr->th.th_teams_microtask = microtask;
 392   this_thr->th.th_teams_level =
 393       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
 394
 395 #if OMPT_SUPPORT
 396   kmp_team_t *parent_team = this_thr->th.th_team;
 397   int tid = __kmp_tid_from_gtid(gtid);
 398   if (ompt_enabled.enabled) {
 399     parent_team->t.t_implicit_task_taskdata[tid]
 400         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
 401   }
 402   OMPT_STORE_RETURN_ADDRESS(gtid);
 403 #endif
 404
 405   // check if __kmpc_push_num_teams called, set default number of teams
 406   // otherwise
 407   if (this_thr->th.th_teams_size.nteams == 0) {
 408     __kmp_push_num_teams(loc, gtid, 0, 0);
 409   }
 410   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
 411   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
 412   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
 413
 414   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
 415                   VOLATILE_CAST(microtask_t)
 416                       __kmp_teams_master, // "wrapped" task
 417                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
 418 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
 419                   &ap
 420 #else
 421                   ap
 422 #endif
 423                   );
 424   __kmp_join_call(loc, gtid
 425 #if OMPT_SUPPORT
 426                   ,
 427                   fork_context_intel
 428 #endif
 429                   );
 430
 431   // Pop current CG root off list
 432   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
 433   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
 434   this_thr->th.th_cg_roots = tmp->up;
 435   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
 436                  " to node %p. cg_nthreads was %d\n",
 437                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
 438   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
 439   int i = tmp->cg_nthreads--;
 440   if (i == 1) { // check is we are the last thread in CG (not always the case)
 441     __kmp_free(tmp);
 442   }
 443   // Restore current task's thread_limit from CG root
 444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
 445   this_thr->th.th_current_task->td_icvs.thread_limit =
 446       this_thr->th.th_cg_roots->cg_thread_limit;
 447
 448   this_thr->th.th_teams_microtask = NULL;
 449   this_thr->th.th_teams_level = 0;
 450   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
 451   va_end(ap);
 452 #if KMP_STATS_ENABLED
 453   if (previous_state == stats_state_e::SERIAL_REGION) {
 454     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
 455   } else {
 456     KMP_POP_PARTITIONED_TIMER();
 457   }
 458 #endif // KMP_STATS_ENABLED
 459 }
 460
 461 // I don't think this function should ever have been exported.
 462 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
 463 // openmp code ever called it, but it's been exported from the RTL for so
 464 // long that I'm afraid to remove the definition.
 465 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
 466
 467 /*!
 468 @ingroup PARALLEL
 469 @param loc  source location information
 470 @param global_tid  global thread number
 471
 472 Enter a serialized parallel construct. This interface is used to handle a
 473 conditional parallel region, like this,
 474 @code
 475 #pragma omp parallel if (condition)
 476 @endcode
 477 when the condition is false.
 478 */
 479 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
 480 // The implementation is now in kmp_runtime.cpp so that it can share static
 481 // functions with kmp_fork_call since the tasks to be done are similar in
 482 // each case.
 483 #if OMPT_SUPPORT
 484   OMPT_STORE_RETURN_ADDRESS(global_tid);
 485 #endif
 486   __kmp_serialized_parallel(loc, global_tid);
 487 }
 488
 489 /*!
 490 @ingroup PARALLEL
 491 @param loc  source location information
 492 @param global_tid  global thread number
 493
 494 Leave a serialized parallel construct.
 495 */
 496 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
 497   kmp_internal_control_t *top;
 498   kmp_info_t *this_thr;
 499   kmp_team_t *serial_team;
 500
 501   KC_TRACE(10,
 502            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
 503
 504   /* skip all this code for autopar serialized loops since it results in
 505      unacceptable overhead */
 506   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
 507     return;
 508
 509   // Not autopar code
 510   if (!TCR_4(__kmp_init_parallel))
 511     __kmp_parallel_initialize();
 512
 513   __kmp_resume_if_soft_paused();
 514
 515   this_thr = __kmp_threads[global_tid];
 516   serial_team = this_thr->th.th_serial_team;
 517
 518   kmp_task_team_t *task_team = this_thr->th.th_task_team;
 519   // we need to wait for the proxy tasks before finishing the thread
 520   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
 521     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
 522
 523   KMP_MB();
 524   KMP_DEBUG_ASSERT(serial_team);
 525   KMP_ASSERT(serial_team->t.t_serialized);
 526   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
 527   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
 528   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
 529   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
 530
 531 #if OMPT_SUPPORT
 532   if (ompt_enabled.enabled &&
 533       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
 534     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
 535     if (ompt_enabled.ompt_callback_implicit_task) {
 536       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
 537           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
 538           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
 539     }
 540
 541     // reset clear the task id only after unlinking the task
 542     ompt_data_t *parent_task_data;
 543     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
 544
 545     if (ompt_enabled.ompt_callback_parallel_end) {
 546       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
 547           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
 548           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
 549     }
 550     __ompt_lw_taskteam_unlink(this_thr);
 551     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
 552   }
 553 #endif
 554
 555   /* If necessary, pop the internal control stack values and replace the team
 556    * values */
 557   top = serial_team->t.t_control_stack_top;
 558   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
 559     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
 560     serial_team->t.t_control_stack_top = top->next;
 561     __kmp_free(top);
 562   }
 563
 564   // if( serial_team -> t.t_serialized > 1 )
 565   serial_team->t.t_level--;
 566
 567   /* pop dispatch buffers stack */
 568   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
 569   {
 570     dispatch_private_info_t *disp_buffer =
 571         serial_team->t.t_dispatch->th_disp_buffer;
 572     serial_team->t.t_dispatch->th_disp_buffer =
 573         serial_team->t.t_dispatch->th_disp_buffer->next;
 574     __kmp_free(disp_buffer);
 575   }
 576   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
 577
 578   --serial_team->t.t_serialized;
 579   if (serial_team->t.t_serialized == 0) {
 580
 581 /* return to the parallel section */
 582
 583 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
 584     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
 585       __kmp_clear_x87_fpu_status_word();
 586       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
 587       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
 588     }
 589 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
 590
 591     this_thr->th.th_team = serial_team->t.t_parent;
 592     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
 593
 594     /* restore values cached in the thread */
 595     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
 596     this_thr->th.th_team_master =
 597         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
 598     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
 599
 600     /* TODO the below shouldn't need to be adjusted for serialized teams */
 601     this_thr->th.th_dispatch =
 602         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
 603
 604     __kmp_pop_current_task_from_thread(this_thr);
 605
 606     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
 607     this_thr->th.th_current_task->td_flags.executing = 1;
 608
 609     if (__kmp_tasking_mode != tskm_immediate_exec) {
 610       // Copy the task team from the new child / old parent team to the thread.
 611       this_thr->th.th_task_team =
 612           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
 613       KA_TRACE(20,
 614                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
 615                 "team %p\n",
 616                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
 617     }
 618   } else {
 619     if (__kmp_tasking_mode != tskm_immediate_exec) {
 620       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
 621                     "depth of serial team %p to %d\n",
 622                     global_tid, serial_team, serial_team->t.t_serialized));
 623     }
 624   }
 625
 626   if (__kmp_env_consistency_check)
 627     __kmp_pop_parallel(global_tid, NULL);
 628 #if OMPT_SUPPORT
 629   if (ompt_enabled.enabled)
 630     this_thr->th.ompt_thread_info.state =
 631         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
 632                                            : ompt_state_work_parallel);
 633 #endif
 634 }
 635
 636 /*!
 637 @ingroup SYNCHRONIZATION
 638 @param loc  source location information.
 639
 640 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
 641 depending on the memory ordering convention obeyed by the compiler
 642 even that may not be necessary).
 643 */
 644 void __kmpc_flush(ident_t *loc) {
 645   KC_TRACE(10, ("__kmpc_flush: called\n"));
 646
 647   /* need explicit __mf() here since use volatile instead in library */
 648   KMP_MB(); /* Flush all pending memory write invalidates.  */
 649
 650 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
 651 #if KMP_MIC
 652 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
 653 // We shouldn't need it, though, since the ABI rules require that
 654 // * If the compiler generates NGO stores it also generates the fence
 655 // * If users hand-code NGO stores they should insert the fence
 656 // therefore no incomplete unordered stores should be visible.
 657 #else
 658   // C74404
 659   // This is to address non-temporal store instructions (sfence needed).
 660   // The clflush instruction is addressed either (mfence needed).
 661   // Probably the non-temporal load monvtdqa instruction should also be
 662   // addressed.
 663   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
 664   if (!__kmp_cpuinfo.initialized) {
 665     __kmp_query_cpuid(&__kmp_cpuinfo);
 666   }
 667   if (!__kmp_cpuinfo.sse2) {
 668     // CPU cannot execute SSE2 instructions.
 669   } else {
 670 #if KMP_COMPILER_ICC
 671     _mm_mfence();
 672 #elif KMP_COMPILER_MSVC
 673     MemoryBarrier();
 674 #else
 675     __sync_synchronize();
 676 #endif // KMP_COMPILER_ICC
 677   }
 678 #endif // KMP_MIC
 679 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
 680 // Nothing to see here move along
 681 #elif KMP_ARCH_PPC64
 682 // Nothing needed here (we have a real MB above).
 683 #if KMP_OS_CNK
 684   // The flushing thread needs to yield here; this prevents a
 685   // busy-waiting thread from saturating the pipeline. flush is
 686   // often used in loops like this:
 687   // while (!flag) {
 688   //   #pragma omp flush(flag)
 689   // }
 690   // and adding the yield here is good for at least a 10x speedup
 691   // when running >2 threads per core (on the NAS LU benchmark).
 692   __kmp_yield();
 693 #endif
 694 #else
 695 #error Unknown or unsupported architecture
 696 #endif
 697
 698 #if OMPT_SUPPORT && OMPT_OPTIONAL
 699   if (ompt_enabled.ompt_callback_flush) {
 700     ompt_callbacks.ompt_callback(ompt_callback_flush)(
 701         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
 702   }
 703 #endif
 704 }
 705
 706 /* -------------------------------------------------------------------------- */
 707 /*!
 708 @ingroup SYNCHRONIZATION
 709 @param loc source location information
 710 @param global_tid thread id.
 711
 712 Execute a barrier.
 713 */
 714 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
 715   KMP_COUNT_BLOCK(OMP_BARRIER);
 716   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
 717
 718   if (!TCR_4(__kmp_init_parallel))
 719     __kmp_parallel_initialize();
 720
 721   __kmp_resume_if_soft_paused();
 722
 723   if (__kmp_env_consistency_check) {
 724     if (loc == 0) {
 725       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
 726     }
 727     __kmp_check_barrier(global_tid, ct_barrier, loc);
 728   }
 729
 730 #if OMPT_SUPPORT
 731   ompt_frame_t *ompt_frame;
 732   if (ompt_enabled.enabled) {
 733     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
 734     if (ompt_frame->enter_frame.ptr == NULL)
 735       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
 736     OMPT_STORE_RETURN_ADDRESS(global_tid);
 737   }
 738 #endif
 739   __kmp_threads[global_tid]->th.th_ident = loc;
 740   // TODO: explicit barrier_wait_id:
 741   //   this function is called when 'barrier' directive is present or
 742   //   implicit barrier at the end of a worksharing construct.
 743   // 1) better to add a per-thread barrier counter to a thread data structure
 744   // 2) set to 0 when a new team is created
 745   // 4) no sync is required
 746
 747   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
 748 #if OMPT_SUPPORT && OMPT_OPTIONAL
 749   if (ompt_enabled.enabled) {
 750     ompt_frame->enter_frame = ompt_data_none;
 751   }
 752 #endif
 753 }
 754
 755 /* The BARRIER for a MASTER section is always explicit   */
 756 /*!
 757 @ingroup WORK_SHARING
 758 @param loc  source location information.
 759 @param global_tid  global thread number .
 760 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
 761 */
 762 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
 763   int status = 0;
 764
 765   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
 766
 767   if (!TCR_4(__kmp_init_parallel))
 768     __kmp_parallel_initialize();
 769
 770   __kmp_resume_if_soft_paused();
 771
 772   if (KMP_MASTER_GTID(global_tid)) {
 773     KMP_COUNT_BLOCK(OMP_MASTER);
 774     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
 775     status = 1;
 776   }
 777
 778 #if OMPT_SUPPORT && OMPT_OPTIONAL
 779   if (status) {
 780     if (ompt_enabled.ompt_callback_master) {
 781       kmp_info_t *this_thr = __kmp_threads[global_tid];
 782       kmp_team_t *team = this_thr->th.th_team;
 783
 784       int tid = __kmp_tid_from_gtid(global_tid);
 785       ompt_callbacks.ompt_callback(ompt_callback_master)(
 786           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
 787           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
 788           OMPT_GET_RETURN_ADDRESS(0));
 789     }
 790   }
 791 #endif
 792
 793   if (__kmp_env_consistency_check) {
 794 #if KMP_USE_DYNAMIC_LOCK
 795     if (status)
 796       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
 797     else
 798       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
 799 #else
 800     if (status)
 801       __kmp_push_sync(global_tid, ct_master, loc, NULL);
 802     else
 803       __kmp_check_sync(global_tid, ct_master, loc, NULL);
 804 #endif
 805   }
 806
 807   return status;
 808 }
 809
 810 /*!
 811 @ingroup WORK_SHARING
 812 @param loc  source location information.
 813 @param global_tid  global thread number .
 814
 815 Mark the end of a <tt>master</tt> region. This should only be called by the
 816 thread that executes the <tt>master</tt> region.
 817 */
 818 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
 819   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
 820
 821   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
 822   KMP_POP_PARTITIONED_TIMER();
 823
 824 #if OMPT_SUPPORT && OMPT_OPTIONAL
 825   kmp_info_t *this_thr = __kmp_threads[global_tid];
 826   kmp_team_t *team = this_thr->th.th_team;
 827   if (ompt_enabled.ompt_callback_master) {
 828     int tid = __kmp_tid_from_gtid(global_tid);
 829     ompt_callbacks.ompt_callback(ompt_callback_master)(
 830         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
 831         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
 832         OMPT_GET_RETURN_ADDRESS(0));
 833   }
 834 #endif
 835
 836   if (__kmp_env_consistency_check) {
 837     if (global_tid < 0)
 838       KMP_WARNING(ThreadIdentInvalid);
 839
 840     if (KMP_MASTER_GTID(global_tid))
 841       __kmp_pop_sync(global_tid, ct_master, loc);
 842   }
 843 }
 844
 845 /*!
 846 @ingroup WORK_SHARING
 847 @param loc  source location information.
 848 @param gtid  global thread number.
 849
 850 Start execution of an <tt>ordered</tt> construct.
 851 */
 852 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
 853   int cid = 0;
 854   kmp_info_t *th;
 855   KMP_DEBUG_ASSERT(__kmp_init_serial);
 856
 857   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
 858
 859   if (!TCR_4(__kmp_init_parallel))
 860     __kmp_parallel_initialize();
 861
 862   __kmp_resume_if_soft_paused();
 863
 864 #if USE_ITT_BUILD
 865   __kmp_itt_ordered_prep(gtid);
 866 // TODO: ordered_wait_id
 867 #endif /* USE_ITT_BUILD */
 868
 869   th = __kmp_threads[gtid];
 870
 871 #if OMPT_SUPPORT && OMPT_OPTIONAL
 872   kmp_team_t *team;
 873   ompt_wait_id_t lck;
 874   void *codeptr_ra;
 875   if (ompt_enabled.enabled) {
 876     OMPT_STORE_RETURN_ADDRESS(gtid);
 877     team = __kmp_team_from_gtid(gtid);
 878     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
 879     /* OMPT state update */
 880     th->th.ompt_thread_info.wait_id = lck;
 881     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
 882
 883     /* OMPT event callback */
 884     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
 885     if (ompt_enabled.ompt_callback_mutex_acquire) {
 886       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
 887           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
 888           codeptr_ra);
 889     }
 890   }
 891 #endif
 892
 893   if (th->th.th_dispatch->th_deo_fcn != 0)
 894     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
 895   else
 896     __kmp_parallel_deo(&gtid, &cid, loc);
 897
 898 #if OMPT_SUPPORT && OMPT_OPTIONAL
 899   if (ompt_enabled.enabled) {
 900     /* OMPT state update */
 901     th->th.ompt_thread_info.state = ompt_state_work_parallel;
 902     th->th.ompt_thread_info.wait_id = 0;
 903
 904     /* OMPT event callback */
 905     if (ompt_enabled.ompt_callback_mutex_acquired) {
 906       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
 907           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
 908     }
 909   }
 910 #endif
 911
 912 #if USE_ITT_BUILD
 913   __kmp_itt_ordered_start(gtid);
 914 #endif /* USE_ITT_BUILD */
 915 }
 916
 917 /*!
 918 @ingroup WORK_SHARING
 919 @param loc  source location information.
 920 @param gtid  global thread number.
 921
 922 End execution of an <tt>ordered</tt> construct.
 923 */
 924 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
 925   int cid = 0;
 926   kmp_info_t *th;
 927
 928   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
 929
 930 #if USE_ITT_BUILD
 931   __kmp_itt_ordered_end(gtid);
 932 // TODO: ordered_wait_id
 933 #endif /* USE_ITT_BUILD */
 934
 935   th = __kmp_threads[gtid];
 936
 937   if (th->th.th_dispatch->th_dxo_fcn != 0)
 938     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
 939   else
 940     __kmp_parallel_dxo(&gtid, &cid, loc);
 941
 942 #if OMPT_SUPPORT && OMPT_OPTIONAL
 943   OMPT_STORE_RETURN_ADDRESS(gtid);
 944   if (ompt_enabled.ompt_callback_mutex_released) {
 945     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
 946         ompt_mutex_ordered,
 947         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
 948             ->t.t_ordered.dt.t_value,
 949         OMPT_LOAD_RETURN_ADDRESS(gtid));
 950   }
 951 #endif
 952 }
 953
 954 #if KMP_USE_DYNAMIC_LOCK
 955
 956 static __forceinline void
 957 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
 958                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
 959   // Pointer to the allocated indirect lock is written to crit, while indexing
 960   // is ignored.
 961   void *idx;
 962   kmp_indirect_lock_t **lck;
 963   lck = (kmp_indirect_lock_t **)crit;
 964   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
 965   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
 966   KMP_SET_I_LOCK_LOCATION(ilk, loc);
 967   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
 968   KA_TRACE(20,
 969            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
 970 #if USE_ITT_BUILD
 971   __kmp_itt_critical_creating(ilk->lock, loc);
 972 #endif
 973   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
 974   if (status == 0) {
 975 #if USE_ITT_BUILD
 976     __kmp_itt_critical_destroyed(ilk->lock);
 977 #endif
 978     // We don't really need to destroy the unclaimed lock here since it will be
 979     // cleaned up at program exit.
 980     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
 981   }
 982   KMP_DEBUG_ASSERT(*lck != NULL);
 983 }
 984
 985 // Fast-path acquire tas lock
 986 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
 987   {                                                                            \
 988     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
 989     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
 990     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
 991     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
 992         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
 993       kmp_uint32 spins;                                                        \
 994       KMP_FSYNC_PREPARE(l);                                                    \
 995       KMP_INIT_YIELD(spins);                                                   \
 996       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
 997       do {                                                                     \
 998         if (TCR_4(__kmp_nth) >                                                 \
 999             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1000           KMP_YIELD(TRUE);                                                     \
1001         } else {                                                               \
1002           KMP_YIELD_SPIN(spins);                                               \
1003         }                                                                      \
1004         __kmp_spin_backoff(&backoff);                                          \
1005       } while (                                                                \
1006           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1007           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1008     }                                                                          \
1009     KMP_FSYNC_ACQUIRED(l);                                                     \
1010   }
1011
1012 // Fast-path test tas lock
1013 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1014   {                                                                            \
1015     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1016     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1017     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1018     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1019          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1020   }
1021
1022 // Fast-path release tas lock
1023 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1024   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1025
1026 #if KMP_USE_FUTEX
1027
1028 #include <sys/syscall.h>
1029 #include <unistd.h>
1030 #ifndef FUTEX_WAIT
1031 #define FUTEX_WAIT 0
1032 #endif
1033 #ifndef FUTEX_WAKE
1034 #define FUTEX_WAKE 1
1035 #endif
1036
1037 // Fast-path acquire futex lock
1038 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1039   {                                                                            \
1040     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1041     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1042     KMP_MB();                                                                  \
1043     KMP_FSYNC_PREPARE(ftx);                                                    \
1044     kmp_int32 poll_val;                                                        \
1045     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1046                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1047                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1048       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1049       if (!cond) {                                                             \
1050         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1051                                          poll_val |                            \
1052                                              KMP_LOCK_BUSY(1, futex))) {       \
1053           continue;                                                            \
1054         }                                                                      \
1055         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1056       }                                                                        \
1057       kmp_int32 rc;                                                            \
1058       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1059                         NULL, NULL, 0)) != 0) {                                \
1060         continue;                                                              \
1061       }                                                                        \
1062       gtid_code |= 1;                                                          \
1063     }                                                                          \
1064     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1065   }
1066
1067 // Fast-path test futex lock
1068 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1069   {                                                                            \
1070     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1071     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1072                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1073       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1074       rc = TRUE;                                                               \
1075     } else {                                                                   \
1076       rc = FALSE;                                                              \
1077     }                                                                          \
1078   }
1079
1080 // Fast-path release futex lock
1081 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1082   {                                                                            \
1083     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1084     KMP_MB();                                                                  \
1085     KMP_FSYNC_RELEASING(ftx);                                                  \
1086     kmp_int32 poll_val =                                                       \
1087         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1088     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1089       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1090               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1091     }                                                                          \
1092     KMP_MB();                                                                  \
1093     KMP_YIELD_OVERSUB();                                                       \
1094   }
1095
1096 #endif // KMP_USE_FUTEX
1097
1098 #else // KMP_USE_DYNAMIC_LOCK
1099
1100 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1101                                                       ident_t const *loc,
1102                                                       kmp_int32 gtid) {
1103   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1104
1105   // Because of the double-check, the following load doesn't need to be volatile
1106   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1107
1108   if (lck == NULL) {
1109     void *idx;
1110
1111     // Allocate & initialize the lock.
1112     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1113     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1114     __kmp_init_user_lock_with_checks(lck);
1115     __kmp_set_user_lock_location(lck, loc);
1116 #if USE_ITT_BUILD
1117     __kmp_itt_critical_creating(lck);
1118 // __kmp_itt_critical_creating() should be called *before* the first usage
1119 // of underlying lock. It is the only place where we can guarantee it. There
1120 // are chances the lock will destroyed with no usage, but it is not a
1121 // problem, because this is not real event seen by user but rather setting
1122 // name for object (lock). See more details in kmp_itt.h.
1123 #endif /* USE_ITT_BUILD */
1124
1125     // Use a cmpxchg instruction to slam the start of the critical section with
1126     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1127     // and use the lock that the other thread allocated.
1128     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1129
1130     if (status == 0) {
1131 // Deallocate the lock and reload the value.
1132 #if USE_ITT_BUILD
1133       __kmp_itt_critical_destroyed(lck);
1134 // Let ITT know the lock is destroyed and the same memory location may be reused
1135 // for another purpose.
1136 #endif /* USE_ITT_BUILD */
1137       __kmp_destroy_user_lock_with_checks(lck);
1138       __kmp_user_lock_free(&idx, gtid, lck);
1139       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1140       KMP_DEBUG_ASSERT(lck != NULL);
1141     }
1142   }
1143   return lck;
1144 }
1145
1146 #endif // KMP_USE_DYNAMIC_LOCK
1147
1148 /*!
1149 @ingroup WORK_SHARING
1150 @param loc  source location information.
1151 @param global_tid  global thread number .
1152 @param crit identity of the critical section. This could be a pointer to a lock
1153 associated with the critical section, or some other suitably unique value.
1154
1155 Enter code protected by a `critical` construct.
1156 This function blocks until the executing thread can enter the critical section.
1157 */
1158 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1159                      kmp_critical_name *crit) {
1160 #if KMP_USE_DYNAMIC_LOCK
1161 #if OMPT_SUPPORT && OMPT_OPTIONAL
1162   OMPT_STORE_RETURN_ADDRESS(global_tid);
1163 #endif // OMPT_SUPPORT
1164   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1165 #else
1166   KMP_COUNT_BLOCK(OMP_CRITICAL);
1167 #if OMPT_SUPPORT && OMPT_OPTIONAL
1168   ompt_state_t prev_state = ompt_state_undefined;
1169   ompt_thread_info_t ti;
1170 #endif
1171   kmp_user_lock_p lck;
1172
1173   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1174
1175   // TODO: add THR_OVHD_STATE
1176
1177   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1178   KMP_CHECK_USER_LOCK_INIT();
1179
1180   if ((__kmp_user_lock_kind == lk_tas) &&
1181       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1182     lck = (kmp_user_lock_p)crit;
1183   }
1184 #if KMP_USE_FUTEX
1185   else if ((__kmp_user_lock_kind == lk_futex) &&
1186            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1187     lck = (kmp_user_lock_p)crit;
1188   }
1189 #endif
1190   else { // ticket, queuing or drdpa
1191     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1192   }
1193
1194   if (__kmp_env_consistency_check)
1195     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1196
1197 // since the critical directive binds to all threads, not just the current
1198 // team we have to check this even if we are in a serialized team.
1199 // also, even if we are the uber thread, we still have to conduct the lock,
1200 // as we have to contend with sibling threads.
1201
1202 #if USE_ITT_BUILD
1203   __kmp_itt_critical_acquiring(lck);
1204 #endif /* USE_ITT_BUILD */
1205 #if OMPT_SUPPORT && OMPT_OPTIONAL
1206   OMPT_STORE_RETURN_ADDRESS(gtid);
1207   void *codeptr_ra = NULL;
1208   if (ompt_enabled.enabled) {
1209     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1210     /* OMPT state update */
1211     prev_state = ti.state;
1212     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1213     ti.state = ompt_state_wait_critical;
1214
1215     /* OMPT event callback */
1216     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1217     if (ompt_enabled.ompt_callback_mutex_acquire) {
1218       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1219           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1220           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1221     }
1222   }
1223 #endif
1224   // Value of 'crit' should be good for using as a critical_id of the critical
1225   // section directive.
1226   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1227
1228 #if USE_ITT_BUILD
1229   __kmp_itt_critical_acquired(lck);
1230 #endif /* USE_ITT_BUILD */
1231 #if OMPT_SUPPORT && OMPT_OPTIONAL
1232   if (ompt_enabled.enabled) {
1233     /* OMPT state update */
1234     ti.state = prev_state;
1235     ti.wait_id = 0;
1236
1237     /* OMPT event callback */
1238     if (ompt_enabled.ompt_callback_mutex_acquired) {
1239       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1240           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1241     }
1242   }
1243 #endif
1244   KMP_POP_PARTITIONED_TIMER();
1245
1246   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1247   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1248 #endif // KMP_USE_DYNAMIC_LOCK
1249 }
1250
1251 #if KMP_USE_DYNAMIC_LOCK
1252
1253 // Converts the given hint to an internal lock implementation
1254 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1255 #if KMP_USE_TSX
1256 #define KMP_TSX_LOCK(seq) lockseq_##seq
1257 #else
1258 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1259 #endif
1260
1261 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1262 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1263 #else
1264 #define KMP_CPUINFO_RTM 0
1265 #endif
1266
1267   // Hints that do not require further logic
1268   if (hint & kmp_lock_hint_hle)
1269     return KMP_TSX_LOCK(hle);
1270   if (hint & kmp_lock_hint_rtm)
1271     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1272   if (hint & kmp_lock_hint_adaptive)
1273     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1274
1275   // Rule out conflicting hints first by returning the default lock
1276   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1277     return __kmp_user_lock_seq;
1278   if ((hint & omp_lock_hint_speculative) &&
1279       (hint & omp_lock_hint_nonspeculative))
1280     return __kmp_user_lock_seq;
1281
1282   // Do not even consider speculation when it appears to be contended
1283   if (hint & omp_lock_hint_contended)
1284     return lockseq_queuing;
1285
1286   // Uncontended lock without speculation
1287   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1288     return lockseq_tas;
1289
1290   // HLE lock for speculation
1291   if (hint & omp_lock_hint_speculative)
1292     return KMP_TSX_LOCK(hle);
1293
1294   return __kmp_user_lock_seq;
1295 }
1296
1297 #if OMPT_SUPPORT && OMPT_OPTIONAL
1298 #if KMP_USE_DYNAMIC_LOCK
1299 static kmp_mutex_impl_t
1300 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1301   if (user_lock) {
1302     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1303     case 0:
1304       break;
1305 #if KMP_USE_FUTEX
1306     case locktag_futex:
1307       return kmp_mutex_impl_queuing;
1308 #endif
1309     case locktag_tas:
1310       return kmp_mutex_impl_spin;
1311 #if KMP_USE_TSX
1312     case locktag_hle:
1313       return kmp_mutex_impl_speculative;
1314 #endif
1315     default:
1316       return kmp_mutex_impl_none;
1317     }
1318     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1319   }
1320   KMP_ASSERT(ilock);
1321   switch (ilock->type) {
1322 #if KMP_USE_TSX
1323   case locktag_adaptive:
1324   case locktag_rtm:
1325     return kmp_mutex_impl_speculative;
1326 #endif
1327   case locktag_nested_tas:
1328     return kmp_mutex_impl_spin;
1329 #if KMP_USE_FUTEX
1330   case locktag_nested_futex:
1331 #endif
1332   case locktag_ticket:
1333   case locktag_queuing:
1334   case locktag_drdpa:
1335   case locktag_nested_ticket:
1336   case locktag_nested_queuing:
1337   case locktag_nested_drdpa:
1338     return kmp_mutex_impl_queuing;
1339   default:
1340     return kmp_mutex_impl_none;
1341   }
1342 }
1343 #else
1344 // For locks without dynamic binding
1345 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1346   switch (__kmp_user_lock_kind) {
1347   case lk_tas:
1348     return kmp_mutex_impl_spin;
1349 #if KMP_USE_FUTEX
1350   case lk_futex:
1351 #endif
1352   case lk_ticket:
1353   case lk_queuing:
1354   case lk_drdpa:
1355     return kmp_mutex_impl_queuing;
1356 #if KMP_USE_TSX
1357   case lk_hle:
1358   case lk_rtm:
1359   case lk_adaptive:
1360     return kmp_mutex_impl_speculative;
1361 #endif
1362   default:
1363     return kmp_mutex_impl_none;
1364   }
1365 }
1366 #endif // KMP_USE_DYNAMIC_LOCK
1367 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1368
1369 /*!
1370 @ingroup WORK_SHARING
1371 @param loc  source location information.
1372 @param global_tid  global thread number.
1373 @param crit identity of the critical section. This could be a pointer to a lock
1374 associated with the critical section, or some other suitably unique value.
1375 @param hint the lock hint.
1376
1377 Enter code protected by a `critical` construct with a hint. The hint value is
1378 used to suggest a lock implementation. This function blocks until the executing
1379 thread can enter the critical section unless the hint suggests use of
1380 speculative execution and the hardware supports it.
1381 */
1382 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1383                                kmp_critical_name *crit, uint32_t hint) {
1384   KMP_COUNT_BLOCK(OMP_CRITICAL);
1385   kmp_user_lock_p lck;
1386 #if OMPT_SUPPORT && OMPT_OPTIONAL
1387   ompt_state_t prev_state = ompt_state_undefined;
1388   ompt_thread_info_t ti;
1389   // This is the case, if called from __kmpc_critical:
1390   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1391   if (!codeptr)
1392     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1393 #endif
1394
1395   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1396
1397   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1398   // Check if it is initialized.
1399   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1400   if (*lk == 0) {
1401     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1402     if (KMP_IS_D_LOCK(lckseq)) {
1403       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1404                                   KMP_GET_D_TAG(lckseq));
1405     } else {
1406       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1407     }
1408   }
1409   // Branch for accessing the actual lock object and set operation. This
1410   // branching is inevitable since this lock initialization does not follow the
1411   // normal dispatch path (lock table is not used).
1412   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1413     lck = (kmp_user_lock_p)lk;
1414     if (__kmp_env_consistency_check) {
1415       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1416                       __kmp_map_hint_to_lock(hint));
1417     }
1418 #if USE_ITT_BUILD
1419     __kmp_itt_critical_acquiring(lck);
1420 #endif
1421 #if OMPT_SUPPORT && OMPT_OPTIONAL
1422     if (ompt_enabled.enabled) {
1423       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1424       /* OMPT state update */
1425       prev_state = ti.state;
1426       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1427       ti.state = ompt_state_wait_critical;
1428
1429       /* OMPT event callback */
1430       if (ompt_enabled.ompt_callback_mutex_acquire) {
1431         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1432             ompt_mutex_critical, (unsigned int)hint,
1433             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1434             codeptr);
1435       }
1436     }
1437 #endif
1438 #if KMP_USE_INLINED_TAS
1439     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1440       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1441     } else
1442 #elif KMP_USE_INLINED_FUTEX
1443     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1444       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1445     } else
1446 #endif
1447     {
1448       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1449     }
1450   } else {
1451     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1452     lck = ilk->lock;
1453     if (__kmp_env_consistency_check) {
1454       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1455                       __kmp_map_hint_to_lock(hint));
1456     }
1457 #if USE_ITT_BUILD
1458     __kmp_itt_critical_acquiring(lck);
1459 #endif
1460 #if OMPT_SUPPORT && OMPT_OPTIONAL
1461     if (ompt_enabled.enabled) {
1462       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1463       /* OMPT state update */
1464       prev_state = ti.state;
1465       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1466       ti.state = ompt_state_wait_critical;
1467
1468       /* OMPT event callback */
1469       if (ompt_enabled.ompt_callback_mutex_acquire) {
1470         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1471             ompt_mutex_critical, (unsigned int)hint,
1472             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1473             codeptr);
1474       }
1475     }
1476 #endif
1477     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1478   }
1479   KMP_POP_PARTITIONED_TIMER();
1480
1481 #if USE_ITT_BUILD
1482   __kmp_itt_critical_acquired(lck);
1483 #endif /* USE_ITT_BUILD */
1484 #if OMPT_SUPPORT && OMPT_OPTIONAL
1485   if (ompt_enabled.enabled) {
1486     /* OMPT state update */
1487     ti.state = prev_state;
1488     ti.wait_id = 0;
1489
1490     /* OMPT event callback */
1491     if (ompt_enabled.ompt_callback_mutex_acquired) {
1492       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1493           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1494     }
1495   }
1496 #endif
1497
1498   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1499   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1500 } // __kmpc_critical_with_hint
1501
1502 #endif // KMP_USE_DYNAMIC_LOCK
1503
1504 /*!
1505 @ingroup WORK_SHARING
1506 @param loc  source location information.
1507 @param global_tid  global thread number .
1508 @param crit identity of the critical section. This could be a pointer to a lock
1509 associated with the critical section, or some other suitably unique value.
1510
1511 Leave a critical section, releasing any lock that was held during its execution.
1512 */
1513 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1514                          kmp_critical_name *crit) {
1515   kmp_user_lock_p lck;
1516
1517   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1518
1519 #if KMP_USE_DYNAMIC_LOCK
1520   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1521     lck = (kmp_user_lock_p)crit;
1522     KMP_ASSERT(lck != NULL);
1523     if (__kmp_env_consistency_check) {
1524       __kmp_pop_sync(global_tid, ct_critical, loc);
1525     }
1526 #if USE_ITT_BUILD
1527     __kmp_itt_critical_releasing(lck);
1528 #endif
1529 #if KMP_USE_INLINED_TAS
1530     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1531       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1532     } else
1533 #elif KMP_USE_INLINED_FUTEX
1534     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1535       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1536     } else
1537 #endif
1538     {
1539       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1540     }
1541   } else {
1542     kmp_indirect_lock_t *ilk =
1543         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1544     KMP_ASSERT(ilk != NULL);
1545     lck = ilk->lock;
1546     if (__kmp_env_consistency_check) {
1547       __kmp_pop_sync(global_tid, ct_critical, loc);
1548     }
1549 #if USE_ITT_BUILD
1550     __kmp_itt_critical_releasing(lck);
1551 #endif
1552     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1553   }
1554
1555 #else // KMP_USE_DYNAMIC_LOCK
1556
1557   if ((__kmp_user_lock_kind == lk_tas) &&
1558       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1559     lck = (kmp_user_lock_p)crit;
1560   }
1561 #if KMP_USE_FUTEX
1562   else if ((__kmp_user_lock_kind == lk_futex) &&
1563            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1564     lck = (kmp_user_lock_p)crit;
1565   }
1566 #endif
1567   else { // ticket, queuing or drdpa
1568     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1569   }
1570
1571   KMP_ASSERT(lck != NULL);
1572
1573   if (__kmp_env_consistency_check)
1574     __kmp_pop_sync(global_tid, ct_critical, loc);
1575
1576 #if USE_ITT_BUILD
1577   __kmp_itt_critical_releasing(lck);
1578 #endif /* USE_ITT_BUILD */
1579   // Value of 'crit' should be good for using as a critical_id of the critical
1580   // section directive.
1581   __kmp_release_user_lock_with_checks(lck, global_tid);
1582
1583 #endif // KMP_USE_DYNAMIC_LOCK
1584
1585 #if OMPT_SUPPORT && OMPT_OPTIONAL
1586   /* OMPT release event triggers after lock is released; place here to trigger
1587    * for all #if branches */
1588   OMPT_STORE_RETURN_ADDRESS(global_tid);
1589   if (ompt_enabled.ompt_callback_mutex_released) {
1590     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1591         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1592         OMPT_LOAD_RETURN_ADDRESS(0));
1593   }
1594 #endif
1595
1596   KMP_POP_PARTITIONED_TIMER();
1597   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1598 }
1599
1600 /*!
1601 @ingroup SYNCHRONIZATION
1602 @param loc source location information
1603 @param global_tid thread id.
1604 @return one if the thread should execute the master block, zero otherwise
1605
1606 Start execution of a combined barrier and master. The barrier is executed inside
1607 this function.
1608 */
1609 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1610   int status;
1611
1612   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1613
1614   if (!TCR_4(__kmp_init_parallel))
1615     __kmp_parallel_initialize();
1616
1617   __kmp_resume_if_soft_paused();
1618
1619   if (__kmp_env_consistency_check)
1620     __kmp_check_barrier(global_tid, ct_barrier, loc);
1621
1622 #if OMPT_SUPPORT
1623   ompt_frame_t *ompt_frame;
1624   if (ompt_enabled.enabled) {
1625     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1626     if (ompt_frame->enter_frame.ptr == NULL)
1627       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1628     OMPT_STORE_RETURN_ADDRESS(global_tid);
1629   }
1630 #endif
1631 #if USE_ITT_NOTIFY
1632   __kmp_threads[global_tid]->th.th_ident = loc;
1633 #endif
1634   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1635 #if OMPT_SUPPORT && OMPT_OPTIONAL
1636   if (ompt_enabled.enabled) {
1637     ompt_frame->enter_frame = ompt_data_none;
1638   }
1639 #endif
1640
1641   return (status != 0) ? 0 : 1;
1642 }
1643
1644 /*!
1645 @ingroup SYNCHRONIZATION
1646 @param loc source location information
1647 @param global_tid thread id.
1648
1649 Complete the execution of a combined barrier and master. This function should
1650 only be called at the completion of the <tt>master</tt> code. Other threads will
1651 still be waiting at the barrier and this call releases them.
1652 */
1653 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1654   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1655
1656   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1657 }
1658
1659 /*!
1660 @ingroup SYNCHRONIZATION
1661 @param loc source location information
1662 @param global_tid thread id.
1663 @return one if the thread should execute the master block, zero otherwise
1664
1665 Start execution of a combined barrier and master(nowait) construct.
1666 The barrier is executed inside this function.
1667 There is no equivalent "end" function, since the
1668 */
1669 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1670   kmp_int32 ret;
1671
1672   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1673
1674   if (!TCR_4(__kmp_init_parallel))
1675     __kmp_parallel_initialize();
1676
1677   __kmp_resume_if_soft_paused();
1678
1679   if (__kmp_env_consistency_check) {
1680     if (loc == 0) {
1681       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1682     }
1683     __kmp_check_barrier(global_tid, ct_barrier, loc);
1684   }
1685
1686 #if OMPT_SUPPORT
1687   ompt_frame_t *ompt_frame;
1688   if (ompt_enabled.enabled) {
1689     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1690     if (ompt_frame->enter_frame.ptr == NULL)
1691       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1692     OMPT_STORE_RETURN_ADDRESS(global_tid);
1693   }
1694 #endif
1695 #if USE_ITT_NOTIFY
1696   __kmp_threads[global_tid]->th.th_ident = loc;
1697 #endif
1698   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1699 #if OMPT_SUPPORT && OMPT_OPTIONAL
1700   if (ompt_enabled.enabled) {
1701     ompt_frame->enter_frame = ompt_data_none;
1702   }
1703 #endif
1704
1705   ret = __kmpc_master(loc, global_tid);
1706
1707   if (__kmp_env_consistency_check) {
1708     /*  there's no __kmpc_end_master called; so the (stats) */
1709     /*  actions of __kmpc_end_master are done here          */
1710
1711     if (global_tid < 0) {
1712       KMP_WARNING(ThreadIdentInvalid);
1713     }
1714     if (ret) {
1715       /* only one thread should do the pop since only */
1716       /* one did the push (see __kmpc_master())       */
1717
1718       __kmp_pop_sync(global_tid, ct_master, loc);
1719     }
1720   }
1721
1722   return (ret);
1723 }
1724
1725 /* The BARRIER for a SINGLE process section is always explicit   */
1726 /*!
1727 @ingroup WORK_SHARING
1728 @param loc  source location information
1729 @param global_tid  global thread number
1730 @return One if this thread should execute the single construct, zero otherwise.
1731
1732 Test whether to execute a <tt>single</tt> construct.
1733 There are no implicit barriers in the two "single" calls, rather the compiler
1734 should introduce an explicit barrier if it is required.
1735 */
1736
1737 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1738   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1739
1740   if (rc) {
1741     // We are going to execute the single statement, so we should count it.
1742     KMP_COUNT_BLOCK(OMP_SINGLE);
1743     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1744   }
1745
1746 #if OMPT_SUPPORT && OMPT_OPTIONAL
1747   kmp_info_t *this_thr = __kmp_threads[global_tid];
1748   kmp_team_t *team = this_thr->th.th_team;
1749   int tid = __kmp_tid_from_gtid(global_tid);
1750
1751   if (ompt_enabled.enabled) {
1752     if (rc) {
1753       if (ompt_enabled.ompt_callback_work) {
1754         ompt_callbacks.ompt_callback(ompt_callback_work)(
1755             ompt_work_single_executor, ompt_scope_begin,
1756             &(team->t.ompt_team_info.parallel_data),
1757             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1758             1, OMPT_GET_RETURN_ADDRESS(0));
1759       }
1760     } else {
1761       if (ompt_enabled.ompt_callback_work) {
1762         ompt_callbacks.ompt_callback(ompt_callback_work)(
1763             ompt_work_single_other, ompt_scope_begin,
1764             &(team->t.ompt_team_info.parallel_data),
1765             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1766             1, OMPT_GET_RETURN_ADDRESS(0));
1767         ompt_callbacks.ompt_callback(ompt_callback_work)(
1768             ompt_work_single_other, ompt_scope_end,
1769             &(team->t.ompt_team_info.parallel_data),
1770             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1771             1, OMPT_GET_RETURN_ADDRESS(0));
1772       }
1773     }
1774   }
1775 #endif
1776
1777   return rc;
1778 }
1779
1780 /*!
1781 @ingroup WORK_SHARING
1782 @param loc  source location information
1783 @param global_tid  global thread number
1784
1785 Mark the end of a <tt>single</tt> construct.  This function should
1786 only be called by the thread that executed the block of code protected
1787 by the `single` construct.
1788 */
1789 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1790   __kmp_exit_single(global_tid);
1791   KMP_POP_PARTITIONED_TIMER();
1792
1793 #if OMPT_SUPPORT && OMPT_OPTIONAL
1794   kmp_info_t *this_thr = __kmp_threads[global_tid];
1795   kmp_team_t *team = this_thr->th.th_team;
1796   int tid = __kmp_tid_from_gtid(global_tid);
1797
1798   if (ompt_enabled.ompt_callback_work) {
1799     ompt_callbacks.ompt_callback(ompt_callback_work)(
1800         ompt_work_single_executor, ompt_scope_end,
1801         &(team->t.ompt_team_info.parallel_data),
1802         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1803         OMPT_GET_RETURN_ADDRESS(0));
1804   }
1805 #endif
1806 }
1807
1808 /*!
1809 @ingroup WORK_SHARING
1810 @param loc Source location
1811 @param global_tid Global thread id
1812
1813 Mark the end of a statically scheduled loop.
1814 */
1815 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1816   KMP_POP_PARTITIONED_TIMER();
1817   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1818
1819 #if OMPT_SUPPORT && OMPT_OPTIONAL
1820   if (ompt_enabled.ompt_callback_work) {
1821     ompt_work_t ompt_work_type = ompt_work_loop;
1822     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1823     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1824     // Determine workshare type
1825     if (loc != NULL) {
1826       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1827         ompt_work_type = ompt_work_loop;
1828       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1829         ompt_work_type = ompt_work_sections;
1830       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1831         ompt_work_type = ompt_work_distribute;
1832       } else {
1833         // use default set above.
1834         // a warning about this case is provided in __kmpc_for_static_init
1835       }
1836       KMP_DEBUG_ASSERT(ompt_work_type);
1837     }
1838     ompt_callbacks.ompt_callback(ompt_callback_work)(
1839         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1840         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1841   }
1842 #endif
1843   if (__kmp_env_consistency_check)
1844     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1845 }
1846
1847 // User routines which take C-style arguments (call by value)
1848 // different from the Fortran equivalent routines
1849
1850 void ompc_set_num_threads(int arg) {
1851   // !!!!! TODO: check the per-task binding
1852   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1853 }
1854
1855 void ompc_set_dynamic(int flag) {
1856   kmp_info_t *thread;
1857
1858   /* For the thread-private implementation of the internal controls */
1859   thread = __kmp_entry_thread();
1860
1861   __kmp_save_internal_controls(thread);
1862
1863   set__dynamic(thread, flag ? TRUE : FALSE);
1864 }
1865
1866 void ompc_set_nested(int flag) {
1867   kmp_info_t *thread;
1868
1869   /* For the thread-private internal controls implementation */
1870   thread = __kmp_entry_thread();
1871
1872   __kmp_save_internal_controls(thread);
1873
1874   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1875 }
1876
1877 void ompc_set_max_active_levels(int max_active_levels) {
1878   /* TO DO */
1879   /* we want per-task implementation of this internal control */
1880
1881   /* For the per-thread internal controls implementation */
1882   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1883 }
1884
1885 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1886   // !!!!! TODO: check the per-task binding
1887   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1888 }
1889
1890 int ompc_get_ancestor_thread_num(int level) {
1891   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1892 }
1893
1894 int ompc_get_team_size(int level) {
1895   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1896 }
1897
1898 /* OpenMP 5.0 Affinity Format API */
1899
1900 void ompc_set_affinity_format(char const *format) {
1901   if (!__kmp_init_serial) {
1902     __kmp_serial_initialize();
1903   }
1904   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1905                          format, KMP_STRLEN(format) + 1);
1906 }
1907
1908 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1909   size_t format_size;
1910   if (!__kmp_init_serial) {
1911     __kmp_serial_initialize();
1912   }
1913   format_size = KMP_STRLEN(__kmp_affinity_format);
1914   if (buffer && size) {
1915     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1916                            format_size + 1);
1917   }
1918   return format_size;
1919 }
1920
1921 void ompc_display_affinity(char const *format) {
1922   int gtid;
1923   if (!TCR_4(__kmp_init_middle)) {
1924     __kmp_middle_initialize();
1925   }
1926   gtid = __kmp_get_gtid();
1927   __kmp_aux_display_affinity(gtid, format);
1928 }
1929
1930 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1931                              char const *format) {
1932   int gtid;
1933   size_t num_required;
1934   kmp_str_buf_t capture_buf;
1935   if (!TCR_4(__kmp_init_middle)) {
1936     __kmp_middle_initialize();
1937   }
1938   gtid = __kmp_get_gtid();
1939   __kmp_str_buf_init(&capture_buf);
1940   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1941   if (buffer && buf_size) {
1942     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1943                            capture_buf.used + 1);
1944   }
1945   __kmp_str_buf_free(&capture_buf);
1946   return num_required;
1947 }
1948
1949 void kmpc_set_stacksize(int arg) {
1950   // __kmp_aux_set_stacksize initializes the library if needed
1951   __kmp_aux_set_stacksize(arg);
1952 }
1953
1954 void kmpc_set_stacksize_s(size_t arg) {
1955   // __kmp_aux_set_stacksize initializes the library if needed
1956   __kmp_aux_set_stacksize(arg);
1957 }
1958
1959 void kmpc_set_blocktime(int arg) {
1960   int gtid, tid;
1961   kmp_info_t *thread;
1962
1963   gtid = __kmp_entry_gtid();
1964   tid = __kmp_tid_from_gtid(gtid);
1965   thread = __kmp_thread_from_gtid(gtid);
1966
1967   __kmp_aux_set_blocktime(arg, thread, tid);
1968 }
1969
1970 void kmpc_set_library(int arg) {
1971   // __kmp_user_set_library initializes the library if needed
1972   __kmp_user_set_library((enum library_type)arg);
1973 }
1974
1975 void kmpc_set_defaults(char const *str) {
1976   // __kmp_aux_set_defaults initializes the library if needed
1977   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1978 }
1979
1980 void kmpc_set_disp_num_buffers(int arg) {
1981   // ignore after initialization because some teams have already
1982   // allocated dispatch buffers
1983   if (__kmp_init_serial == 0 && arg > 0)
1984     __kmp_dispatch_num_buffers = arg;
1985 }
1986
1987 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1988 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1989   return -1;
1990 #else
1991   if (!TCR_4(__kmp_init_middle)) {
1992     __kmp_middle_initialize();
1993   }
1994   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1995 #endif
1996 }
1997
1998 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1999 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2000   return -1;
2001 #else
2002   if (!TCR_4(__kmp_init_middle)) {
2003     __kmp_middle_initialize();
2004   }
2005   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2006 #endif
2007 }
2008
2009 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2010 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2011   return -1;
2012 #else
2013   if (!TCR_4(__kmp_init_middle)) {
2014     __kmp_middle_initialize();
2015   }
2016   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2017 #endif
2018 }
2019
2020 /* -------------------------------------------------------------------------- */
2021 /*!
2022 @ingroup THREADPRIVATE
2023 @param loc       source location information
2024 @param gtid      global thread number
2025 @param cpy_size  size of the cpy_data buffer
2026 @param cpy_data  pointer to data to be copied
2027 @param cpy_func  helper function to call for copying data
2028 @param didit     flag variable: 1=single thread; 0=not single thread
2029
2030 __kmpc_copyprivate implements the interface for the private data broadcast
2031 needed for the copyprivate clause associated with a single region in an
2032 OpenMP<sup>*</sup> program (both C and Fortran).
2033 All threads participating in the parallel region call this routine.
2034 One of the threads (called the single thread) should have the <tt>didit</tt>
2035 variable set to 1 and all other threads should have that variable set to 0.
2036 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2037
2038 The OpenMP specification forbids the use of nowait on the single region when a
2039 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2040 barrier internally to avoid race conditions, so the code generation for the
2041 single region should avoid generating a barrier after the call to @ref
2042 __kmpc_copyprivate.
2043
2044 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2045 The <tt>loc</tt> parameter is a pointer to source location information.
2046
2047 Internal implementation: The single thread will first copy its descriptor
2048 address (cpy_data) to a team-private location, then the other threads will each
2049 call the function pointed to by the parameter cpy_func, which carries out the
2050 copy by copying the data using the cpy_data buffer.
2051
2052 The cpy_func routine used for the copy and the contents of the data area defined
2053 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2054 to be done. For instance, the cpy_data buffer can hold the actual data to be
2055 copied or it may hold a list of pointers to the data. The cpy_func routine must
2056 interpret the cpy_data buffer appropriately.
2057
2058 The interface to cpy_func is as follows:
2059 @code
2060 void cpy_func( void *destination, void *source )
2061 @endcode
2062 where void *destination is the cpy_data pointer for the thread being copied to
2063 and void *source is the cpy_data pointer for the thread being copied from.
2064 */
2065 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2066                         void *cpy_data, void (*cpy_func)(void *, void *),
2067                         kmp_int32 didit) {
2068   void **data_ptr;
2069
2070   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2071
2072   KMP_MB();
2073
2074   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2075
2076   if (__kmp_env_consistency_check) {
2077     if (loc == 0) {
2078       KMP_WARNING(ConstructIdentInvalid);
2079     }
2080   }
2081
2082   // ToDo: Optimize the following two barriers into some kind of split barrier
2083
2084   if (didit)
2085     *data_ptr = cpy_data;
2086
2087 #if OMPT_SUPPORT
2088   ompt_frame_t *ompt_frame;
2089   if (ompt_enabled.enabled) {
2090     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2091     if (ompt_frame->enter_frame.ptr == NULL)
2092       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2093     OMPT_STORE_RETURN_ADDRESS(gtid);
2094   }
2095 #endif
2096 /* This barrier is not a barrier region boundary */
2097 #if USE_ITT_NOTIFY
2098   __kmp_threads[gtid]->th.th_ident = loc;
2099 #endif
2100   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2101
2102   if (!didit)
2103     (*cpy_func)(cpy_data, *data_ptr);
2104
2105 // Consider next barrier a user-visible barrier for barrier region boundaries
2106 // Nesting checks are already handled by the single construct checks
2107
2108 #if OMPT_SUPPORT
2109   if (ompt_enabled.enabled) {
2110     OMPT_STORE_RETURN_ADDRESS(gtid);
2111   }
2112 #endif
2113 #if USE_ITT_NOTIFY
2114   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2115 // tasks can overwrite the location)
2116 #endif
2117   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2118 #if OMPT_SUPPORT && OMPT_OPTIONAL
2119   if (ompt_enabled.enabled) {
2120     ompt_frame->enter_frame = ompt_data_none;
2121   }
2122 #endif
2123 }
2124
2125 /* -------------------------------------------------------------------------- */
2126
2127 #define INIT_LOCK __kmp_init_user_lock_with_checks
2128 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2129 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2130 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2131 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2132 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2133   __kmp_acquire_nested_user_lock_with_checks_timed
2134 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2135 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2136 #define TEST_LOCK __kmp_test_user_lock_with_checks
2137 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2138 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2139 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2140
2141 // TODO: Make check abort messages use location info & pass it into
2142 // with_checks routines
2143
2144 #if KMP_USE_DYNAMIC_LOCK
2145
2146 // internal lock initializer
2147 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2148                                                     kmp_dyna_lockseq_t seq) {
2149   if (KMP_IS_D_LOCK(seq)) {
2150     KMP_INIT_D_LOCK(lock, seq);
2151 #if USE_ITT_BUILD
2152     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2153 #endif
2154   } else {
2155     KMP_INIT_I_LOCK(lock, seq);
2156 #if USE_ITT_BUILD
2157     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2158     __kmp_itt_lock_creating(ilk->lock, loc);
2159 #endif
2160   }
2161 }
2162
2163 // internal nest lock initializer
2164 static __forceinline void
2165 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2166                                kmp_dyna_lockseq_t seq) {
2167 #if KMP_USE_TSX
2168   // Don't have nested lock implementation for speculative locks
2169   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2170     seq = __kmp_user_lock_seq;
2171 #endif
2172   switch (seq) {
2173   case lockseq_tas:
2174     seq = lockseq_nested_tas;
2175     break;
2176 #if KMP_USE_FUTEX
2177   case lockseq_futex:
2178     seq = lockseq_nested_futex;
2179     break;
2180 #endif
2181   case lockseq_ticket:
2182     seq = lockseq_nested_ticket;
2183     break;
2184   case lockseq_queuing:
2185     seq = lockseq_nested_queuing;
2186     break;
2187   case lockseq_drdpa:
2188     seq = lockseq_nested_drdpa;
2189     break;
2190   default:
2191     seq = lockseq_nested_queuing;
2192   }
2193   KMP_INIT_I_LOCK(lock, seq);
2194 #if USE_ITT_BUILD
2195   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2196   __kmp_itt_lock_creating(ilk->lock, loc);
2197 #endif
2198 }
2199
2200 /* initialize the lock with a hint */
2201 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2202                                 uintptr_t hint) {
2203   KMP_DEBUG_ASSERT(__kmp_init_serial);
2204   if (__kmp_env_consistency_check && user_lock == NULL) {
2205     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2206   }
2207
2208   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2209
2210 #if OMPT_SUPPORT && OMPT_OPTIONAL
2211   // This is the case, if called from omp_init_lock_with_hint:
2212   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2213   if (!codeptr)
2214     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2215   if (ompt_enabled.ompt_callback_lock_init) {
2216     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2217         ompt_mutex_lock, (omp_lock_hint_t)hint,
2218         __ompt_get_mutex_impl_type(user_lock),
2219         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2220   }
2221 #endif
2222 }
2223
2224 /* initialize the lock with a hint */
2225 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2226                                      void **user_lock, uintptr_t hint) {
2227   KMP_DEBUG_ASSERT(__kmp_init_serial);
2228   if (__kmp_env_consistency_check && user_lock == NULL) {
2229     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2230   }
2231
2232   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2233
2234 #if OMPT_SUPPORT && OMPT_OPTIONAL
2235   // This is the case, if called from omp_init_lock_with_hint:
2236   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2237   if (!codeptr)
2238     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2239   if (ompt_enabled.ompt_callback_lock_init) {
2240     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2241         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2242         __ompt_get_mutex_impl_type(user_lock),
2243         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2244   }
2245 #endif
2246 }
2247
2248 #endif // KMP_USE_DYNAMIC_LOCK
2249
2250 /* initialize the lock */
2251 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2252 #if KMP_USE_DYNAMIC_LOCK
2253
2254   KMP_DEBUG_ASSERT(__kmp_init_serial);
2255   if (__kmp_env_consistency_check && user_lock == NULL) {
2256     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2257   }
2258   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2259
2260 #if OMPT_SUPPORT && OMPT_OPTIONAL
2261   // This is the case, if called from omp_init_lock_with_hint:
2262   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2263   if (!codeptr)
2264     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2265   if (ompt_enabled.ompt_callback_lock_init) {
2266     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2267         ompt_mutex_lock, omp_lock_hint_none,
2268         __ompt_get_mutex_impl_type(user_lock),
2269         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2270   }
2271 #endif
2272
2273 #else // KMP_USE_DYNAMIC_LOCK
2274
2275   static char const *const func = "omp_init_lock";
2276   kmp_user_lock_p lck;
2277   KMP_DEBUG_ASSERT(__kmp_init_serial);
2278
2279   if (__kmp_env_consistency_check) {
2280     if (user_lock == NULL) {
2281       KMP_FATAL(LockIsUninitialized, func);
2282     }
2283   }
2284
2285   KMP_CHECK_USER_LOCK_INIT();
2286
2287   if ((__kmp_user_lock_kind == lk_tas) &&
2288       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2289     lck = (kmp_user_lock_p)user_lock;
2290   }
2291 #if KMP_USE_FUTEX
2292   else if ((__kmp_user_lock_kind == lk_futex) &&
2293            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2294     lck = (kmp_user_lock_p)user_lock;
2295   }
2296 #endif
2297   else {
2298     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2299   }
2300   INIT_LOCK(lck);
2301   __kmp_set_user_lock_location(lck, loc);
2302
2303 #if OMPT_SUPPORT && OMPT_OPTIONAL
2304   // This is the case, if called from omp_init_lock_with_hint:
2305   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2306   if (!codeptr)
2307     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2308   if (ompt_enabled.ompt_callback_lock_init) {
2309     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2310         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2311         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2312   }
2313 #endif
2314
2315 #if USE_ITT_BUILD
2316   __kmp_itt_lock_creating(lck);
2317 #endif /* USE_ITT_BUILD */
2318
2319 #endif // KMP_USE_DYNAMIC_LOCK
2320 } // __kmpc_init_lock
2321
2322 /* initialize the lock */
2323 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2324 #if KMP_USE_DYNAMIC_LOCK
2325
2326   KMP_DEBUG_ASSERT(__kmp_init_serial);
2327   if (__kmp_env_consistency_check && user_lock == NULL) {
2328     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2329   }
2330   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2331
2332 #if OMPT_SUPPORT && OMPT_OPTIONAL
2333   // This is the case, if called from omp_init_lock_with_hint:
2334   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2335   if (!codeptr)
2336     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2337   if (ompt_enabled.ompt_callback_lock_init) {
2338     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2339         ompt_mutex_nest_lock, omp_lock_hint_none,
2340         __ompt_get_mutex_impl_type(user_lock),
2341         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2342   }
2343 #endif
2344
2345 #else // KMP_USE_DYNAMIC_LOCK
2346
2347   static char const *const func = "omp_init_nest_lock";
2348   kmp_user_lock_p lck;
2349   KMP_DEBUG_ASSERT(__kmp_init_serial);
2350
2351   if (__kmp_env_consistency_check) {
2352     if (user_lock == NULL) {
2353       KMP_FATAL(LockIsUninitialized, func);
2354     }
2355   }
2356
2357   KMP_CHECK_USER_LOCK_INIT();
2358
2359   if ((__kmp_user_lock_kind == lk_tas) &&
2360       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2361        OMP_NEST_LOCK_T_SIZE)) {
2362     lck = (kmp_user_lock_p)user_lock;
2363   }
2364 #if KMP_USE_FUTEX
2365   else if ((__kmp_user_lock_kind == lk_futex) &&
2366            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2367             OMP_NEST_LOCK_T_SIZE)) {
2368     lck = (kmp_user_lock_p)user_lock;
2369   }
2370 #endif
2371   else {
2372     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2373   }
2374
2375   INIT_NESTED_LOCK(lck);
2376   __kmp_set_user_lock_location(lck, loc);
2377
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379   // This is the case, if called from omp_init_lock_with_hint:
2380   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2381   if (!codeptr)
2382     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2383   if (ompt_enabled.ompt_callback_lock_init) {
2384     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2385         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2386         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2387   }
2388 #endif
2389
2390 #if USE_ITT_BUILD
2391   __kmp_itt_lock_creating(lck);
2392 #endif /* USE_ITT_BUILD */
2393
2394 #endif // KMP_USE_DYNAMIC_LOCK
2395 } // __kmpc_init_nest_lock
2396
2397 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2398 #if KMP_USE_DYNAMIC_LOCK
2399
2400 #if USE_ITT_BUILD
2401   kmp_user_lock_p lck;
2402   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2403     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2404   } else {
2405     lck = (kmp_user_lock_p)user_lock;
2406   }
2407   __kmp_itt_lock_destroyed(lck);
2408 #endif
2409 #if OMPT_SUPPORT && OMPT_OPTIONAL
2410   // This is the case, if called from omp_init_lock_with_hint:
2411   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2412   if (!codeptr)
2413     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2414   if (ompt_enabled.ompt_callback_lock_destroy) {
2415     kmp_user_lock_p lck;
2416     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2417       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2418     } else {
2419       lck = (kmp_user_lock_p)user_lock;
2420     }
2421     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2422         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2423   }
2424 #endif
2425   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2426 #else
2427   kmp_user_lock_p lck;
2428
2429   if ((__kmp_user_lock_kind == lk_tas) &&
2430       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2431     lck = (kmp_user_lock_p)user_lock;
2432   }
2433 #if KMP_USE_FUTEX
2434   else if ((__kmp_user_lock_kind == lk_futex) &&
2435            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2436     lck = (kmp_user_lock_p)user_lock;
2437   }
2438 #endif
2439   else {
2440     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2441   }
2442
2443 #if OMPT_SUPPORT && OMPT_OPTIONAL
2444   // This is the case, if called from omp_init_lock_with_hint:
2445   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2446   if (!codeptr)
2447     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2448   if (ompt_enabled.ompt_callback_lock_destroy) {
2449     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2450         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2451   }
2452 #endif
2453
2454 #if USE_ITT_BUILD
2455   __kmp_itt_lock_destroyed(lck);
2456 #endif /* USE_ITT_BUILD */
2457   DESTROY_LOCK(lck);
2458
2459   if ((__kmp_user_lock_kind == lk_tas) &&
2460       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2461     ;
2462   }
2463 #if KMP_USE_FUTEX
2464   else if ((__kmp_user_lock_kind == lk_futex) &&
2465            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2466     ;
2467   }
2468 #endif
2469   else {
2470     __kmp_user_lock_free(user_lock, gtid, lck);
2471   }
2472 #endif // KMP_USE_DYNAMIC_LOCK
2473 } // __kmpc_destroy_lock
2474
2475 /* destroy the lock */
2476 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2477 #if KMP_USE_DYNAMIC_LOCK
2478
2479 #if USE_ITT_BUILD
2480   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2481   __kmp_itt_lock_destroyed(ilk->lock);
2482 #endif
2483 #if OMPT_SUPPORT && OMPT_OPTIONAL
2484   // This is the case, if called from omp_init_lock_with_hint:
2485   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2486   if (!codeptr)
2487     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2488   if (ompt_enabled.ompt_callback_lock_destroy) {
2489     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2490         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2491   }
2492 #endif
2493   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2494
2495 #else // KMP_USE_DYNAMIC_LOCK
2496
2497   kmp_user_lock_p lck;
2498
2499   if ((__kmp_user_lock_kind == lk_tas) &&
2500       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2501        OMP_NEST_LOCK_T_SIZE)) {
2502     lck = (kmp_user_lock_p)user_lock;
2503   }
2504 #if KMP_USE_FUTEX
2505   else if ((__kmp_user_lock_kind == lk_futex) &&
2506            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2507             OMP_NEST_LOCK_T_SIZE)) {
2508     lck = (kmp_user_lock_p)user_lock;
2509   }
2510 #endif
2511   else {
2512     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2513   }
2514
2515 #if OMPT_SUPPORT && OMPT_OPTIONAL
2516   // This is the case, if called from omp_init_lock_with_hint:
2517   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2518   if (!codeptr)
2519     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2520   if (ompt_enabled.ompt_callback_lock_destroy) {
2521     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2522         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2523   }
2524 #endif
2525
2526 #if USE_ITT_BUILD
2527   __kmp_itt_lock_destroyed(lck);
2528 #endif /* USE_ITT_BUILD */
2529
2530   DESTROY_NESTED_LOCK(lck);
2531
2532   if ((__kmp_user_lock_kind == lk_tas) &&
2533       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2534        OMP_NEST_LOCK_T_SIZE)) {
2535     ;
2536   }
2537 #if KMP_USE_FUTEX
2538   else if ((__kmp_user_lock_kind == lk_futex) &&
2539            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2540             OMP_NEST_LOCK_T_SIZE)) {
2541     ;
2542   }
2543 #endif
2544   else {
2545     __kmp_user_lock_free(user_lock, gtid, lck);
2546   }
2547 #endif // KMP_USE_DYNAMIC_LOCK
2548 } // __kmpc_destroy_nest_lock
2549
2550 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2551   KMP_COUNT_BLOCK(OMP_set_lock);
2552 #if KMP_USE_DYNAMIC_LOCK
2553   int tag = KMP_EXTRACT_D_TAG(user_lock);
2554 #if USE_ITT_BUILD
2555   __kmp_itt_lock_acquiring(
2556       (kmp_user_lock_p)
2557           user_lock); // itt function will get to the right lock object.
2558 #endif
2559 #if OMPT_SUPPORT && OMPT_OPTIONAL
2560   // This is the case, if called from omp_init_lock_with_hint:
2561   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2562   if (!codeptr)
2563     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2564   if (ompt_enabled.ompt_callback_mutex_acquire) {
2565     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2566         ompt_mutex_lock, omp_lock_hint_none,
2567         __ompt_get_mutex_impl_type(user_lock),
2568         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2569   }
2570 #endif
2571 #if KMP_USE_INLINED_TAS
2572   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2573     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2574   } else
2575 #elif KMP_USE_INLINED_FUTEX
2576   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2577     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2578   } else
2579 #endif
2580   {
2581     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2582   }
2583 #if USE_ITT_BUILD
2584   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2585 #endif
2586 #if OMPT_SUPPORT && OMPT_OPTIONAL
2587   if (ompt_enabled.ompt_callback_mutex_acquired) {
2588     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2589         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2590   }
2591 #endif
2592
2593 #else // KMP_USE_DYNAMIC_LOCK
2594
2595   kmp_user_lock_p lck;
2596
2597   if ((__kmp_user_lock_kind == lk_tas) &&
2598       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2599     lck = (kmp_user_lock_p)user_lock;
2600   }
2601 #if KMP_USE_FUTEX
2602   else if ((__kmp_user_lock_kind == lk_futex) &&
2603            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2604     lck = (kmp_user_lock_p)user_lock;
2605   }
2606 #endif
2607   else {
2608     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2609   }
2610
2611 #if USE_ITT_BUILD
2612   __kmp_itt_lock_acquiring(lck);
2613 #endif /* USE_ITT_BUILD */
2614 #if OMPT_SUPPORT && OMPT_OPTIONAL
2615   // This is the case, if called from omp_init_lock_with_hint:
2616   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2617   if (!codeptr)
2618     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2619   if (ompt_enabled.ompt_callback_mutex_acquire) {
2620     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2621         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2622         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2623   }
2624 #endif
2625
2626   ACQUIRE_LOCK(lck, gtid);
2627
2628 #if USE_ITT_BUILD
2629   __kmp_itt_lock_acquired(lck);
2630 #endif /* USE_ITT_BUILD */
2631
2632 #if OMPT_SUPPORT && OMPT_OPTIONAL
2633   if (ompt_enabled.ompt_callback_mutex_acquired) {
2634     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2635         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2636   }
2637 #endif
2638
2639 #endif // KMP_USE_DYNAMIC_LOCK
2640 }
2641
2642 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2643 #if KMP_USE_DYNAMIC_LOCK
2644
2645 #if USE_ITT_BUILD
2646   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2647 #endif
2648 #if OMPT_SUPPORT && OMPT_OPTIONAL
2649   // This is the case, if called from omp_init_lock_with_hint:
2650   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2651   if (!codeptr)
2652     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2653   if (ompt_enabled.enabled) {
2654     if (ompt_enabled.ompt_callback_mutex_acquire) {
2655       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2656           ompt_mutex_nest_lock, omp_lock_hint_none,
2657           __ompt_get_mutex_impl_type(user_lock),
2658           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2659     }
2660   }
2661 #endif
2662   int acquire_status =
2663       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2664   (void) acquire_status;
2665 #if USE_ITT_BUILD
2666   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2667 #endif
2668
2669 #if OMPT_SUPPORT && OMPT_OPTIONAL
2670   if (ompt_enabled.enabled) {
2671     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2672       if (ompt_enabled.ompt_callback_mutex_acquired) {
2673         // lock_first
2674         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2675             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2676             codeptr);
2677       }
2678     } else {
2679       if (ompt_enabled.ompt_callback_nest_lock) {
2680         // lock_next
2681         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2682             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2683       }
2684     }
2685   }
2686 #endif
2687
2688 #else // KMP_USE_DYNAMIC_LOCK
2689   int acquire_status;
2690   kmp_user_lock_p lck;
2691
2692   if ((__kmp_user_lock_kind == lk_tas) &&
2693       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2694        OMP_NEST_LOCK_T_SIZE)) {
2695     lck = (kmp_user_lock_p)user_lock;
2696   }
2697 #if KMP_USE_FUTEX
2698   else if ((__kmp_user_lock_kind == lk_futex) &&
2699            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2700             OMP_NEST_LOCK_T_SIZE)) {
2701     lck = (kmp_user_lock_p)user_lock;
2702   }
2703 #endif
2704   else {
2705     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2706   }
2707
2708 #if USE_ITT_BUILD
2709   __kmp_itt_lock_acquiring(lck);
2710 #endif /* USE_ITT_BUILD */
2711 #if OMPT_SUPPORT && OMPT_OPTIONAL
2712   // This is the case, if called from omp_init_lock_with_hint:
2713   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2714   if (!codeptr)
2715     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2716   if (ompt_enabled.enabled) {
2717     if (ompt_enabled.ompt_callback_mutex_acquire) {
2718       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2719           ompt_mutex_nest_lock, omp_lock_hint_none,
2720           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2721           codeptr);
2722     }
2723   }
2724 #endif
2725
2726   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2727
2728 #if USE_ITT_BUILD
2729   __kmp_itt_lock_acquired(lck);
2730 #endif /* USE_ITT_BUILD */
2731
2732 #if OMPT_SUPPORT && OMPT_OPTIONAL
2733   if (ompt_enabled.enabled) {
2734     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2735       if (ompt_enabled.ompt_callback_mutex_acquired) {
2736         // lock_first
2737         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2738             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2739       }
2740     } else {
2741       if (ompt_enabled.ompt_callback_nest_lock) {
2742         // lock_next
2743         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2744             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2745       }
2746     }
2747   }
2748 #endif
2749
2750 #endif // KMP_USE_DYNAMIC_LOCK
2751 }
2752
2753 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2754 #if KMP_USE_DYNAMIC_LOCK
2755
2756   int tag = KMP_EXTRACT_D_TAG(user_lock);
2757 #if USE_ITT_BUILD
2758   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2759 #endif
2760 #if KMP_USE_INLINED_TAS
2761   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2762     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2763   } else
2764 #elif KMP_USE_INLINED_FUTEX
2765   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2766     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2767   } else
2768 #endif
2769   {
2770     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2771   }
2772
2773 #if OMPT_SUPPORT && OMPT_OPTIONAL
2774   // This is the case, if called from omp_init_lock_with_hint:
2775   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2776   if (!codeptr)
2777     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2778   if (ompt_enabled.ompt_callback_mutex_released) {
2779     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2780         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2781   }
2782 #endif
2783
2784 #else // KMP_USE_DYNAMIC_LOCK
2785
2786   kmp_user_lock_p lck;
2787
2788   /* Can't use serial interval since not block structured */
2789   /* release the lock */
2790
2791   if ((__kmp_user_lock_kind == lk_tas) &&
2792       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2793 #if KMP_OS_LINUX &&                                                            \
2794     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2795 // "fast" path implemented to fix customer performance issue
2796 #if USE_ITT_BUILD
2797     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2798 #endif /* USE_ITT_BUILD */
2799     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2800     KMP_MB();
2801
2802 #if OMPT_SUPPORT && OMPT_OPTIONAL
2803     // This is the case, if called from omp_init_lock_with_hint:
2804     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2805     if (!codeptr)
2806       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2807     if (ompt_enabled.ompt_callback_mutex_released) {
2808       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2809           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2810     }
2811 #endif
2812
2813     return;
2814 #else
2815     lck = (kmp_user_lock_p)user_lock;
2816 #endif
2817   }
2818 #if KMP_USE_FUTEX
2819   else if ((__kmp_user_lock_kind == lk_futex) &&
2820            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2821     lck = (kmp_user_lock_p)user_lock;
2822   }
2823 #endif
2824   else {
2825     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2826   }
2827
2828 #if USE_ITT_BUILD
2829   __kmp_itt_lock_releasing(lck);
2830 #endif /* USE_ITT_BUILD */
2831
2832   RELEASE_LOCK(lck, gtid);
2833
2834 #if OMPT_SUPPORT && OMPT_OPTIONAL
2835   // This is the case, if called from omp_init_lock_with_hint:
2836   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2837   if (!codeptr)
2838     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2839   if (ompt_enabled.ompt_callback_mutex_released) {
2840     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2841         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2842   }
2843 #endif
2844
2845 #endif // KMP_USE_DYNAMIC_LOCK
2846 }
2847
2848 /* release the lock */
2849 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2850 #if KMP_USE_DYNAMIC_LOCK
2851
2852 #if USE_ITT_BUILD
2853   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2854 #endif
2855   int release_status =
2856       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2857   (void) release_status;
2858
2859 #if OMPT_SUPPORT && OMPT_OPTIONAL
2860   // This is the case, if called from omp_init_lock_with_hint:
2861   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2862   if (!codeptr)
2863     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2864   if (ompt_enabled.enabled) {
2865     if (release_status == KMP_LOCK_RELEASED) {
2866       if (ompt_enabled.ompt_callback_mutex_released) {
2867         // release_lock_last
2868         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2869             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2870             codeptr);
2871       }
2872     } else if (ompt_enabled.ompt_callback_nest_lock) {
2873       // release_lock_prev
2874       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2875           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2876     }
2877   }
2878 #endif
2879
2880 #else // KMP_USE_DYNAMIC_LOCK
2881
2882   kmp_user_lock_p lck;
2883
2884   /* Can't use serial interval since not block structured */
2885
2886   if ((__kmp_user_lock_kind == lk_tas) &&
2887       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2888        OMP_NEST_LOCK_T_SIZE)) {
2889 #if KMP_OS_LINUX &&                                                            \
2890     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2891     // "fast" path implemented to fix customer performance issue
2892     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2893 #if USE_ITT_BUILD
2894     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2895 #endif /* USE_ITT_BUILD */
2896
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898     int release_status = KMP_LOCK_STILL_HELD;
2899 #endif
2900
2901     if (--(tl->lk.depth_locked) == 0) {
2902       TCW_4(tl->lk.poll, 0);
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904       release_status = KMP_LOCK_RELEASED;
2905 #endif
2906     }
2907     KMP_MB();
2908
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910     // This is the case, if called from omp_init_lock_with_hint:
2911     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2912     if (!codeptr)
2913       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2914     if (ompt_enabled.enabled) {
2915       if (release_status == KMP_LOCK_RELEASED) {
2916         if (ompt_enabled.ompt_callback_mutex_released) {
2917           // release_lock_last
2918           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2919               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2920         }
2921       } else if (ompt_enabled.ompt_callback_nest_lock) {
2922         // release_lock_previous
2923         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2924             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2925       }
2926     }
2927 #endif
2928
2929     return;
2930 #else
2931     lck = (kmp_user_lock_p)user_lock;
2932 #endif
2933   }
2934 #if KMP_USE_FUTEX
2935   else if ((__kmp_user_lock_kind == lk_futex) &&
2936            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2937             OMP_NEST_LOCK_T_SIZE)) {
2938     lck = (kmp_user_lock_p)user_lock;
2939   }
2940 #endif
2941   else {
2942     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2943   }
2944
2945 #if USE_ITT_BUILD
2946   __kmp_itt_lock_releasing(lck);
2947 #endif /* USE_ITT_BUILD */
2948
2949   int release_status;
2950   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2951 #if OMPT_SUPPORT && OMPT_OPTIONAL
2952   // This is the case, if called from omp_init_lock_with_hint:
2953   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2954   if (!codeptr)
2955     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2956   if (ompt_enabled.enabled) {
2957     if (release_status == KMP_LOCK_RELEASED) {
2958       if (ompt_enabled.ompt_callback_mutex_released) {
2959         // release_lock_last
2960         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2961             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2962       }
2963     } else if (ompt_enabled.ompt_callback_nest_lock) {
2964       // release_lock_previous
2965       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2966           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2967     }
2968   }
2969 #endif
2970
2971 #endif // KMP_USE_DYNAMIC_LOCK
2972 }
2973
2974 /* try to acquire the lock */
2975 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2976   KMP_COUNT_BLOCK(OMP_test_lock);
2977
2978 #if KMP_USE_DYNAMIC_LOCK
2979   int rc;
2980   int tag = KMP_EXTRACT_D_TAG(user_lock);
2981 #if USE_ITT_BUILD
2982   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2983 #endif
2984 #if OMPT_SUPPORT && OMPT_OPTIONAL
2985   // This is the case, if called from omp_init_lock_with_hint:
2986   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2987   if (!codeptr)
2988     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2989   if (ompt_enabled.ompt_callback_mutex_acquire) {
2990     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2991         ompt_mutex_lock, omp_lock_hint_none,
2992         __ompt_get_mutex_impl_type(user_lock),
2993         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2994   }
2995 #endif
2996 #if KMP_USE_INLINED_TAS
2997   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2998     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2999   } else
3000 #elif KMP_USE_INLINED_FUTEX
3001   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3002     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3003   } else
3004 #endif
3005   {
3006     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3007   }
3008   if (rc) {
3009 #if USE_ITT_BUILD
3010     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3011 #endif
3012 #if OMPT_SUPPORT && OMPT_OPTIONAL
3013     if (ompt_enabled.ompt_callback_mutex_acquired) {
3014       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3015           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3016     }
3017 #endif
3018     return FTN_TRUE;
3019   } else {
3020 #if USE_ITT_BUILD
3021     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3022 #endif
3023     return FTN_FALSE;
3024   }
3025
3026 #else // KMP_USE_DYNAMIC_LOCK
3027
3028   kmp_user_lock_p lck;
3029   int rc;
3030
3031   if ((__kmp_user_lock_kind == lk_tas) &&
3032       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3033     lck = (kmp_user_lock_p)user_lock;
3034   }
3035 #if KMP_USE_FUTEX
3036   else if ((__kmp_user_lock_kind == lk_futex) &&
3037            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3038     lck = (kmp_user_lock_p)user_lock;
3039   }
3040 #endif
3041   else {
3042     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3043   }
3044
3045 #if USE_ITT_BUILD
3046   __kmp_itt_lock_acquiring(lck);
3047 #endif /* USE_ITT_BUILD */
3048 #if OMPT_SUPPORT && OMPT_OPTIONAL
3049   // This is the case, if called from omp_init_lock_with_hint:
3050   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3051   if (!codeptr)
3052     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3053   if (ompt_enabled.ompt_callback_mutex_acquire) {
3054     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3055         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3056         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3057   }
3058 #endif
3059
3060   rc = TEST_LOCK(lck, gtid);
3061 #if USE_ITT_BUILD
3062   if (rc) {
3063     __kmp_itt_lock_acquired(lck);
3064   } else {
3065     __kmp_itt_lock_cancelled(lck);
3066   }
3067 #endif /* USE_ITT_BUILD */
3068 #if OMPT_SUPPORT && OMPT_OPTIONAL
3069   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3070     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3071         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3072   }
3073 #endif
3074
3075   return (rc ? FTN_TRUE : FTN_FALSE);
3076
3077 /* Can't use serial interval since not block structured */
3078
3079 #endif // KMP_USE_DYNAMIC_LOCK
3080 }
3081
3082 /* try to acquire the lock */
3083 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3084 #if KMP_USE_DYNAMIC_LOCK
3085   int rc;
3086 #if USE_ITT_BUILD
3087   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3088 #endif
3089 #if OMPT_SUPPORT && OMPT_OPTIONAL
3090   // This is the case, if called from omp_init_lock_with_hint:
3091   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3092   if (!codeptr)
3093     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3094   if (ompt_enabled.ompt_callback_mutex_acquire) {
3095     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3096         ompt_mutex_nest_lock, omp_lock_hint_none,
3097         __ompt_get_mutex_impl_type(user_lock),
3098         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3099   }
3100 #endif
3101   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3102 #if USE_ITT_BUILD
3103   if (rc) {
3104     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3105   } else {
3106     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3107   }
3108 #endif
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110   if (ompt_enabled.enabled && rc) {
3111     if (rc == 1) {
3112       if (ompt_enabled.ompt_callback_mutex_acquired) {
3113         // lock_first
3114         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3115             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3116             codeptr);
3117       }
3118     } else {
3119       if (ompt_enabled.ompt_callback_nest_lock) {
3120         // lock_next
3121         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3122             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3123       }
3124     }
3125   }
3126 #endif
3127   return rc;
3128
3129 #else // KMP_USE_DYNAMIC_LOCK
3130
3131   kmp_user_lock_p lck;
3132   int rc;
3133
3134   if ((__kmp_user_lock_kind == lk_tas) &&
3135       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3136        OMP_NEST_LOCK_T_SIZE)) {
3137     lck = (kmp_user_lock_p)user_lock;
3138   }
3139 #if KMP_USE_FUTEX
3140   else if ((__kmp_user_lock_kind == lk_futex) &&
3141            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3142             OMP_NEST_LOCK_T_SIZE)) {
3143     lck = (kmp_user_lock_p)user_lock;
3144   }
3145 #endif
3146   else {
3147     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3148   }
3149
3150 #if USE_ITT_BUILD
3151   __kmp_itt_lock_acquiring(lck);
3152 #endif /* USE_ITT_BUILD */
3153
3154 #if OMPT_SUPPORT && OMPT_OPTIONAL
3155   // This is the case, if called from omp_init_lock_with_hint:
3156   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3157   if (!codeptr)
3158     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3159   if (ompt_enabled.enabled) &&
3160         ompt_enabled.ompt_callback_mutex_acquire) {
3161       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3162           ompt_mutex_nest_lock, omp_lock_hint_none,
3163           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3164           codeptr);
3165     }
3166 #endif
3167
3168   rc = TEST_NESTED_LOCK(lck, gtid);
3169 #if USE_ITT_BUILD
3170   if (rc) {
3171     __kmp_itt_lock_acquired(lck);
3172   } else {
3173     __kmp_itt_lock_cancelled(lck);
3174   }
3175 #endif /* USE_ITT_BUILD */
3176 #if OMPT_SUPPORT && OMPT_OPTIONAL
3177   if (ompt_enabled.enabled && rc) {
3178     if (rc == 1) {
3179       if (ompt_enabled.ompt_callback_mutex_acquired) {
3180         // lock_first
3181         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3182             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3183       }
3184     } else {
3185       if (ompt_enabled.ompt_callback_nest_lock) {
3186         // lock_next
3187         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3188             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3189       }
3190     }
3191   }
3192 #endif
3193   return rc;
3194
3195 /* Can't use serial interval since not block structured */
3196
3197 #endif // KMP_USE_DYNAMIC_LOCK
3198 }
3199
3200 // Interface to fast scalable reduce methods routines
3201
3202 // keep the selected method in a thread local structure for cross-function
3203 // usage: will be used in __kmpc_end_reduce* functions;
3204 // another solution: to re-determine the method one more time in
3205 // __kmpc_end_reduce* functions (new prototype required then)
3206 // AT: which solution is better?
3207 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3208   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3209
3210 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3211   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3212
3213 // description of the packed_reduction_method variable: look at the macros in
3214 // kmp.h
3215
3216 // used in a critical section reduce block
3217 static __forceinline void
3218 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3219                                           kmp_critical_name *crit) {
3220
3221   // this lock was visible to a customer and to the threading profile tool as a
3222   // serial overhead span (although it's used for an internal purpose only)
3223   //            why was it visible in previous implementation?
3224   //            should we keep it visible in new reduce block?
3225   kmp_user_lock_p lck;
3226
3227 #if KMP_USE_DYNAMIC_LOCK
3228
3229   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3230   // Check if it is initialized.
3231   if (*lk == 0) {
3232     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3233       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3234                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3235     } else {
3236       __kmp_init_indirect_csptr(crit, loc, global_tid,
3237                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3238     }
3239   }
3240   // Branch for accessing the actual lock object and set operation. This
3241   // branching is inevitable since this lock initialization does not follow the
3242   // normal dispatch path (lock table is not used).
3243   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3244     lck = (kmp_user_lock_p)lk;
3245     KMP_DEBUG_ASSERT(lck != NULL);
3246     if (__kmp_env_consistency_check) {
3247       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3248     }
3249     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3250   } else {
3251     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3252     lck = ilk->lock;
3253     KMP_DEBUG_ASSERT(lck != NULL);
3254     if (__kmp_env_consistency_check) {
3255       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3256     }
3257     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3258   }
3259
3260 #else // KMP_USE_DYNAMIC_LOCK
3261
3262   // We know that the fast reduction code is only emitted by Intel compilers
3263   // with 32 byte critical sections. If there isn't enough space, then we
3264   // have to use a pointer.
3265   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3266     lck = (kmp_user_lock_p)crit;
3267   } else {
3268     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3269   }
3270   KMP_DEBUG_ASSERT(lck != NULL);
3271
3272   if (__kmp_env_consistency_check)
3273     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3274
3275   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3276
3277 #endif // KMP_USE_DYNAMIC_LOCK
3278 }
3279
3280 // used in a critical section reduce block
3281 static __forceinline void
3282 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3283                                         kmp_critical_name *crit) {
3284
3285   kmp_user_lock_p lck;
3286
3287 #if KMP_USE_DYNAMIC_LOCK
3288
3289   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3290     lck = (kmp_user_lock_p)crit;
3291     if (__kmp_env_consistency_check)
3292       __kmp_pop_sync(global_tid, ct_critical, loc);
3293     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3294   } else {
3295     kmp_indirect_lock_t *ilk =
3296         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3297     if (__kmp_env_consistency_check)
3298       __kmp_pop_sync(global_tid, ct_critical, loc);
3299     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3300   }
3301
3302 #else // KMP_USE_DYNAMIC_LOCK
3303
3304   // We know that the fast reduction code is only emitted by Intel compilers
3305   // with 32 byte critical sections. If there isn't enough space, then we have
3306   // to use a pointer.
3307   if (__kmp_base_user_lock_size > 32) {
3308     lck = *((kmp_user_lock_p *)crit);
3309     KMP_ASSERT(lck != NULL);
3310   } else {
3311     lck = (kmp_user_lock_p)crit;
3312   }
3313
3314   if (__kmp_env_consistency_check)
3315     __kmp_pop_sync(global_tid, ct_critical, loc);
3316
3317   __kmp_release_user_lock_with_checks(lck, global_tid);
3318
3319 #endif // KMP_USE_DYNAMIC_LOCK
3320 } // __kmp_end_critical_section_reduce_block
3321
3322 static __forceinline int
3323 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3324                                      int *task_state) {
3325   kmp_team_t *team;
3326
3327   // Check if we are inside the teams construct?
3328   if (th->th.th_teams_microtask) {
3329     *team_p = team = th->th.th_team;
3330     if (team->t.t_level == th->th.th_teams_level) {
3331       // This is reduction at teams construct.
3332       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3333       // Let's swap teams temporarily for the reduction.
3334       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3335       th->th.th_team = team->t.t_parent;
3336       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3337       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3338       *task_state = th->th.th_task_state;
3339       th->th.th_task_state = 0;
3340
3341       return 1;
3342     }
3343   }
3344   return 0;
3345 }
3346
3347 static __forceinline void
3348 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3349   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3350   th->th.th_info.ds.ds_tid = 0;
3351   th->th.th_team = team;
3352   th->th.th_team_nproc = team->t.t_nproc;
3353   th->th.th_task_team = team->t.t_task_team[task_state];
3354   th->th.th_task_state = task_state;
3355 }
3356
3357 /* 2.a.i. Reduce Block without a terminating barrier */
3358 /*!
3359 @ingroup SYNCHRONIZATION
3360 @param loc source location information
3361 @param global_tid global thread number
3362 @param num_vars number of items (variables) to be reduced
3363 @param reduce_size size of data in bytes to be reduced
3364 @param reduce_data pointer to data to be reduced
3365 @param reduce_func callback function providing reduction operation on two
3366 operands and returning result of reduction in lhs_data
3367 @param lck pointer to the unique lock data structure
3368 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3369 threads if atomic reduction needed
3370
3371 The nowait version is used for a reduce clause with the nowait argument.
3372 */
3373 kmp_int32
3374 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3375                      size_t reduce_size, void *reduce_data,
3376                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3377                      kmp_critical_name *lck) {
3378
3379   KMP_COUNT_BLOCK(REDUCE_nowait);
3380   int retval = 0;
3381   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3382   kmp_info_t *th;
3383   kmp_team_t *team;
3384   int teams_swapped = 0, task_state;
3385   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3386
3387   // why do we need this initialization here at all?
3388   // Reduction clause can not be used as a stand-alone directive.
3389
3390   // do not call __kmp_serial_initialize(), it will be called by
3391   // __kmp_parallel_initialize() if needed
3392   // possible detection of false-positive race by the threadchecker ???
3393   if (!TCR_4(__kmp_init_parallel))
3394     __kmp_parallel_initialize();
3395
3396   __kmp_resume_if_soft_paused();
3397
3398 // check correctness of reduce block nesting
3399 #if KMP_USE_DYNAMIC_LOCK
3400   if (__kmp_env_consistency_check)
3401     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3402 #else
3403   if (__kmp_env_consistency_check)
3404     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3405 #endif
3406
3407   th = __kmp_thread_from_gtid(global_tid);
3408   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3409
3410   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3411   // the value should be kept in a variable
3412   // the variable should be either a construct-specific or thread-specific
3413   // property, not a team specific property
3414   //     (a thread can reach the next reduce block on the next construct, reduce
3415   //     method may differ on the next construct)
3416   // an ident_t "loc" parameter could be used as a construct-specific property
3417   // (what if loc == 0?)
3418   //     (if both construct-specific and team-specific variables were shared,
3419   //     then unness extra syncs should be needed)
3420   // a thread-specific variable is better regarding two issues above (next
3421   // construct and extra syncs)
3422   // a thread-specific "th_local.reduction_method" variable is used currently
3423   // each thread executes 'determine' and 'set' lines (no need to execute by one
3424   // thread, to avoid unness extra syncs)
3425
3426   packed_reduction_method = __kmp_determine_reduction_method(
3427       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3428   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3429
3430   if (packed_reduction_method == critical_reduce_block) {
3431
3432     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3433     retval = 1;
3434
3435   } else if (packed_reduction_method == empty_reduce_block) {
3436
3437     // usage: if team size == 1, no synchronization is required ( Intel
3438     // platforms only )
3439     retval = 1;
3440
3441   } else if (packed_reduction_method == atomic_reduce_block) {
3442
3443     retval = 2;
3444
3445     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3446     // won't be called by the code gen)
3447     //     (it's not quite good, because the checking block has been closed by
3448     //     this 'pop',
3449     //      but atomic operation has not been executed yet, will be executed
3450     //      slightly later, literally on next instruction)
3451     if (__kmp_env_consistency_check)
3452       __kmp_pop_sync(global_tid, ct_reduce, loc);
3453
3454   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3455                                    tree_reduce_block)) {
3456
3457 // AT: performance issue: a real barrier here
3458 // AT:     (if master goes slow, other threads are blocked here waiting for the
3459 // master to come and release them)
3460 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3461 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3462 // be confusing to a customer)
3463 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3464 // might go faster and be more in line with sense of NOWAIT
3465 // AT: TO DO: do epcc test and compare times
3466
3467 // this barrier should be invisible to a customer and to the threading profile
3468 // tool (it's neither a terminating barrier nor customer's code, it's
3469 // used for an internal purpose)
3470 #if OMPT_SUPPORT
3471     // JP: can this barrier potentially leed to task scheduling?
3472     // JP: as long as there is a barrier in the implementation, OMPT should and
3473     // will provide the barrier events
3474     //         so we set-up the necessary frame/return addresses.
3475     ompt_frame_t *ompt_frame;
3476     if (ompt_enabled.enabled) {
3477       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3478       if (ompt_frame->enter_frame.ptr == NULL)
3479         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3480       OMPT_STORE_RETURN_ADDRESS(global_tid);
3481     }
3482 #endif
3483 #if USE_ITT_NOTIFY
3484     __kmp_threads[global_tid]->th.th_ident = loc;
3485 #endif
3486     retval =
3487         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3488                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3489     retval = (retval != 0) ? (0) : (1);
3490 #if OMPT_SUPPORT && OMPT_OPTIONAL
3491     if (ompt_enabled.enabled) {
3492       ompt_frame->enter_frame = ompt_data_none;
3493     }
3494 #endif
3495
3496     // all other workers except master should do this pop here
3497     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3498     if (__kmp_env_consistency_check) {
3499       if (retval == 0) {
3500         __kmp_pop_sync(global_tid, ct_reduce, loc);
3501       }
3502     }
3503
3504   } else {
3505
3506     // should never reach this block
3507     KMP_ASSERT(0); // "unexpected method"
3508   }
3509   if (teams_swapped) {
3510     __kmp_restore_swapped_teams(th, team, task_state);
3511   }
3512   KA_TRACE(
3513       10,
3514       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3515        global_tid, packed_reduction_method, retval));
3516
3517   return retval;
3518 }
3519
3520 /*!
3521 @ingroup SYNCHRONIZATION
3522 @param loc source location information
3523 @param global_tid global thread id.
3524 @param lck pointer to the unique lock data structure
3525
3526 Finish the execution of a reduce nowait.
3527 */
3528 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3529                               kmp_critical_name *lck) {
3530
3531   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3532
3533   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3534
3535   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3536
3537   if (packed_reduction_method == critical_reduce_block) {
3538
3539     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3540
3541   } else if (packed_reduction_method == empty_reduce_block) {
3542
3543     // usage: if team size == 1, no synchronization is required ( on Intel
3544     // platforms only )
3545
3546   } else if (packed_reduction_method == atomic_reduce_block) {
3547
3548     // neither master nor other workers should get here
3549     //     (code gen does not generate this call in case 2: atomic reduce block)
3550     // actually it's better to remove this elseif at all;
3551     // after removal this value will checked by the 'else' and will assert
3552
3553   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3554                                    tree_reduce_block)) {
3555
3556     // only master gets here
3557
3558   } else {
3559
3560     // should never reach this block
3561     KMP_ASSERT(0); // "unexpected method"
3562   }
3563
3564   if (__kmp_env_consistency_check)
3565     __kmp_pop_sync(global_tid, ct_reduce, loc);
3566
3567   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3568                 global_tid, packed_reduction_method));
3569
3570   return;
3571 }
3572
3573 /* 2.a.ii. Reduce Block with a terminating barrier */
3574
3575 /*!
3576 @ingroup SYNCHRONIZATION
3577 @param loc source location information
3578 @param global_tid global thread number
3579 @param num_vars number of items (variables) to be reduced
3580 @param reduce_size size of data in bytes to be reduced
3581 @param reduce_data pointer to data to be reduced
3582 @param reduce_func callback function providing reduction operation on two
3583 operands and returning result of reduction in lhs_data
3584 @param lck pointer to the unique lock data structure
3585 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3586 threads if atomic reduction needed
3587
3588 A blocking reduce that includes an implicit barrier.
3589 */
3590 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3591                         size_t reduce_size, void *reduce_data,
3592                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3593                         kmp_critical_name *lck) {
3594   KMP_COUNT_BLOCK(REDUCE_wait);
3595   int retval = 0;
3596   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3597   kmp_info_t *th;
3598   kmp_team_t *team;
3599   int teams_swapped = 0, task_state;
3600
3601   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3602
3603   // why do we need this initialization here at all?
3604   // Reduction clause can not be a stand-alone directive.
3605
3606   // do not call __kmp_serial_initialize(), it will be called by
3607   // __kmp_parallel_initialize() if needed
3608   // possible detection of false-positive race by the threadchecker ???
3609   if (!TCR_4(__kmp_init_parallel))
3610     __kmp_parallel_initialize();
3611
3612   __kmp_resume_if_soft_paused();
3613
3614 // check correctness of reduce block nesting
3615 #if KMP_USE_DYNAMIC_LOCK
3616   if (__kmp_env_consistency_check)
3617     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3618 #else
3619   if (__kmp_env_consistency_check)
3620     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3621 #endif
3622
3623   th = __kmp_thread_from_gtid(global_tid);
3624   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3625
3626   packed_reduction_method = __kmp_determine_reduction_method(
3627       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3628   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3629
3630   if (packed_reduction_method == critical_reduce_block) {
3631
3632     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3633     retval = 1;
3634
3635   } else if (packed_reduction_method == empty_reduce_block) {
3636
3637     // usage: if team size == 1, no synchronization is required ( Intel
3638     // platforms only )
3639     retval = 1;
3640
3641   } else if (packed_reduction_method == atomic_reduce_block) {
3642
3643     retval = 2;
3644
3645   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3646                                    tree_reduce_block)) {
3647
3648 // case tree_reduce_block:
3649 // this barrier should be visible to a customer and to the threading profile
3650 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3651 #if OMPT_SUPPORT
3652     ompt_frame_t *ompt_frame;
3653     if (ompt_enabled.enabled) {
3654       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3655       if (ompt_frame->enter_frame.ptr == NULL)
3656         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3657       OMPT_STORE_RETURN_ADDRESS(global_tid);
3658     }
3659 #endif
3660 #if USE_ITT_NOTIFY
3661     __kmp_threads[global_tid]->th.th_ident =
3662         loc; // needed for correct notification of frames
3663 #endif
3664     retval =
3665         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3666                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3667     retval = (retval != 0) ? (0) : (1);
3668 #if OMPT_SUPPORT && OMPT_OPTIONAL
3669     if (ompt_enabled.enabled) {
3670       ompt_frame->enter_frame = ompt_data_none;
3671     }
3672 #endif
3673
3674     // all other workers except master should do this pop here
3675     // ( none of other workers except master will enter __kmpc_end_reduce() )
3676     if (__kmp_env_consistency_check) {
3677       if (retval == 0) { // 0: all other workers; 1: master
3678         __kmp_pop_sync(global_tid, ct_reduce, loc);
3679       }
3680     }
3681
3682   } else {
3683
3684     // should never reach this block
3685     KMP_ASSERT(0); // "unexpected method"
3686   }
3687   if (teams_swapped) {
3688     __kmp_restore_swapped_teams(th, team, task_state);
3689   }
3690
3691   KA_TRACE(10,
3692            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3693             global_tid, packed_reduction_method, retval));
3694   return retval;
3695 }
3696
3697 /*!
3698 @ingroup SYNCHRONIZATION
3699 @param loc source location information
3700 @param global_tid global thread id.
3701 @param lck pointer to the unique lock data structure
3702
3703 Finish the execution of a blocking reduce.
3704 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3705 start function.
3706 */
3707 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3708                        kmp_critical_name *lck) {
3709
3710   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3711   kmp_info_t *th;
3712   kmp_team_t *team;
3713   int teams_swapped = 0, task_state;
3714
3715   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3716
3717   th = __kmp_thread_from_gtid(global_tid);
3718   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3719
3720   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3721
3722   // this barrier should be visible to a customer and to the threading profile
3723   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3724
3725   if (packed_reduction_method == critical_reduce_block) {
3726     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3727
3728 // TODO: implicit barrier: should be exposed
3729 #if OMPT_SUPPORT
3730     ompt_frame_t *ompt_frame;
3731     if (ompt_enabled.enabled) {
3732       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3733       if (ompt_frame->enter_frame.ptr == NULL)
3734         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3735       OMPT_STORE_RETURN_ADDRESS(global_tid);
3736     }
3737 #endif
3738 #if USE_ITT_NOTIFY
3739     __kmp_threads[global_tid]->th.th_ident = loc;
3740 #endif
3741     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3742 #if OMPT_SUPPORT && OMPT_OPTIONAL
3743     if (ompt_enabled.enabled) {
3744       ompt_frame->enter_frame = ompt_data_none;
3745     }
3746 #endif
3747
3748   } else if (packed_reduction_method == empty_reduce_block) {
3749
3750 // usage: if team size==1, no synchronization is required (Intel platforms only)
3751
3752 // TODO: implicit barrier: should be exposed
3753 #if OMPT_SUPPORT
3754     ompt_frame_t *ompt_frame;
3755     if (ompt_enabled.enabled) {
3756       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3757       if (ompt_frame->enter_frame.ptr == NULL)
3758         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3759       OMPT_STORE_RETURN_ADDRESS(global_tid);
3760     }
3761 #endif
3762 #if USE_ITT_NOTIFY
3763     __kmp_threads[global_tid]->th.th_ident = loc;
3764 #endif
3765     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3766 #if OMPT_SUPPORT && OMPT_OPTIONAL
3767     if (ompt_enabled.enabled) {
3768       ompt_frame->enter_frame = ompt_data_none;
3769     }
3770 #endif
3771
3772   } else if (packed_reduction_method == atomic_reduce_block) {
3773
3774 #if OMPT_SUPPORT
3775     ompt_frame_t *ompt_frame;
3776     if (ompt_enabled.enabled) {
3777       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3778       if (ompt_frame->enter_frame.ptr == NULL)
3779         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3780       OMPT_STORE_RETURN_ADDRESS(global_tid);
3781     }
3782 #endif
3783 // TODO: implicit barrier: should be exposed
3784 #if USE_ITT_NOTIFY
3785     __kmp_threads[global_tid]->th.th_ident = loc;
3786 #endif
3787     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3788 #if OMPT_SUPPORT && OMPT_OPTIONAL
3789     if (ompt_enabled.enabled) {
3790       ompt_frame->enter_frame = ompt_data_none;
3791     }
3792 #endif
3793
3794   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3795                                    tree_reduce_block)) {
3796
3797     // only master executes here (master releases all other workers)
3798     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3799                             global_tid);
3800
3801   } else {
3802
3803     // should never reach this block
3804     KMP_ASSERT(0); // "unexpected method"
3805   }
3806   if (teams_swapped) {
3807     __kmp_restore_swapped_teams(th, team, task_state);
3808   }
3809
3810   if (__kmp_env_consistency_check)
3811     __kmp_pop_sync(global_tid, ct_reduce, loc);
3812
3813   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3814                 global_tid, packed_reduction_method));
3815
3816   return;
3817 }
3818
3819 #undef __KMP_GET_REDUCTION_METHOD
3820 #undef __KMP_SET_REDUCTION_METHOD
3821
3822 /* end of interface to fast scalable reduce routines */
3823
3824 kmp_uint64 __kmpc_get_taskid() {
3825
3826   kmp_int32 gtid;
3827   kmp_info_t *thread;
3828
3829   gtid = __kmp_get_gtid();
3830   if (gtid < 0) {
3831     return 0;
3832   }
3833   thread = __kmp_thread_from_gtid(gtid);
3834   return thread->th.th_current_task->td_task_id;
3835
3836 } // __kmpc_get_taskid
3837
3838 kmp_uint64 __kmpc_get_parent_taskid() {
3839
3840   kmp_int32 gtid;
3841   kmp_info_t *thread;
3842   kmp_taskdata_t *parent_task;
3843
3844   gtid = __kmp_get_gtid();
3845   if (gtid < 0) {
3846     return 0;
3847   }
3848   thread = __kmp_thread_from_gtid(gtid);
3849   parent_task = thread->th.th_current_task->td_parent;
3850   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3851
3852 } // __kmpc_get_parent_taskid
3853
3854 /*!
3855 @ingroup WORK_SHARING
3856 @param loc  source location information.
3857 @param gtid  global thread number.
3858 @param num_dims  number of associated doacross loops.
3859 @param dims  info on loops bounds.
3860
3861 Initialize doacross loop information.
3862 Expect compiler send us inclusive bounds,
3863 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3864 */
3865 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3866                           const struct kmp_dim *dims) {
3867   int j, idx;
3868   kmp_int64 last, trace_count;
3869   kmp_info_t *th = __kmp_threads[gtid];
3870   kmp_team_t *team = th->th.th_team;
3871   kmp_uint32 *flags;
3872   kmp_disp_t *pr_buf = th->th.th_dispatch;
3873   dispatch_shared_info_t *sh_buf;
3874
3875   KA_TRACE(
3876       20,
3877       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3878        gtid, num_dims, !team->t.t_serialized));
3879   KMP_DEBUG_ASSERT(dims != NULL);
3880   KMP_DEBUG_ASSERT(num_dims > 0);
3881
3882   if (team->t.t_serialized) {
3883     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3884     return; // no dependencies if team is serialized
3885   }
3886   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3887   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3888   // the next loop
3889   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3890
3891   // Save bounds info into allocated private buffer
3892   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3893   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3894       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3895   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3896   pr_buf->th_doacross_info[0] =
3897       (kmp_int64)num_dims; // first element is number of dimensions
3898   // Save also address of num_done in order to access it later without knowing
3899   // the buffer index
3900   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3901   pr_buf->th_doacross_info[2] = dims[0].lo;
3902   pr_buf->th_doacross_info[3] = dims[0].up;
3903   pr_buf->th_doacross_info[4] = dims[0].st;
3904   last = 5;
3905   for (j = 1; j < num_dims; ++j) {
3906     kmp_int64
3907         range_length; // To keep ranges of all dimensions but the first dims[0]
3908     if (dims[j].st == 1) { // most common case
3909       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3910       range_length = dims[j].up - dims[j].lo + 1;
3911     } else {
3912       if (dims[j].st > 0) {
3913         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3914         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3915       } else { // negative increment
3916         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3917         range_length =
3918             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3919       }
3920     }
3921     pr_buf->th_doacross_info[last++] = range_length;
3922     pr_buf->th_doacross_info[last++] = dims[j].lo;
3923     pr_buf->th_doacross_info[last++] = dims[j].up;
3924     pr_buf->th_doacross_info[last++] = dims[j].st;
3925   }
3926
3927   // Compute total trip count.
3928   // Start with range of dims[0] which we don't need to keep in the buffer.
3929   if (dims[0].st == 1) { // most common case
3930     trace_count = dims[0].up - dims[0].lo + 1;
3931   } else if (dims[0].st > 0) {
3932     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3933     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3934   } else { // negative increment
3935     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3936     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3937   }
3938   for (j = 1; j < num_dims; ++j) {
3939     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3940   }
3941   KMP_DEBUG_ASSERT(trace_count > 0);
3942
3943   // Check if shared buffer is not occupied by other loop (idx -
3944   // __kmp_dispatch_num_buffers)
3945   if (idx != sh_buf->doacross_buf_idx) {
3946     // Shared buffer is occupied, wait for it to be free
3947     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3948                  __kmp_eq_4, NULL);
3949   }
3950 #if KMP_32_BIT_ARCH
3951   // Check if we are the first thread. After the CAS the first thread gets 0,
3952   // others get 1 if initialization is in progress, allocated pointer otherwise.
3953   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3954   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3955       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3956 #else
3957   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3958       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3959 #endif
3960   if (flags == NULL) {
3961     // we are the first thread, allocate the array of flags
3962     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3963     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3964     KMP_MB();
3965     sh_buf->doacross_flags = flags;
3966   } else if (flags == (kmp_uint32 *)1) {
3967 #if KMP_32_BIT_ARCH
3968     // initialization is still in progress, need to wait
3969     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3970 #else
3971     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3972 #endif
3973       KMP_YIELD(TRUE);
3974     KMP_MB();
3975   } else {
3976     KMP_MB();
3977   }
3978   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3979   pr_buf->th_doacross_flags =
3980       sh_buf->doacross_flags; // save private copy in order to not
3981   // touch shared buffer on each iteration
3982   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3983 }
3984
3985 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3986   kmp_int32 shft, num_dims, i;
3987   kmp_uint32 flag;
3988   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3989   kmp_info_t *th = __kmp_threads[gtid];
3990   kmp_team_t *team = th->th.th_team;
3991   kmp_disp_t *pr_buf;
3992   kmp_int64 lo, up, st;
3993
3994   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3995   if (team->t.t_serialized) {
3996     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3997     return; // no dependencies if team is serialized
3998   }
3999
4000   // calculate sequential iteration number and check out-of-bounds condition
4001   pr_buf = th->th.th_dispatch;
4002   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4003   num_dims = pr_buf->th_doacross_info[0];
4004   lo = pr_buf->th_doacross_info[2];
4005   up = pr_buf->th_doacross_info[3];
4006   st = pr_buf->th_doacross_info[4];
4007   if (st == 1) { // most common case
4008     if (vec[0] < lo || vec[0] > up) {
4009       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4010                     "bounds [%lld,%lld]\n",
4011                     gtid, vec[0], lo, up));
4012       return;
4013     }
4014     iter_number = vec[0] - lo;
4015   } else if (st > 0) {
4016     if (vec[0] < lo || vec[0] > up) {
4017       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4018                     "bounds [%lld,%lld]\n",
4019                     gtid, vec[0], lo, up));
4020       return;
4021     }
4022     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4023   } else { // negative increment
4024     if (vec[0] > lo || vec[0] < up) {
4025       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4026                     "bounds [%lld,%lld]\n",
4027                     gtid, vec[0], lo, up));
4028       return;
4029     }
4030     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4031   }
4032   for (i = 1; i < num_dims; ++i) {
4033     kmp_int64 iter, ln;
4034     kmp_int32 j = i * 4;
4035     ln = pr_buf->th_doacross_info[j + 1];
4036     lo = pr_buf->th_doacross_info[j + 2];
4037     up = pr_buf->th_doacross_info[j + 3];
4038     st = pr_buf->th_doacross_info[j + 4];
4039     if (st == 1) {
4040       if (vec[i] < lo || vec[i] > up) {
4041         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4042                       "bounds [%lld,%lld]\n",
4043                       gtid, vec[i], lo, up));
4044         return;
4045       }
4046       iter = vec[i] - lo;
4047     } else if (st > 0) {
4048       if (vec[i] < lo || vec[i] > up) {
4049         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4050                       "bounds [%lld,%lld]\n",
4051                       gtid, vec[i], lo, up));
4052         return;
4053       }
4054       iter = (kmp_uint64)(vec[i] - lo) / st;
4055     } else { // st < 0
4056       if (vec[i] > lo || vec[i] < up) {
4057         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4058                       "bounds [%lld,%lld]\n",
4059                       gtid, vec[i], lo, up));
4060         return;
4061       }
4062       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4063     }
4064     iter_number = iter + ln * iter_number;
4065   }
4066   shft = iter_number % 32; // use 32-bit granularity
4067   iter_number >>= 5; // divided by 32
4068   flag = 1 << shft;
4069   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4070     KMP_YIELD(TRUE);
4071   }
4072   KMP_MB();
4073   KA_TRACE(20,
4074            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4075             gtid, (iter_number << 5) + shft));
4076 }
4077
4078 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4079   kmp_int32 shft, num_dims, i;
4080   kmp_uint32 flag;
4081   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4082   kmp_info_t *th = __kmp_threads[gtid];
4083   kmp_team_t *team = th->th.th_team;
4084   kmp_disp_t *pr_buf;
4085   kmp_int64 lo, st;
4086
4087   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4088   if (team->t.t_serialized) {
4089     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4090     return; // no dependencies if team is serialized
4091   }
4092
4093   // calculate sequential iteration number (same as in "wait" but no
4094   // out-of-bounds checks)
4095   pr_buf = th->th.th_dispatch;
4096   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4097   num_dims = pr_buf->th_doacross_info[0];
4098   lo = pr_buf->th_doacross_info[2];
4099   st = pr_buf->th_doacross_info[4];
4100   if (st == 1) { // most common case
4101     iter_number = vec[0] - lo;
4102   } else if (st > 0) {
4103     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4104   } else { // negative increment
4105     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4106   }
4107   for (i = 1; i < num_dims; ++i) {
4108     kmp_int64 iter, ln;
4109     kmp_int32 j = i * 4;
4110     ln = pr_buf->th_doacross_info[j + 1];
4111     lo = pr_buf->th_doacross_info[j + 2];
4112     st = pr_buf->th_doacross_info[j + 4];
4113     if (st == 1) {
4114       iter = vec[i] - lo;
4115     } else if (st > 0) {
4116       iter = (kmp_uint64)(vec[i] - lo) / st;
4117     } else { // st < 0
4118       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4119     }
4120     iter_number = iter + ln * iter_number;
4121   }
4122   shft = iter_number % 32; // use 32-bit granularity
4123   iter_number >>= 5; // divided by 32
4124   flag = 1 << shft;
4125   KMP_MB();
4126   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4127     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4128   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4129                 (iter_number << 5) + shft));
4130 }
4131
4132 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4133   kmp_int32 num_done;
4134   kmp_info_t *th = __kmp_threads[gtid];
4135   kmp_team_t *team = th->th.th_team;
4136   kmp_disp_t *pr_buf = th->th.th_dispatch;
4137
4138   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4139   if (team->t.t_serialized) {
4140     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4141     return; // nothing to do
4142   }
4143   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4144   if (num_done == th->th.th_team_nproc) {
4145     // we are the last thread, need to free shared resources
4146     int idx = pr_buf->th_doacross_buf_idx - 1;
4147     dispatch_shared_info_t *sh_buf =
4148         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4149     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4150                      (kmp_int64)&sh_buf->doacross_num_done);
4151     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4152     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4153     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4154     sh_buf->doacross_flags = NULL;
4155     sh_buf->doacross_num_done = 0;
4156     sh_buf->doacross_buf_idx +=
4157         __kmp_dispatch_num_buffers; // free buffer for future re-use
4158   }
4159   // free private resources (need to keep buffer index forever)
4160   pr_buf->th_doacross_flags = NULL;
4161   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4162   pr_buf->th_doacross_info = NULL;
4163   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4164 }
4165
4166 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4167 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4168   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4169 }
4170
4171 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4172   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4173 }
4174
4175 int __kmpc_get_target_offload(void) {
4176   if (!__kmp_init_serial) {
4177     __kmp_serial_initialize();
4178   }
4179   return __kmp_target_offload;
4180 }
4181
4182 int __kmpc_pause_resource(kmp_pause_status_t level) {
4183   if (!__kmp_init_serial) {
4184     return 1; // Can't pause if runtime is not initialized
4185   }
4186   return __kmp_pause_resource(level);
4187 }