contrib/openmp/runtime/src/kmp_csupport.cpp

   1 /*
   2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
   3  */
   4
   5 //===----------------------------------------------------------------------===//
   6 //
   7 //                     The LLVM Compiler Infrastructure
   8 //
   9 // This file is dual licensed under the MIT and the University of Illinois Open
  10 // Source Licenses. See LICENSE.txt for details.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #define __KMP_IMP
  15 #include "omp.h" /* extern "C" declarations of user-visible routines */
  16 #include "kmp.h"
  17 #include "kmp_error.h"
  18 #include "kmp_i18n.h"
  19 #include "kmp_itt.h"
  20 #include "kmp_lock.h"
  21 #include "kmp_stats.h"
  22
  23 #if OMPT_SUPPORT
  24 #include "ompt-specific.h"
  25 #endif
  26
  27 #define MAX_MESSAGE 512
  28
  29 // flags will be used in future, e.g. to implement openmp_strict library
  30 // restrictions
  31
  32 /*!
  33  * @ingroup STARTUP_SHUTDOWN
  34  * @param loc   in   source location information
  35  * @param flags in   for future use (currently ignored)
  36  *
  37  * Initialize the runtime library. This call is optional; if it is not made then
  38  * it will be implicitly called by attempts to use other library functions.
  39  */
  40 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  41   // By default __kmpc_begin() is no-op.
  42   char *env;
  43   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
  44       __kmp_str_match_true(env)) {
  45     __kmp_middle_initialize();
  46     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  47   } else if (__kmp_ignore_mppbeg() == FALSE) {
  48     // By default __kmp_ignore_mppbeg() returns TRUE.
  49     __kmp_internal_begin();
  50     KC_TRACE(10, ("__kmpc_begin: called\n"));
  51   }
  52 }
  53
  54 /*!
  55  * @ingroup STARTUP_SHUTDOWN
  56  * @param loc source location information
  57  *
  58  * Shutdown the runtime library. This is also optional, and even if called will
  59  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
  60  * zero.
  61  */
  62 void __kmpc_end(ident_t *loc) {
  63   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  64   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  65   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  66   // returns FALSE and __kmpc_end() will unregister this root (it can cause
  67   // library shut down).
  68   if (__kmp_ignore_mppend() == FALSE) {
  69     KC_TRACE(10, ("__kmpc_end: called\n"));
  70     KA_TRACE(30, ("__kmpc_end\n"));
  71
  72     __kmp_internal_end_thread(-1);
  73   }
  74 #if KMP_OS_WINDOWS && OMPT_SUPPORT
  75   // Normal exit process on Windows does not allow worker threads of the final
  76   // parallel region to finish reporting their events, so shutting down the
  77   // library here fixes the issue at least for the cases where __kmpc_end() is
  78   // placed properly.
  79   if (ompt_enabled.enabled)
  80     __kmp_internal_end_library(__kmp_gtid_get_specific());
  81 #endif
  82 }
  83
  84 /*!
  85 @ingroup THREAD_STATES
  86 @param loc Source location information.
  87 @return The global thread index of the active thread.
  88
  89 This function can be called in any context.
  90
  91 If the runtime has ony been entered at the outermost level from a
  92 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
  93 that which would be returned by omp_get_thread_num() in the outermost
  94 active parallel construct. (Or zero if there is no active parallel
  95 construct, since the master thread is necessarily thread zero).
  96
  97 If multiple non-OpenMP threads all enter an OpenMP construct then this
  98 will be a unique thread identifier among all the threads created by
  99 the OpenMP runtime (but the value cannote be defined in terms of
 100 OpenMP thread ids returned by omp_get_thread_num()).
 101 */
 102 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
 103   kmp_int32 gtid = __kmp_entry_gtid();
 104
 105   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
 106
 107   return gtid;
 108 }
 109
 110 /*!
 111 @ingroup THREAD_STATES
 112 @param loc Source location information.
 113 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
 114
 115 This function can be called in any context.
 116 It returns the total number of threads under the control of the OpenMP runtime.
 117 That is not a number that can be determined by any OpenMP standard calls, since
 118 the library may be called from more than one non-OpenMP thread, and this
 119 reflects the total over all such calls. Similarly the runtime maintains
 120 underlying threads even when they are not active (since the cost of creating
 121 and destroying OS threads is high), this call counts all such threads even if
 122 they are not waiting for work.
 123 */
 124 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
 125   KC_TRACE(10,
 126            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
 127
 128   return TCR_4(__kmp_all_nth);
 129 }
 130
 131 /*!
 132 @ingroup THREAD_STATES
 133 @param loc Source location information.
 134 @return The thread number of the calling thread in the innermost active parallel
 135 construct.
 136 */
 137 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
 138   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
 139   return __kmp_tid_from_gtid(__kmp_entry_gtid());
 140 }
 141
 142 /*!
 143 @ingroup THREAD_STATES
 144 @param loc Source location information.
 145 @return The number of threads in the innermost active parallel construct.
 146 */
 147 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
 148   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
 149
 150   return __kmp_entry_thread()->th.th_team->t.t_nproc;
 151 }
 152
 153 /*!
 154  * @ingroup DEPRECATED
 155  * @param loc location description
 156  *
 157  * This function need not be called. It always returns TRUE.
 158  */
 159 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
 160 #ifndef KMP_DEBUG
 161
 162   return TRUE;
 163
 164 #else
 165
 166   const char *semi2;
 167   const char *semi3;
 168   int line_no;
 169
 170   if (__kmp_par_range == 0) {
 171     return TRUE;
 172   }
 173   semi2 = loc->psource;
 174   if (semi2 == NULL) {
 175     return TRUE;
 176   }
 177   semi2 = strchr(semi2, ';');
 178   if (semi2 == NULL) {
 179     return TRUE;
 180   }
 181   semi2 = strchr(semi2 + 1, ';');
 182   if (semi2 == NULL) {
 183     return TRUE;
 184   }
 185   if (__kmp_par_range_filename[0]) {
 186     const char *name = semi2 - 1;
 187     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
 188       name--;
 189     }
 190     if ((*name == '/') || (*name == ';')) {
 191       name++;
 192     }
 193     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
 194       return __kmp_par_range < 0;
 195     }
 196   }
 197   semi3 = strchr(semi2 + 1, ';');
 198   if (__kmp_par_range_routine[0]) {
 199     if ((semi3 != NULL) && (semi3 > semi2) &&
 200         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
 201       return __kmp_par_range < 0;
 202     }
 203   }
 204   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
 205     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
 206       return __kmp_par_range > 0;
 207     }
 208     return __kmp_par_range < 0;
 209   }
 210   return TRUE;
 211
 212 #endif /* KMP_DEBUG */
 213 }
 214
 215 /*!
 216 @ingroup THREAD_STATES
 217 @param loc Source location information.
 218 @return 1 if this thread is executing inside an active parallel region, zero if
 219 not.
 220 */
 221 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
 222   return __kmp_entry_thread()->th.th_root->r.r_active;
 223 }
 224
 225 /*!
 226 @ingroup PARALLEL
 227 @param loc source location information
 228 @param global_tid global thread number
 229 @param num_threads number of threads requested for this parallel construct
 230
 231 Set the number of threads to be used by the next fork spawned by this thread.
 232 This call is only required if the parallel construct has a `num_threads` clause.
 233 */
 234 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
 235                              kmp_int32 num_threads) {
 236   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
 237                 global_tid, num_threads));
 238
 239   __kmp_push_num_threads(loc, global_tid, num_threads);
 240 }
 241
 242 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
 243   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
 244
 245   /* the num_threads are automatically popped */
 246 }
 247
 248 #if OMP_40_ENABLED
 249
 250 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
 251                            kmp_int32 proc_bind) {
 252   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
 253                 proc_bind));
 254
 255   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
 256 }
 257
 258 #endif /* OMP_40_ENABLED */
 259
/*!
@ingroup PARALLEL
@param loc  source location information
@param argc  total number of arguments in the ellipsis
@param microtask  pointer to callback routine consisting of outlined parallel
construct
@param ...  pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.

The routine forks via __kmp_fork_call() and then immediately joins via
__kmp_join_call(); the variadic arguments are handed to the fork as a va_list.
When statistics or OMPT support are compiled in, timers/counters and the
caller's frame information are updated around the fork.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  // Count nested and outermost parallel regions separately.
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    // Record the enter frame on the task that encounters the parallel
    // construct: the serialized (lightweight) task team entry if one exists,
    // otherwise this thread's implicit task in the parent team.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    // The va_list is passed by address on Linux x86_64/ARM/AArch64 and by
    // value everywhere else (see the TODO below).
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    // The join takes an extra fork-context argument only in OMPT builds.
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  // Undo the timer change made on entry: back to the serial timer if we came
  // from a serial region, otherwise pop the pushed overhead timer.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
 347
 348 #if OMP_40_ENABLED
 349 /*!
 350 @ingroup PARALLEL
 351 @param loc source location information
 352 @param global_tid global thread number
 353 @param num_teams number of teams requested for the teams construct
 354 @param num_threads number of threads per team requested for the teams construct
 355
 356 Set the number of teams to be used by the teams construct.
 357 This call is only required if the teams construct has a `num_teams` clause
 358 or a `thread_limit` clause (or both).
 359 */
 360 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
 361                            kmp_int32 num_teams, kmp_int32 num_threads) {
 362   KA_TRACE(20,
 363            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
 364             global_tid, num_teams, num_threads));
 365
 366   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
 367 }
 368
/*!
@ingroup PARALLEL
@param loc  source location information
@param argc  total number of arguments in the ellipsis
@param microtask  pointer to callback routine consisting of outlined teams
construct
@param ...  pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.

The user microtask is stored on the calling thread and the fork is performed
with __kmp_teams_master as the wrapped task and __kmp_invoke_teams_master as
the launcher. After the join, the teams-related state stored on the thread is
cleared again.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  // Record the enter frame on this thread's implicit task in the parent team.
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  // The va_list is passed by address on Linux x86_64/ARM/AArch64 and by value
  // everywhere else.
  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  // The join takes an extra fork-context argument only in OMPT builds.
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Reset the per-thread teams state set up above.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  // NOTE(review): th_teams_size is zeroed through a kmp_int64 cast, which
  // presumes the structure is exactly 64 bits wide (nteams + nth) -- confirm
  // against its declaration; assigning the fields individually would avoid
  // the type pun.
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
 434 #endif /* OMP_40_ENABLED */
 435
 436 // I don't think this function should ever have been exported.
 437 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
 438 // openmp code ever called it, but it's been exported from the RTL for so
 439 // long that I'm afraid to remove the definition.
 440 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
 441
/*!
@ingroup PARALLEL
@param loc  source location information
@param global_tid  global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.

This entry point is a thin wrapper around __kmp_serialized_parallel().
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// The implementation is now in kmp_runtime.cpp so that it can share static
// functions with kmp_fork_call since the tasks to be done are similar in
// each case.
#if OMPT_SUPPORT
  // Must stay in this function so that the stored address refers to the
  // caller of this entry point (presumably; the macro is defined elsewhere).
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
 463
/*!
@ingroup PARALLEL
@param loc  source location information
@param global_tid  global thread number

Leave a serialized parallel construct.

Each call undoes one level of serialization on the calling thread's serial
team: it pops the saved internal control values (if any were pushed for this
level), pops one dispatch buffer, and decrements the team's nesting counters.
When the last serialized level is left, the thread is re-attached to the
parent team and its cached team values are restored.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  // Sanity checks: the thread must currently be running on its own serial
  // team, that team must be serialized, and it must not be the root team.
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  // Report the end of the implicit task and of the parallel region to the
  // tool, unless the thread is already in the overhead state.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // The parent task data is fetched before the lightweight task team is
    // unlinked below.
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  // A stack entry belongs to this level only if its recorded nesting level
  // equals the current serialization depth.
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    // Unlink the head buffer and free it.
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
#if OMP_50_ENABLED
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
#endif

  // One serialized level has been left; zero means the thread goes back to
  // its parent team.
  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

/* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    // Reload the floating-point control state saved in the team, if FP
    // control inheritance is enabled and a state was saved.
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    // Re-attach the thread to the parent team under its former team-local id.
    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    // The task that is current after the pop must not be marked as executing
    // yet; mark it as executing now.
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    // Still nested inside another serialized region: only trace the new
    // depth.
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  // Leave the OMPT state matching the team the thread is in now.
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}
 613
/*!
@ingroup SYNCHRONIZATION
@param loc  source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).

KMP_MB() is always executed first; any additional work depends on the target
architecture selected at compile time. Building for an architecture that is
not listed below is a compile-time error.
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates.  */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed as well (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  // The CPU information is queried lazily on first use.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
    // Issue the fence through whichever primitive the compiler provides.
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Notify the tool about the flush if it registered a flush callback.
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
 683
 684 /* -------------------------------------------------------------------------- */
 685 /*!
 686 @ingroup SYNCHRONIZATION
 687 @param loc source location information
 688 @param global_tid thread id.
 689
 690 Execute a barrier.
 691 */
 692 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
 693   KMP_COUNT_BLOCK(OMP_BARRIER);
 694   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
 695
 696   if (!TCR_4(__kmp_init_parallel))
 697     __kmp_parallel_initialize();
 698
 699   if (__kmp_env_consistency_check) {
 700     if (loc == 0) {
 701       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
 702     }
 703
 704     __kmp_check_barrier(global_tid, ct_barrier, loc);
 705   }
 706
 707 #if OMPT_SUPPORT
 708   ompt_frame_t *ompt_frame;
 709   if (ompt_enabled.enabled) {
 710     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
 711     if (ompt_frame->enter_frame.ptr == NULL)
 712       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
 713     OMPT_STORE_RETURN_ADDRESS(global_tid);
 714   }
 715 #endif
 716   __kmp_threads[global_tid]->th.th_ident = loc;
 717   // TODO: explicit barrier_wait_id:
 718   //   this function is called when 'barrier' directive is present or
 719   //   implicit barrier at the end of a worksharing construct.
 720   // 1) better to add a per-thread barrier counter to a thread data structure
 721   // 2) set to 0 when a new team is created
 722   // 4) no sync is required
 723
 724   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
 725 #if OMPT_SUPPORT && OMPT_OPTIONAL
 726   if (ompt_enabled.enabled) {
 727     ompt_frame->enter_frame = ompt_data_none;
 728   }
 729 #endif
 730 }
 731
 732 /* The BARRIER for a MASTER section is always explicit   */
 733 /*!
 734 @ingroup WORK_SHARING
 735 @param loc  source location information.
 736 @param global_tid  global thread number .
 737 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
 738 */
 739 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
 740   int status = 0;
 741
 742   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
 743
 744   if (!TCR_4(__kmp_init_parallel))
 745     __kmp_parallel_initialize();
 746
 747   if (KMP_MASTER_GTID(global_tid)) {
 748     KMP_COUNT_BLOCK(OMP_MASTER);
 749     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
 750     status = 1;
 751   }
 752
 753 #if OMPT_SUPPORT && OMPT_OPTIONAL
 754   if (status) {
 755     if (ompt_enabled.ompt_callback_master) {
 756       kmp_info_t *this_thr = __kmp_threads[global_tid];
 757       kmp_team_t *team = this_thr->th.th_team;
 758
 759       int tid = __kmp_tid_from_gtid(global_tid);
 760       ompt_callbacks.ompt_callback(ompt_callback_master)(
 761           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
 762           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
 763           OMPT_GET_RETURN_ADDRESS(0));
 764     }
 765   }
 766 #endif
 767
 768   if (__kmp_env_consistency_check) {
 769 #if KMP_USE_DYNAMIC_LOCK
 770     if (status)
 771       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
 772     else
 773       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
 774 #else
 775     if (status)
 776       __kmp_push_sync(global_tid, ct_master, loc, NULL);
 777     else
 778       __kmp_check_sync(global_tid, ct_master, loc, NULL);
 779 #endif
 780   }
 781
 782   return status;
 783 }
 784
 785 /*!
 786 @ingroup WORK_SHARING
 787 @param loc  source location information.
 788 @param global_tid  global thread number .
 789
 790 Mark the end of a <tt>master</tt> region. This should only be called by the
 791 thread that executes the <tt>master</tt> region.
 792 */
 793 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
 794   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
 795
 796   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
 797   KMP_POP_PARTITIONED_TIMER();
 798
 799 #if OMPT_SUPPORT && OMPT_OPTIONAL
 800   kmp_info_t *this_thr = __kmp_threads[global_tid];
 801   kmp_team_t *team = this_thr->th.th_team;
 802   if (ompt_enabled.ompt_callback_master) {
 803     int tid = __kmp_tid_from_gtid(global_tid);
 804     ompt_callbacks.ompt_callback(ompt_callback_master)(
 805         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
 806         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
 807         OMPT_GET_RETURN_ADDRESS(0));
 808   }
 809 #endif
 810
 811   if (__kmp_env_consistency_check) {
 812     if (global_tid < 0)
 813       KMP_WARNING(ThreadIdentInvalid);
 814
 815     if (KMP_MASTER_GTID(global_tid))
 816       __kmp_pop_sync(global_tid, ct_master, loc);
 817   }
 818 }
 819
 820 /*!
 821 @ingroup WORK_SHARING
 822 @param loc  source location information.
 823 @param gtid  global thread number.
 824
 825 Start execution of an <tt>ordered</tt> construct.
     The region is entered through the <tt>th_deo_fcn</tt> hook of the calling
     thread's dispatch structure when one is set, and through
     __kmp_parallel_deo() otherwise. ITT and, when enabled, OMPT are notified
     before and after that call.
 826 */
 827 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
 828   int cid = 0;
 829   kmp_info_t *th;
 830   KMP_DEBUG_ASSERT(__kmp_init_serial);
 831
 832   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
 833
       // Bring up the parallel part of the runtime if that has not happened yet.
 834   if (!TCR_4(__kmp_init_parallel))
 835     __kmp_parallel_initialize();
 836
 837 #if USE_ITT_BUILD
 838   __kmp_itt_ordered_prep(gtid);
 839 // TODO: ordered_wait_id
 840 #endif /* USE_ITT_BUILD */
 841
       // Descriptor of the calling thread; used for OMPT state and dispatch hooks.
 842   th = __kmp_threads[gtid];
 843
 844 #if OMPT_SUPPORT && OMPT_OPTIONAL
 845   kmp_team_t *team;
 846   ompt_wait_id_t lck;
 847   void *codeptr_ra;
 848   if (ompt_enabled.enabled) {
 849     OMPT_STORE_RETURN_ADDRESS(gtid);
 850     team = __kmp_team_from_gtid(gtid);
         // The address of the team's ordered value doubles as the OMPT wait id.
 851     lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
 852     /* OMPT state update */
 853     th->th.ompt_thread_info.wait_id = lck;
 854     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
 855
 856     /* OMPT event callback */
 857     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
 858     if (ompt_enabled.ompt_callback_mutex_acquire) {
 859       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
 860           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
 861           (ompt_wait_id_t)lck, codeptr_ra);
 862     }
 863   }
 864 #endif
 865
       // Prefer the dispatch-specific enter hook; fall back to the generic one.
 866   if (th->th.th_dispatch->th_deo_fcn != 0)
 867     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
 868   else
 869     __kmp_parallel_deo(&gtid, &cid, loc);
 870
 871 #if OMPT_SUPPORT && OMPT_OPTIONAL
       // lck and codeptr_ra are only assigned inside the enabled branch above.
 872   if (ompt_enabled.enabled) {
 873     /* OMPT state update */
 874     th->th.ompt_thread_info.state = ompt_state_work_parallel;
 875     th->th.ompt_thread_info.wait_id = 0;
 876
 877     /* OMPT event callback */
 878     if (ompt_enabled.ompt_callback_mutex_acquired) {
 879       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
 880           ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
 881     }
 882   }
 883 #endif
 884
 885 #if USE_ITT_BUILD
 886   __kmp_itt_ordered_start(gtid);
 887 #endif /* USE_ITT_BUILD */
 888 }
 889
 890 /*!
 891 @ingroup WORK_SHARING
 892 @param loc  source location information.
 893 @param gtid  global thread number.
 894
 895 End execution of an <tt>ordered</tt> construct.
     The region is left through the <tt>th_dxo_fcn</tt> hook of the calling
     thread's dispatch structure when one is set, and through
     __kmp_parallel_dxo() otherwise. The OMPT mutex_released callback, if
     registered, is invoked after that call.
 896 */
 897 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
 898   int cid = 0;
 899   kmp_info_t *th;
 900
 901   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
 902
 903 #if USE_ITT_BUILD
 904   __kmp_itt_ordered_end(gtid);
 905 // TODO: ordered_wait_id
 906 #endif /* USE_ITT_BUILD */
 907
 908   th = __kmp_threads[gtid];
 909
       // Prefer the dispatch-specific exit hook; fall back to the generic one.
 910   if (th->th.th_dispatch->th_dxo_fcn != 0)
 911     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
 912   else
 913     __kmp_parallel_dxo(&gtid, &cid, loc);
 914
 915 #if OMPT_SUPPORT && OMPT_OPTIONAL
 916   OMPT_STORE_RETURN_ADDRESS(gtid);
 917   if (ompt_enabled.ompt_callback_mutex_released) {
         // Same wait id as used on entry: the address of the team's ordered value.
 918     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
 919         ompt_mutex_ordered,
 920         (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
 921         OMPT_LOAD_RETURN_ADDRESS(gtid));
 922   }
 923 #endif
 924 }
 925
 926 #if KMP_USE_DYNAMIC_LOCK
 927
     // Allocates and initializes an indirect lock for a critical section and
     // publishes its pointer into *crit with a compare-and-store against null.
     // crit: storage of the critical section, reinterpreted as a lock pointer.
     // loc:  source location recorded on the lock.
     // gtid: global thread number passed to the allocator.
     // tag:  indirect lock tag selecting the lock kind to allocate.
 928 static __forceinline void
 929 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
 930                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
 931   // Pointer to the allocated indirect lock is written to crit, while indexing
 932   // is ignored.
 933   void *idx;
 934   kmp_indirect_lock_t **lck;
 935   lck = (kmp_indirect_lock_t **)crit;
 936   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
 937   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
 938   KMP_SET_I_LOCK_LOCATION(ilk, loc);
 939   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
 940   KA_TRACE(20,
 941            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
 942 #if USE_ITT_BUILD
 943   __kmp_itt_critical_creating(ilk->lock, loc);
 944 #endif
       // Publish the fully initialized lock only if the slot is still empty.
 945   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
 946   if (status == 0) {
         // The store did not take place, so the lock just created stays unused.
 947 #if USE_ITT_BUILD
 948     __kmp_itt_critical_destroyed(ilk->lock);
 949 #endif
 950     // We don't really need to destroy the unclaimed lock here since it will be
 951     // cleaned up at program exit.
 952     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
 953   }
 954   KMP_DEBUG_ASSERT(*lck != NULL);
 955 }
 956
 957 // Fast-path acquire tas lock
     // Attempts a relaxed load of lk.poll followed by an acquire compare-store
     // from the free value to the busy value built from gtid + 1. If that first
     // attempt fails it yields once, then retries in a loop that applies
     // __kmp_spin_backoff() and yields between attempts. The yield is
     // unconditional when __kmp_nth exceeds the available processor count and
     // goes through KMP_YIELD_SPIN otherwise.
 958 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
 959   {                                                                            \
 960     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
 961     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
 962     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
 963     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
 964         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
 965       kmp_uint32 spins;                                                        \
 966       KMP_FSYNC_PREPARE(l);                                                    \
 967       KMP_INIT_YIELD(spins);                                                   \
 968       if (TCR_4(__kmp_nth) >                                                   \
 969           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
 970         KMP_YIELD(TRUE);                                                       \
 971       } else {                                                                 \
 972         KMP_YIELD_SPIN(spins);                                                 \
 973       }                                                                        \
 974       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
 975       while (                                                                  \
 976           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
 977           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
 978         __kmp_spin_backoff(&backoff);                                          \
 979         if (TCR_4(__kmp_nth) >                                                 \
 980             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
 981           KMP_YIELD(TRUE);                                                     \
 982         } else {                                                               \
 983           KMP_YIELD_SPIN(spins);                                               \
 984         }                                                                      \
 985       }                                                                        \
 986     }                                                                          \
 987     KMP_FSYNC_ACQUIRED(l);                                                     \
 988   }
 989
 990 // Fast-path test tas lock
     // Single, non-blocking attempt: rc becomes non-zero only if lk.poll was
     // observed free and the acquire compare-store to the busy value (built
     // from gtid + 1) succeeded.
 991 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
 992   {                                                                            \
 993     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
 994     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
 995     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
 996     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
 997          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
 998   }
 999
1000 // Fast-path release tas lock
     // Stores the free value into lk.poll with a release store. The gtid
     // argument is accepted but not used by the expansion.
1001 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1002   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1003
1004 #if KMP_USE_FUTEX
1005
1006 #include <sys/syscall.h>
1007 #include <unistd.h>
     // Fallback values for the futex operation codes passed to
     // syscall(__NR_futex, ...) by the macros below; only used when the headers
     // included above did not already define them.
1008 #ifndef FUTEX_WAIT
1009 #define FUTEX_WAIT 0
1010 #endif
1011 #ifndef FUTEX_WAKE
1012 #define FUTEX_WAKE 1
1013 #endif
1014
1015 // Fast-path acquire futex lock
     // Loops on a compare-and-store of lk.poll from the free value to the busy
     // value built from gtid_code, initially (gtid + 1) << 1. When that fails
     // and the low bit of the stripped poll value is clear, it first tries to
     // set that bit with another compare-and-store, restarting the loop if that
     // returns zero. It then issues FUTEX_WAIT on the observed value. After a
     // wait that returned zero, the low bit is also set in gtid_code for the
     // next attempt.
1016 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1017   {                                                                            \
1018     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1019     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1020     KMP_MB();                                                                  \
1021     KMP_FSYNC_PREPARE(ftx);                                                    \
1022     kmp_int32 poll_val;                                                        \
1023     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1024                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1025                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1026       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1027       if (!cond) {                                                             \
1028         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1029                                          poll_val |                            \
1030                                              KMP_LOCK_BUSY(1, futex))) {       \
1031           continue;                                                            \
1032         }                                                                      \
1033         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1034       }                                                                        \
1035       kmp_int32 rc;                                                            \
1036       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1037                         NULL, NULL, 0)) != 0) {                                \
1038         continue;                                                              \
1039       }                                                                        \
1040       gtid_code |= 1;                                                          \
1041     }                                                                          \
1042     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1043   }
1044
1045 // Fast-path test futex lock
1046 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1047   {                                                                            \
1048     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1049     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1050                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1051       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1052       rc = TRUE;                                                               \
1053     } else {                                                                   \
1054       rc = FALSE;                                                              \
1055     }                                                                          \
1056   }
1057
1058 // Fast-path release futex lock
     // Exchanges lk.poll with the free value. If the low bit of the stripped
     // previous value was set, it issues FUTEX_WAKE on lk.poll, passing
     // KMP_LOCK_BUSY(1, futex) as the value argument. Finally it yields when
     // __kmp_nth exceeds the available processor count. The gtid argument is
     // accepted but not used by the expansion.
1059 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1060   {                                                                            \
1061     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1062     KMP_MB();                                                                  \
1063     KMP_FSYNC_RELEASING(ftx);                                                  \
1064     kmp_int32 poll_val =                                                       \
1065         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1066     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1067       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1068               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1069     }                                                                          \
1070     KMP_MB();                                                                  \
1071     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1072               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1073   }
1074
1075 #endif // KMP_USE_FUTEX
1076
1077 #else // KMP_USE_DYNAMIC_LOCK
1078
     // Returns the user lock whose pointer is stored in *crit, allocating,
     // initializing and publishing one on first use.
     // crit: storage of the critical section, reinterpreted as a lock pointer.
     // loc:  source location recorded on a newly created lock.
     // gtid: global thread number passed to the lock allocator.
1079 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1080                                                       ident_t const *loc,
1081                                                       kmp_int32 gtid) {
1082   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1083
1084   // Because of the double-check, the following load doesn't need to be volatile
1085   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1086
1087   if (lck == NULL) {
1088     void *idx;
1089
1090     // Allocate & initialize the lock.
1091     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1092     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1093     __kmp_init_user_lock_with_checks(lck);
1094     __kmp_set_user_lock_location(lck, loc);
1095 #if USE_ITT_BUILD
1096     __kmp_itt_critical_creating(lck);
1097 // __kmp_itt_critical_creating() should be called *before* the first usage
1098 // of underlying lock. It is the only place where we can guarantee it. There
1099 // are chances the lock will be destroyed with no usage, but it is not a
1100 // problem, because this is not real event seen by user but rather setting
1101 // name for object (lock). See more details in kmp_itt.h.
1102 #endif /* USE_ITT_BUILD */
1103
1104     // Use a cmpxchg instruction to slam the start of the critical section with
1105     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1106     // and use the lock that the other thread allocated.
1107     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1108
1109     if (status == 0) {
1110 // Deallocate the lock and reload the value.
1111 #if USE_ITT_BUILD
1112       __kmp_itt_critical_destroyed(lck);
1113 // Let ITT know the lock is destroyed and the same memory location may be reused
1114 // for another purpose.
1115 #endif /* USE_ITT_BUILD */
1116       __kmp_destroy_user_lock_with_checks(lck);
1117       __kmp_user_lock_free(&idx, gtid, lck);
1118       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1119       KMP_DEBUG_ASSERT(lck != NULL);
1120     }
1121   }
1122   return lck;
1123 }
1124
1125 #endif // KMP_USE_DYNAMIC_LOCK
1126
1127 /*!
1128 @ingroup WORK_SHARING
1129 @param loc  source location information.
1130 @param global_tid  global thread number .
1131 @param crit identity of the critical section. This could be a pointer to a lock
1132 associated with the critical section, or some other suitably unique value.
1133
1134 Enter code protected by a `critical` construct.
1135 This function blocks until the executing thread can enter the critical section.
1136 */
1137 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1138                      kmp_critical_name *crit) {
1139 #if KMP_USE_DYNAMIC_LOCK
1140 #if OMPT_SUPPORT && OMPT_OPTIONAL
1141   OMPT_STORE_RETURN_ADDRESS(global_tid);
1142 #endif // OMPT_SUPPORT
1143   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1144 #else
1145   KMP_COUNT_BLOCK(OMP_CRITICAL);
1146 #if OMPT_SUPPORT && OMPT_OPTIONAL
1147   ompt_state_t prev_state = ompt_state_undefined;
1148   ompt_thread_info_t ti;
1149 #endif
1150   kmp_user_lock_p lck;
1151
1152   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1153
1154   // TODO: add THR_OVHD_STATE
1155
1156   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1157   KMP_CHECK_USER_LOCK_INIT();
1158
1159   if ((__kmp_user_lock_kind == lk_tas) &&
1160       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1161     lck = (kmp_user_lock_p)crit;
1162   }
1163 #if KMP_USE_FUTEX
1164   else if ((__kmp_user_lock_kind == lk_futex) &&
1165            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1166     lck = (kmp_user_lock_p)crit;
1167   }
1168 #endif
1169   else { // ticket, queuing or drdpa
1170     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1171   }
1172
1173   if (__kmp_env_consistency_check)
1174     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1175
1176 // since the critical directive binds to all threads, not just the current
1177 // team we have to check this even if we are in a serialized team.
1178 // also, even if we are the uber thread, we still have to conduct the lock,
1179 // as we have to contend with sibling threads.
1180
1181 #if USE_ITT_BUILD
1182   __kmp_itt_critical_acquiring(lck);
1183 #endif /* USE_ITT_BUILD */
1184 #if OMPT_SUPPORT && OMPT_OPTIONAL
1185   OMPT_STORE_RETURN_ADDRESS(gtid);
1186   void *codeptr_ra = NULL;
1187   if (ompt_enabled.enabled) {
1188     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1189     /* OMPT state update */
1190     prev_state = ti.state;
1191     ti.wait_id = (ompt_wait_id_t)lck;
1192     ti.state = ompt_state_wait_critical;
1193
1194     /* OMPT event callback */
1195     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1196     if (ompt_enabled.ompt_callback_mutex_acquire) {
1197       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1198           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1199           (ompt_wait_id_t)crit, codeptr_ra);
1200     }
1201   }
1202 #endif
1203   // Value of 'crit' should be good for using as a critical_id of the critical
1204   // section directive.
1205   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1206
1207 #if USE_ITT_BUILD
1208   __kmp_itt_critical_acquired(lck);
1209 #endif /* USE_ITT_BUILD */
1210 #if OMPT_SUPPORT && OMPT_OPTIONAL
1211   if (ompt_enabled.enabled) {
1212     /* OMPT state update */
1213     ti.state = prev_state;
1214     ti.wait_id = 0;
1215
1216     /* OMPT event callback */
1217     if (ompt_enabled.ompt_callback_mutex_acquired) {
1218       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1219           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1220     }
1221   }
1222 #endif
1223   KMP_POP_PARTITIONED_TIMER();
1224
1225   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1226   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1227 #endif // KMP_USE_DYNAMIC_LOCK
1228 }
1229
1230 #if KMP_USE_DYNAMIC_LOCK
1231
1232 // Converts the given hint to an internal lock implementation
1233 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1234 #if KMP_USE_TSX
1235 #define KMP_TSX_LOCK(seq) lockseq_##seq
1236 #else
1237 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1238 #endif
1239
1240 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1241 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1242 #else
1243 #define KMP_CPUINFO_RTM 0
1244 #endif
1245
1246   // Hints that do not require further logic
1247   if (hint & kmp_lock_hint_hle)
1248     return KMP_TSX_LOCK(hle);
1249   if (hint & kmp_lock_hint_rtm)
1250     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1251   if (hint & kmp_lock_hint_adaptive)
1252     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1253
1254   // Rule out conflicting hints first by returning the default lock
1255   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1256     return __kmp_user_lock_seq;
1257   if ((hint & omp_lock_hint_speculative) &&
1258       (hint & omp_lock_hint_nonspeculative))
1259     return __kmp_user_lock_seq;
1260
1261   // Do not even consider speculation when it appears to be contended
1262   if (hint & omp_lock_hint_contended)
1263     return lockseq_queuing;
1264
1265   // Uncontended lock without speculation
1266   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1267     return lockseq_tas;
1268
1269   // HLE lock for speculation
1270   if (hint & omp_lock_hint_speculative)
1271     return KMP_TSX_LOCK(hle);
1272
1273   return __kmp_user_lock_seq;
1274 }
1275
1276 #if OMPT_SUPPORT && OMPT_OPTIONAL
1277 #if KMP_USE_DYNAMIC_LOCK
     // Maps a lock to the mutex implementation class reported through OMPT.
     // user_lock: when non-null, its direct-lock tag is inspected first; a tag
     //            of 0 means the indirect lock is looked up from user_lock.
     // ilock:     indirect lock to classify when user_lock is null; asserted to
     //            be non-null by the time it is used.
     // Returns kmp_mutex_impl_none for any tag or type not listed below.
1278 static kmp_mutex_impl_t
1279 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1280   if (user_lock) {
1281     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1282     case 0:
           // Not a direct lock: continue with the indirect lookup after the switch.
1283       break;
1284 #if KMP_USE_FUTEX
1285     case locktag_futex:
1286       return kmp_mutex_impl_queuing;
1287 #endif
1288     case locktag_tas:
1289       return kmp_mutex_impl_spin;
1290 #if KMP_USE_TSX
1291     case locktag_hle:
1292       return kmp_mutex_impl_speculative;
1293 #endif
1294     default:
1295       return kmp_mutex_impl_none;
1296     }
1297     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1298   }
1299   KMP_ASSERT(ilock);
       // Classify the indirect lock by its recorded type.
1300   switch (ilock->type) {
1301 #if KMP_USE_TSX
1302   case locktag_adaptive:
1303   case locktag_rtm:
1304     return kmp_mutex_impl_speculative;
1305 #endif
1306   case locktag_nested_tas:
1307     return kmp_mutex_impl_spin;
1308 #if KMP_USE_FUTEX
1309   case locktag_nested_futex:
1310 #endif
1311   case locktag_ticket:
1312   case locktag_queuing:
1313   case locktag_drdpa:
1314   case locktag_nested_ticket:
1315   case locktag_nested_queuing:
1316   case locktag_nested_drdpa:
1317     return kmp_mutex_impl_queuing;
1318   default:
1319     return kmp_mutex_impl_none;
1320   }
1321 }
1322 #else
1323 // For locks without dynamic binding: OMPT mutex implementation class of
1324 // the globally selected __kmp_user_lock_kind.
1325 // NOTE(review): this #else branch sits inside an enclosing
1326 // "#if KMP_USE_DYNAMIC_LOCK" region, so it looks unreachable - confirm.
1327 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1328   const auto kind = __kmp_user_lock_kind;
1329
1330   if (kind == lk_tas)
1331     return kmp_mutex_impl_spin;
1332   bool queuing = kind == lk_ticket || kind == lk_queuing || kind == lk_drdpa;
1333 #if KMP_USE_FUTEX
1334   queuing = queuing || kind == lk_futex;
1335 #endif
1336   if (queuing)
1337     return kmp_mutex_impl_queuing;
1338 #if KMP_USE_TSX
1339   if (kind == lk_hle || kind == lk_rtm || kind == lk_adaptive)
1340     return kmp_mutex_impl_speculative;
1341 #endif
1342
1343   return kmp_mutex_impl_none;
1344 }
1345 #endif // KMP_USE_DYNAMIC_LOCK
1346 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1347
1348 /*!
1349 @ingroup WORK_SHARING
1350 @param loc  source location information.
1351 @param global_tid  global thread number.
1352 @param crit identity of the critical section. This could be a pointer to a lock
1353 associated with the critical section, or some other suitably unique value.
1354 @param hint the lock hint.
1355
1356 Enter code protected by a `critical` construct with a hint. The hint value is
1357 used to suggest a lock implementation. This function blocks until the executing
1358 thread can enter the critical section unless the hint suggests use of
1359 speculative execution and the hardware supports it.
1360 */
1361 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1362                                kmp_critical_name *crit, uint32_t hint) {
1363   KMP_COUNT_BLOCK(OMP_CRITICAL);
1364   kmp_user_lock_p lck;
1365 #if OMPT_SUPPORT && OMPT_OPTIONAL
1366   ompt_state_t prev_state = ompt_state_undefined;
1367   ompt_thread_info_t ti;
1368   // This is the case, if called from __kmpc_critical:
1369   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1370   if (!codeptr)
1371     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1372 #endif
1373
1374   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1375
1376   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1377   // Check if it is initialized.
1378   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1379   if (*lk == 0) {
1380     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1381     if (KMP_IS_D_LOCK(lckseq)) {
1382       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1383                                   KMP_GET_D_TAG(lckseq));
1384     } else {
1385       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1386     }
1387   }
1388   // Branch for accessing the actual lock object and set operation. This
1389   // branching is inevitable since this lock initialization does not follow the
1390   // normal dispatch path (lock table is not used).
1391   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1392     lck = (kmp_user_lock_p)lk;
1393     if (__kmp_env_consistency_check) {
1394       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1395                       __kmp_map_hint_to_lock(hint));
1396     }
1397 #if USE_ITT_BUILD
1398     __kmp_itt_critical_acquiring(lck);
1399 #endif
1400 #if OMPT_SUPPORT && OMPT_OPTIONAL
1401     if (ompt_enabled.enabled) {
1402       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1403       /* OMPT state update */
1404       prev_state = ti.state;
1405       ti.wait_id = (ompt_wait_id_t)lck;
1406       ti.state = ompt_state_wait_critical;
1407
1408       /* OMPT event callback */
1409       if (ompt_enabled.ompt_callback_mutex_acquire) {
1410         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1411             ompt_mutex_critical, (unsigned int)hint,
1412             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
1413       }
1414     }
1415 #endif
1416 #if KMP_USE_INLINED_TAS
1417     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1418       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1419     } else
1420 #elif KMP_USE_INLINED_FUTEX
1421     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1422       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1423     } else
1424 #endif
1425     {
1426       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1427     }
1428   } else {
1429     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1430     lck = ilk->lock;
1431     if (__kmp_env_consistency_check) {
1432       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1433                       __kmp_map_hint_to_lock(hint));
1434     }
1435 #if USE_ITT_BUILD
1436     __kmp_itt_critical_acquiring(lck);
1437 #endif
1438 #if OMPT_SUPPORT && OMPT_OPTIONAL
1439     if (ompt_enabled.enabled) {
1440       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1441       /* OMPT state update */
1442       prev_state = ti.state;
1443       ti.wait_id = (ompt_wait_id_t)lck;
1444       ti.state = ompt_state_wait_critical;
1445
1446       /* OMPT event callback */
1447       if (ompt_enabled.ompt_callback_mutex_acquire) {
1448         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1449             ompt_mutex_critical, (unsigned int)hint,
1450             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
1451       }
1452     }
1453 #endif
1454     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1455   }
1456   KMP_POP_PARTITIONED_TIMER();
1457
1458 #if USE_ITT_BUILD
1459   __kmp_itt_critical_acquired(lck);
1460 #endif /* USE_ITT_BUILD */
1461 #if OMPT_SUPPORT && OMPT_OPTIONAL
1462   if (ompt_enabled.enabled) {
1463     /* OMPT state update */
1464     ti.state = prev_state;
1465     ti.wait_id = 0;
1466
1467     /* OMPT event callback */
1468     if (ompt_enabled.ompt_callback_mutex_acquired) {
1469       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1470           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
1471     }
1472   }
1473 #endif
1474
1475   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1476   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1477 } // __kmpc_critical_with_hint
1478
1479 #endif // KMP_USE_DYNAMIC_LOCK
1480
1481 /*!
1482 @ingroup WORK_SHARING
1483 @param loc  source location information.
1484 @param global_tid  global thread number .
1485 @param crit identity of the critical section. This could be a pointer to a lock
1486 associated with the critical section, or some other suitably unique value.
1487
1488 Leave a critical section, releasing any lock that was held during its execution.
1489 */
1490 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1491                          kmp_critical_name *crit) {
1492   kmp_user_lock_p lck;
1493
1494   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1495
1496 #if KMP_USE_DYNAMIC_LOCK
1497   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1498     lck = (kmp_user_lock_p)crit;
1499     KMP_ASSERT(lck != NULL);
1500     if (__kmp_env_consistency_check) {
1501       __kmp_pop_sync(global_tid, ct_critical, loc);
1502     }
1503 #if USE_ITT_BUILD
1504     __kmp_itt_critical_releasing(lck);
1505 #endif
1506 #if KMP_USE_INLINED_TAS
1507     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1508       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1509     } else
1510 #elif KMP_USE_INLINED_FUTEX
1511     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1512       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1513     } else
1514 #endif
1515     {
1516       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1517     }
1518   } else {
1519     kmp_indirect_lock_t *ilk =
1520         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1521     KMP_ASSERT(ilk != NULL);
1522     lck = ilk->lock;
1523     if (__kmp_env_consistency_check) {
1524       __kmp_pop_sync(global_tid, ct_critical, loc);
1525     }
1526 #if USE_ITT_BUILD
1527     __kmp_itt_critical_releasing(lck);
1528 #endif
1529     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1530   }
1531
1532 #else // KMP_USE_DYNAMIC_LOCK
1533
1534   if ((__kmp_user_lock_kind == lk_tas) &&
1535       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1536     lck = (kmp_user_lock_p)crit;
1537   }
1538 #if KMP_USE_FUTEX
1539   else if ((__kmp_user_lock_kind == lk_futex) &&
1540            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1541     lck = (kmp_user_lock_p)crit;
1542   }
1543 #endif
1544   else { // ticket, queuing or drdpa
1545     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1546   }
1547
1548   KMP_ASSERT(lck != NULL);
1549
1550   if (__kmp_env_consistency_check)
1551     __kmp_pop_sync(global_tid, ct_critical, loc);
1552
1553 #if USE_ITT_BUILD
1554   __kmp_itt_critical_releasing(lck);
1555 #endif /* USE_ITT_BUILD */
1556   // Value of 'crit' should be good for using as a critical_id of the critical
1557   // section directive.
1558   __kmp_release_user_lock_with_checks(lck, global_tid);
1559
1560 #endif // KMP_USE_DYNAMIC_LOCK
1561
1562 #if OMPT_SUPPORT && OMPT_OPTIONAL
1563   /* OMPT release event triggers after lock is released; place here to trigger
1564    * for all #if branches */
1565   OMPT_STORE_RETURN_ADDRESS(global_tid);
1566   if (ompt_enabled.ompt_callback_mutex_released) {
1567     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1568         ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1569   }
1570 #endif
1571
1572   KMP_POP_PARTITIONED_TIMER();
1573   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1574 }
1575
1576 /*!
1577 @ingroup SYNCHRONIZATION
1578 @param loc source location information
1579 @param global_tid thread id.
1580 @return one if the thread should execute the master block, zero otherwise
1581
1582 Start execution of a combined barrier and master. The barrier is executed inside
1583 this function.
     The return value is 1 exactly when __kmp_barrier() returned zero.
1584 */
1585 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1586   int status;
1587
1588   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1589
       // Bring up the parallel part of the runtime if that has not happened yet.
1590   if (!TCR_4(__kmp_init_parallel))
1591     __kmp_parallel_initialize();
1592
1593   if (__kmp_env_consistency_check)
1594     __kmp_check_barrier(global_tid, ct_barrier, loc);
1595
1596 #if OMPT_SUPPORT
1597   ompt_frame_t *ompt_frame;
1598   if (ompt_enabled.enabled) {
         // Record the enter frame only if no one has set it already.
1599     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1600     if (ompt_frame->enter_frame.ptr == NULL)
1601       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1602     OMPT_STORE_RETURN_ADDRESS(global_tid);
1603   }
1604 #endif
1605 #if USE_ITT_NOTIFY
1606   __kmp_threads[global_tid]->th.th_ident = loc;
1607 #endif
1608   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
       // NOTE(review): the frame is set under "#if OMPT_SUPPORT" but only reset
       // when OMPT_OPTIONAL is also defined - confirm this is intended.
1609 #if OMPT_SUPPORT && OMPT_OPTIONAL
1610   if (ompt_enabled.enabled) {
1611     ompt_frame->enter_frame = ompt_data_none;
1612   }
1613 #endif
1614
1615   return (status != 0) ? 0 : 1;
1616 }
1617
1618 /*!
1619 @ingroup SYNCHRONIZATION
1620 @param loc source location information
1621 @param global_tid thread id.
1622
1623 Complete the execution of a combined barrier and master. This function should
1624 only be called at the completion of the <tt>master</tt> code. Other threads will
1625 still be waiting at the barrier and this call releases them.
     The release is done by __kmp_end_split_barrier() on the plain barrier;
     <tt>loc</tt> is not used by this function.
1626 */
1627 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1628   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1629
1630   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1631 }
1632
1633 /*!
1634 @ingroup SYNCHRONIZATION
1635 @param loc source location information
1636 @param global_tid thread id.
1637 @return one if the thread should execute the master block, zero otherwise
1638
Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the actions of
__kmpc_end_master are performed inside this function.
1642 */
1643 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1644   kmp_int32 ret;
1645
1646   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1647
1648   if (!TCR_4(__kmp_init_parallel))
1649     __kmp_parallel_initialize();
1650
1651   if (__kmp_env_consistency_check) {
1652     if (loc == 0) {
1653       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1654     }
1655     __kmp_check_barrier(global_tid, ct_barrier, loc);
1656   }
1657
1658 #if OMPT_SUPPORT
1659   ompt_frame_t *ompt_frame;
1660   if (ompt_enabled.enabled) {
1661     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1662     if (ompt_frame->enter_frame.ptr == NULL)
1663       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1664     OMPT_STORE_RETURN_ADDRESS(global_tid);
1665   }
1666 #endif
1667 #if USE_ITT_NOTIFY
1668   __kmp_threads[global_tid]->th.th_ident = loc;
1669 #endif
1670   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1671 #if OMPT_SUPPORT && OMPT_OPTIONAL
1672   if (ompt_enabled.enabled) {
1673     ompt_frame->enter_frame = ompt_data_none;
1674   }
1675 #endif
1676
1677   ret = __kmpc_master(loc, global_tid);
1678
1679   if (__kmp_env_consistency_check) {
1680     /*  there's no __kmpc_end_master called; so the (stats) */
1681     /*  actions of __kmpc_end_master are done here          */
1682
1683     if (global_tid < 0) {
1684       KMP_WARNING(ThreadIdentInvalid);
1685     }
1686     if (ret) {
1687       /* only one thread should do the pop since only */
1688       /* one did the push (see __kmpc_master())       */
1689
1690       __kmp_pop_sync(global_tid, ct_master, loc);
1691     }
1692   }
1693
1694   return (ret);
1695 }
1696
1697 /* The BARRIER for a SINGLE process section is always explicit   */
1698 /*!
1699 @ingroup WORK_SHARING
1700 @param loc  source location information
1701 @param global_tid  global thread number
1702 @return One if this thread should execute the single construct, zero otherwise.
1703
1704 Test whether to execute a <tt>single</tt> construct.
1705 There are no implicit barriers in the two "single" calls, rather the compiler
1706 should introduce an explicit barrier if it is required.
1707 */
1708
1709 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1710   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1711
1712   if (rc) {
1713     // We are going to execute the single statement, so we should count it.
1714     KMP_COUNT_BLOCK(OMP_SINGLE);
1715     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1716   }
1717
1718 #if OMPT_SUPPORT && OMPT_OPTIONAL
1719   kmp_info_t *this_thr = __kmp_threads[global_tid];
1720   kmp_team_t *team = this_thr->th.th_team;
1721   int tid = __kmp_tid_from_gtid(global_tid);
1722
1723   if (ompt_enabled.enabled) {
1724     if (rc) {
1725       if (ompt_enabled.ompt_callback_work) {
1726         ompt_callbacks.ompt_callback(ompt_callback_work)(
1727             ompt_work_single_executor, ompt_scope_begin,
1728             &(team->t.ompt_team_info.parallel_data),
1729             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1730             1, OMPT_GET_RETURN_ADDRESS(0));
1731       }
1732     } else {
1733       if (ompt_enabled.ompt_callback_work) {
1734         ompt_callbacks.ompt_callback(ompt_callback_work)(
1735             ompt_work_single_other, ompt_scope_begin,
1736             &(team->t.ompt_team_info.parallel_data),
1737             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1738             1, OMPT_GET_RETURN_ADDRESS(0));
1739         ompt_callbacks.ompt_callback(ompt_callback_work)(
1740             ompt_work_single_other, ompt_scope_end,
1741             &(team->t.ompt_team_info.parallel_data),
1742             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1743             1, OMPT_GET_RETURN_ADDRESS(0));
1744       }
1745     }
1746   }
1747 #endif
1748
1749   return rc;
1750 }
1751
1752 /*!
1753 @ingroup WORK_SHARING
1754 @param loc  source location information
1755 @param global_tid  global thread number
1756
1757 Mark the end of a <tt>single</tt> construct.  This function should
1758 only be called by the thread that executed the block of code protected
1759 by the `single` construct.
1760 */
1761 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1762   __kmp_exit_single(global_tid);
1763   KMP_POP_PARTITIONED_TIMER();
1764
1765 #if OMPT_SUPPORT && OMPT_OPTIONAL
1766   kmp_info_t *this_thr = __kmp_threads[global_tid];
1767   kmp_team_t *team = this_thr->th.th_team;
1768   int tid = __kmp_tid_from_gtid(global_tid);
1769
1770   if (ompt_enabled.ompt_callback_work) {
1771     ompt_callbacks.ompt_callback(ompt_callback_work)(
1772         ompt_work_single_executor, ompt_scope_end,
1773         &(team->t.ompt_team_info.parallel_data),
1774         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1775         OMPT_GET_RETURN_ADDRESS(0));
1776   }
1777 #endif
1778 }
1779
1780 /*!
1781 @ingroup WORK_SHARING
1782 @param loc Source location
1783 @param global_tid Global thread id
1784
1785 Mark the end of a statically scheduled loop.
1786 */
1787 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1788   KMP_POP_PARTITIONED_TIMER();
1789   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1790
1791 #if OMPT_SUPPORT && OMPT_OPTIONAL
1792   if (ompt_enabled.ompt_callback_work) {
1793     ompt_work_t ompt_work_type = ompt_work_loop;
1794     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1795     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1796     // Determine workshare type
1797     if (loc != NULL) {
1798       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1799         ompt_work_type = ompt_work_loop;
1800       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1801         ompt_work_type = ompt_work_sections;
1802       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1803         ompt_work_type = ompt_work_distribute;
1804       } else {
1805         // use default set above.
1806         // a warning about this case is provided in __kmpc_for_static_init
1807       }
1808       KMP_DEBUG_ASSERT(ompt_work_type);
1809     }
1810     ompt_callbacks.ompt_callback(ompt_callback_work)(
1811         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1812         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1813   }
1814 #endif
1815   if (__kmp_env_consistency_check)
1816     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1817 }
1818
1819 // User routines which take C-style arguments (call by value)
1820 // different from the Fortran equivalent routines
1821
1822 void ompc_set_num_threads(int arg) {
1823   // !!!!! TODO: check the per-task binding
1824   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1825 }
1826
1827 void ompc_set_dynamic(int flag) {
1828   kmp_info_t *thread;
1829
1830   /* For the thread-private implementation of the internal controls */
1831   thread = __kmp_entry_thread();
1832
1833   __kmp_save_internal_controls(thread);
1834
1835   set__dynamic(thread, flag ? TRUE : FALSE);
1836 }
1837
1838 void ompc_set_nested(int flag) {
1839   kmp_info_t *thread;
1840
1841   /* For the thread-private internal controls implementation */
1842   thread = __kmp_entry_thread();
1843
1844   __kmp_save_internal_controls(thread);
1845
1846   set__nested(thread, flag ? TRUE : FALSE);
1847 }
1848
1849 void ompc_set_max_active_levels(int max_active_levels) {
1850   /* TO DO */
1851   /* we want per-task implementation of this internal control */
1852
1853   /* For the per-thread internal controls implementation */
1854   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1855 }
1856
1857 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1858   // !!!!! TODO: check the per-task binding
1859   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1860 }
1861
1862 int ompc_get_ancestor_thread_num(int level) {
1863   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1864 }
1865
1866 int ompc_get_team_size(int level) {
1867   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1868 }
1869
1870 #if OMP_50_ENABLED
1871 /* OpenMP 5.0 Affinity Format API */
1872
1873 void ompc_set_affinity_format(char const *format) {
1874   if (!__kmp_init_serial) {
1875     __kmp_serial_initialize();
1876   }
1877   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1878                          format, KMP_STRLEN(format) + 1);
1879 }
1880
1881 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1882   size_t format_size;
1883   if (!__kmp_init_serial) {
1884     __kmp_serial_initialize();
1885   }
1886   format_size = KMP_STRLEN(__kmp_affinity_format);
1887   if (buffer && size) {
1888     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1889                            format_size + 1);
1890   }
1891   return format_size;
1892 }
1893
1894 void ompc_display_affinity(char const *format) {
1895   int gtid;
1896   if (!TCR_4(__kmp_init_middle)) {
1897     __kmp_middle_initialize();
1898   }
1899   gtid = __kmp_get_gtid();
1900   __kmp_aux_display_affinity(gtid, format);
1901 }
1902
1903 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1904                              char const *format) {
1905   int gtid;
1906   size_t num_required;
1907   kmp_str_buf_t capture_buf;
1908   if (!TCR_4(__kmp_init_middle)) {
1909     __kmp_middle_initialize();
1910   }
1911   gtid = __kmp_get_gtid();
1912   __kmp_str_buf_init(&capture_buf);
1913   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1914   if (buffer && buf_size) {
1915     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1916                            capture_buf.used + 1);
1917   }
1918   __kmp_str_buf_free(&capture_buf);
1919   return num_required;
1920 }
1921 #endif /* OMP_50_ENABLED */
1922
/* int-argument variant: forwards arg unchanged to __kmp_aux_set_stacksize. */
void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1927
/* size_t-argument variant: forwards arg unchanged to
   __kmp_aux_set_stacksize. */
void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1932
1933 void kmpc_set_blocktime(int arg) {
1934   int gtid, tid;
1935   kmp_info_t *thread;
1936
1937   gtid = __kmp_entry_gtid();
1938   tid = __kmp_tid_from_gtid(gtid);
1939   thread = __kmp_thread_from_gtid(gtid);
1940
1941   __kmp_aux_set_blocktime(arg, thread, tid);
1942 }
1943
/* Forwards arg, cast to enum library_type, to __kmp_user_set_library. */
void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}
1948
/* Forwards str and its length to __kmp_aux_set_defaults. str is passed to
   KMP_STRLEN without a NULL check. */
void kmpc_set_defaults(char const *str) {
  // __kmp_aux_set_defaults initializes the library if needed
  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
}
1953
1954 void kmpc_set_disp_num_buffers(int arg) {
1955   // ignore after initialization because some teams have already
1956   // allocated dispatch buffers
1957   if (__kmp_init_serial == 0 && arg > 0)
1958     __kmp_dispatch_num_buffers = arg;
1959 }
1960
/* Forward to __kmp_aux_set_affinity_mask_proc after making sure middle
   initialization has run. Stub builds and builds without affinity support
   return -1 unconditionally. */
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#else
  // Stub library or no affinity support in this build.
  return -1;
#endif
}
1971
/* Forward to __kmp_aux_unset_affinity_mask_proc after making sure middle
   initialization has run. Stub builds and builds without affinity support
   return -1 unconditionally. */
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#else
  // Stub library or no affinity support in this build.
  return -1;
#endif
}
1982
/* Forward to __kmp_aux_get_affinity_mask_proc after making sure middle
   initialization has run. Stub builds and builds without affinity support
   return -1 unconditionally. */
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#else
  // Stub library or no affinity support in this build.
  return -1;
#endif
}
1993
1994 /* -------------------------------------------------------------------------- */
1995 /*!
1996 @ingroup THREADPRIVATE
1997 @param loc       source location information
1998 @param gtid      global thread number
1999 @param cpy_size  size of the cpy_data buffer
2000 @param cpy_data  pointer to data to be copied
2001 @param cpy_func  helper function to call for copying data
2002 @param didit     flag variable: 1=single thread; 0=not single thread
2003
2004 __kmpc_copyprivate implements the interface for the private data broadcast
2005 needed for the copyprivate clause associated with a single region in an
2006 OpenMP<sup>*</sup> program (both C and Fortran).
2007 All threads participating in the parallel region call this routine.
2008 One of the threads (called the single thread) should have the <tt>didit</tt>
2009 variable set to 1 and all other threads should have that variable set to 0.
2010 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2011
2012 The OpenMP specification forbids the use of nowait on the single region when a
2013 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2014 barrier internally to avoid race conditions, so the code generation for the
2015 single region should avoid generating a barrier after the call to @ref
2016 __kmpc_copyprivate.
2017
2018 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2019 The <tt>loc</tt> parameter is a pointer to source location information.
2020
2021 Internal implementation: The single thread will first copy its descriptor
2022 address (cpy_data) to a team-private location, then the other threads will each
2023 call the function pointed to by the parameter cpy_func, which carries out the
2024 copy by copying the data using the cpy_data buffer.
2025
2026 The cpy_func routine used for the copy and the contents of the data area defined
2027 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2028 to be done. For instance, the cpy_data buffer can hold the actual data to be
2029 copied or it may hold a list of pointers to the data. The cpy_func routine must
2030 interpret the cpy_data buffer appropriately.
2031
2032 The interface to cpy_func is as follows:
2033 @code
2034 void cpy_func( void *destination, void *source )
2035 @endcode
2036 where void *destination is the cpy_data pointer for the thread being copied to
2037 and void *source is the cpy_data pointer for the thread being copied from.
2038 */
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  // Address of the team-level slot through which the single thread's
  // cpy_data pointer is handed to the other threads.
  // Note: cpy_size is not referenced anywhere in this function.
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));

  KMP_MB();

  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  // A missing source location only produces a warning; execution continues.
  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  // The thread that executed the single region publishes its buffer pointer.
  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    // Only record our frame address when no enter frame has been set yet.
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // First barrier: the published pointer is written before it and read only
  // after it.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Every thread other than the single thread copies from the published
  // buffer into its own (destination first, source second).
  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

// Consider next barrier a user-visible barrier for barrier region boundaries
// Nesting checks are already handled by the single construct checks

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
  // Second barrier: presumably keeps any thread from leaving (and reusing
  // the source buffer) before all copies are done -- confirm.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    // The enter frame is cleared again once both barriers have completed.
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
2098
2099 /* -------------------------------------------------------------------------- */
2100
// Short aliases for the "*_with_checks" user-lock routines. The uses visible
// in the lock init/destroy routines below are all in the non-dynamic-lock
// (#else of KMP_USE_DYNAMIC_LOCK) code paths.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2114
2115 // TODO: Make check abort messages use location info & pass it into
2116 // with_checks routines
2117
2118 #if KMP_USE_DYNAMIC_LOCK
2119
2120 // internal lock initializer
2121 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2122                                                     kmp_dyna_lockseq_t seq) {
2123   if (KMP_IS_D_LOCK(seq)) {
2124     KMP_INIT_D_LOCK(lock, seq);
2125 #if USE_ITT_BUILD
2126     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2127 #endif
2128   } else {
2129     KMP_INIT_I_LOCK(lock, seq);
2130 #if USE_ITT_BUILD
2131     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2132     __kmp_itt_lock_creating(ilk->lock, loc);
2133 #endif
2134   }
2135 }
2136
2137 // internal nest lock initializer
2138 static __forceinline void
2139 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2140                                kmp_dyna_lockseq_t seq) {
2141 #if KMP_USE_TSX
2142   // Don't have nested lock implementation for speculative locks
2143   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2144     seq = __kmp_user_lock_seq;
2145 #endif
2146   switch (seq) {
2147   case lockseq_tas:
2148     seq = lockseq_nested_tas;
2149     break;
2150 #if KMP_USE_FUTEX
2151   case lockseq_futex:
2152     seq = lockseq_nested_futex;
2153     break;
2154 #endif
2155   case lockseq_ticket:
2156     seq = lockseq_nested_ticket;
2157     break;
2158   case lockseq_queuing:
2159     seq = lockseq_nested_queuing;
2160     break;
2161   case lockseq_drdpa:
2162     seq = lockseq_nested_drdpa;
2163     break;
2164   default:
2165     seq = lockseq_nested_queuing;
2166   }
2167   KMP_INIT_I_LOCK(lock, seq);
2168 #if USE_ITT_BUILD
2169   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2170   __kmp_itt_lock_creating(ilk->lock, loc);
2171 #endif
2172 }
2173
2174 /* initialize the lock with a hint */
2175 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2176                                 uintptr_t hint) {
2177   KMP_DEBUG_ASSERT(__kmp_init_serial);
2178   if (__kmp_env_consistency_check && user_lock == NULL) {
2179     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2180   }
2181
2182   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2183
2184 #if OMPT_SUPPORT && OMPT_OPTIONAL
2185   // This is the case, if called from omp_init_lock_with_hint:
2186   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2187   if (!codeptr)
2188     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2189   if (ompt_enabled.ompt_callback_lock_init) {
2190     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2191         ompt_mutex_lock, (omp_lock_hint_t)hint,
2192         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2193         codeptr);
2194   }
2195 #endif
2196 }
2197
/* initialize the nest lock with a hint */
2199 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2200                                      void **user_lock, uintptr_t hint) {
2201   KMP_DEBUG_ASSERT(__kmp_init_serial);
2202   if (__kmp_env_consistency_check && user_lock == NULL) {
2203     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2204   }
2205
2206   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2207
2208 #if OMPT_SUPPORT && OMPT_OPTIONAL
2209   // This is the case, if called from omp_init_lock_with_hint:
2210   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2211   if (!codeptr)
2212     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2213   if (ompt_enabled.ompt_callback_lock_init) {
2214     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2215         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2216         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2217         codeptr);
2218   }
2219 #endif
2220 }
2221
2222 #endif // KMP_USE_DYNAMIC_LOCK
2223
2224 /* initialize the lock */
void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // With consistency checking enabled a NULL lock address is fatal.
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
  }
  // No hint available: use the default user lock sequence.
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address loaded for this thread if there is one; otherwise
  // fall back to this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // With consistency checking enabled a NULL lock address is fatal.
  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // TAS (and futex) locks whose poll word fits into OMP_LOCK_T_SIZE live
  // directly in the user's lock variable; every other case gets a separately
  // allocated lock object. (lck is only used inside sizeof here, so it is not
  // read before being assigned.)
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }
  INIT_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address loaded for this thread if there is one; otherwise
  // fall back to this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_lock
2295
/* initialize the nest lock */
void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // With consistency checking enabled a NULL lock address is fatal.
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
  }
  // No hint available: start from the default user lock sequence.
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address loaded for this thread if there is one; otherwise
  // fall back to this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_nest_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // With consistency checking enabled a NULL lock address is fatal.
  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // TAS (and futex) locks whose poll word plus nesting depth fit into
  // OMP_NEST_LOCK_T_SIZE live directly in the user's lock variable; every
  // other case gets a separately allocated lock object. (lck is only used
  // inside sizeof here, so it is not read before being assigned.)
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }

  INIT_NESTED_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address loaded for this thread if there is one; otherwise
  // fall back to this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_nest_lock
2370
2371 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2372 #if KMP_USE_DYNAMIC_LOCK
2373
2374 #if USE_ITT_BUILD
2375   kmp_user_lock_p lck;
2376   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2377     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2378   } else {
2379     lck = (kmp_user_lock_p)user_lock;
2380   }
2381   __kmp_itt_lock_destroyed(lck);
2382 #endif
2383 #if OMPT_SUPPORT && OMPT_OPTIONAL
2384   // This is the case, if called from omp_init_lock_with_hint:
2385   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2386   if (!codeptr)
2387     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2388   if (ompt_enabled.ompt_callback_lock_destroy) {
2389     kmp_user_lock_p lck;
2390     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2391       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2392     } else {
2393       lck = (kmp_user_lock_p)user_lock;
2394     }
2395     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2396         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2397   }
2398 #endif
2399   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2400 #else
2401   kmp_user_lock_p lck;
2402
2403   if ((__kmp_user_lock_kind == lk_tas) &&
2404       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2405     lck = (kmp_user_lock_p)user_lock;
2406   }
2407 #if KMP_USE_FUTEX
2408   else if ((__kmp_user_lock_kind == lk_futex) &&
2409            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2410     lck = (kmp_user_lock_p)user_lock;
2411   }
2412 #endif
2413   else {
2414     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2415   }
2416
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
2418   // This is the case, if called from omp_init_lock_with_hint:
2419   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2420   if (!codeptr)
2421     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2422   if (ompt_enabled.ompt_callback_lock_destroy) {
2423     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2424         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2425   }
2426 #endif
2427
2428 #if USE_ITT_BUILD
2429   __kmp_itt_lock_destroyed(lck);
2430 #endif /* USE_ITT_BUILD */
2431   DESTROY_LOCK(lck);
2432
2433   if ((__kmp_user_lock_kind == lk_tas) &&
2434       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2435     ;
2436   }
2437 #if KMP_USE_FUTEX
2438   else if ((__kmp_user_lock_kind == lk_futex) &&
2439            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2440     ;
2441   }
2442 #endif
2443   else {
2444     __kmp_user_lock_free(user_lock, gtid, lck);
2445   }
2446 #endif // KMP_USE_DYNAMIC_LOCK
2447 } // __kmpc_destroy_lock
2448
/* destroy the nest lock */
void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  // The lock is looked up as an indirect lock unconditionally here.
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
  __kmp_itt_lock_destroyed(ilk->lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address loaded for this thread if there is one; otherwise
  // fall back to this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // TAS (and futex) locks whose poll word plus nesting depth fit into
  // OMP_NEST_LOCK_T_SIZE live directly in the user's lock variable; every
  // other case is looked up. (lck is only used inside sizeof here, so it is
  // not read before being assigned.)
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address loaded for this thread if there is one; otherwise
  // fall back to this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */

  DESTROY_NESTED_LOCK(lck);

  // Only separately obtained lock objects are freed; locks stored directly
  // in the user's variable have nothing to release.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_nest_lock
2523
2524 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2525   KMP_COUNT_BLOCK(OMP_set_lock);
2526 #if KMP_USE_DYNAMIC_LOCK
2527   int tag = KMP_EXTRACT_D_TAG(user_lock);
2528 #if USE_ITT_BUILD
2529   __kmp_itt_lock_acquiring(
2530       (kmp_user_lock_p)
2531           user_lock); // itt function will get to the right lock object.
2532 #endif
2533 #if OMPT_SUPPORT && OMPT_OPTIONAL
2534   // This is the case, if called from omp_init_lock_with_hint:
2535   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2536   if (!codeptr)
2537     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2538   if (ompt_enabled.ompt_callback_mutex_acquire) {
2539     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2540         ompt_mutex_lock, omp_lock_hint_none,
2541         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2542         codeptr);
2543   }
2544 #endif
2545 #if KMP_USE_INLINED_TAS
2546   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2547     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2548   } else
2549 #elif KMP_USE_INLINED_FUTEX
2550   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2551     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2552   } else
2553 #endif
2554   {
2555     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2556   }
2557 #if USE_ITT_BUILD
2558   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2559 #endif
2560 #if OMPT_SUPPORT && OMPT_OPTIONAL
2561   if (ompt_enabled.ompt_callback_mutex_acquired) {
2562     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2563         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2564   }
2565 #endif
2566
2567 #else // KMP_USE_DYNAMIC_LOCK
2568
2569   kmp_user_lock_p lck;
2570
2571   if ((__kmp_user_lock_kind == lk_tas) &&
2572       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2573     lck = (kmp_user_lock_p)user_lock;
2574   }
2575 #if KMP_USE_FUTEX
2576   else if ((__kmp_user_lock_kind == lk_futex) &&
2577            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2578     lck = (kmp_user_lock_p)user_lock;
2579   }
2580 #endif
2581   else {
2582     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2583   }
2584
2585 #if USE_ITT_BUILD
2586   __kmp_itt_lock_acquiring(lck);
2587 #endif /* USE_ITT_BUILD */
2588 #if OMPT_SUPPORT && OMPT_OPTIONAL
2589   // This is the case, if called from omp_init_lock_with_hint:
2590   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2591   if (!codeptr)
2592     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2593   if (ompt_enabled.ompt_callback_mutex_acquire) {
2594     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2595         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2596         (ompt_wait_id_t)lck, codeptr);
2597   }
2598 #endif
2599
2600   ACQUIRE_LOCK(lck, gtid);
2601
2602 #if USE_ITT_BUILD
2603   __kmp_itt_lock_acquired(lck);
2604 #endif /* USE_ITT_BUILD */
2605
2606 #if OMPT_SUPPORT && OMPT_OPTIONAL
2607   if (ompt_enabled.ompt_callback_mutex_acquired) {
2608     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2609         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2610   }
2611 #endif
2612
2613 #endif // KMP_USE_DYNAMIC_LOCK
2614 }
2615
2616 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2617 #if KMP_USE_DYNAMIC_LOCK
2618
2619 #if USE_ITT_BUILD
2620   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2621 #endif
2622 #if OMPT_SUPPORT && OMPT_OPTIONAL
2623   // This is the case, if called from omp_init_lock_with_hint:
2624   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2625   if (!codeptr)
2626     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2627   if (ompt_enabled.enabled) {
2628     if (ompt_enabled.ompt_callback_mutex_acquire) {
2629       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2630           ompt_mutex_nest_lock, omp_lock_hint_none,
2631           __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2632           codeptr);
2633     }
2634   }
2635 #endif
2636   int acquire_status =
2637       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2638   (void) acquire_status;
2639 #if USE_ITT_BUILD
2640   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2641 #endif
2642
2643 #if OMPT_SUPPORT && OMPT_OPTIONAL
2644   if (ompt_enabled.enabled) {
2645     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2646       if (ompt_enabled.ompt_callback_mutex_acquired) {
2647         // lock_first
2648         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2649             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2650       }
2651     } else {
2652       if (ompt_enabled.ompt_callback_nest_lock) {
2653         // lock_next
2654         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2655             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
2656       }
2657     }
2658   }
2659 #endif
2660
2661 #else // KMP_USE_DYNAMIC_LOCK
2662   int acquire_status;
2663   kmp_user_lock_p lck;
2664
2665   if ((__kmp_user_lock_kind == lk_tas) &&
2666       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2667        OMP_NEST_LOCK_T_SIZE)) {
2668     lck = (kmp_user_lock_p)user_lock;
2669   }
2670 #if KMP_USE_FUTEX
2671   else if ((__kmp_user_lock_kind == lk_futex) &&
2672            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2673             OMP_NEST_LOCK_T_SIZE)) {
2674     lck = (kmp_user_lock_p)user_lock;
2675   }
2676 #endif
2677   else {
2678     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2679   }
2680
2681 #if USE_ITT_BUILD
2682   __kmp_itt_lock_acquiring(lck);
2683 #endif /* USE_ITT_BUILD */
2684 #if OMPT_SUPPORT && OMPT_OPTIONAL
2685   // This is the case, if called from omp_init_lock_with_hint:
2686   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2687   if (!codeptr)
2688     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2689   if (ompt_enabled.enabled) {
2690     if (ompt_enabled.ompt_callback_mutex_acquire) {
2691       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2692           ompt_mutex_nest_lock, omp_lock_hint_none,
2693           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
2694     }
2695   }
2696 #endif
2697
2698   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2699
2700 #if USE_ITT_BUILD
2701   __kmp_itt_lock_acquired(lck);
2702 #endif /* USE_ITT_BUILD */
2703
2704 #if OMPT_SUPPORT && OMPT_OPTIONAL
2705   if (ompt_enabled.enabled) {
2706     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2707       if (ompt_enabled.ompt_callback_mutex_acquired) {
2708         // lock_first
2709         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2710             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2711       }
2712     } else {
2713       if (ompt_enabled.ompt_callback_nest_lock) {
2714         // lock_next
2715         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2716             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
2717       }
2718     }
2719   }
2720 #endif
2721
2722 #endif // KMP_USE_DYNAMIC_LOCK
2723 }
2724
2725 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2726 #if KMP_USE_DYNAMIC_LOCK
2727
2728   int tag = KMP_EXTRACT_D_TAG(user_lock);
2729 #if USE_ITT_BUILD
2730   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2731 #endif
2732 #if KMP_USE_INLINED_TAS
2733   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2734     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2735   } else
2736 #elif KMP_USE_INLINED_FUTEX
2737   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2738     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2739   } else
2740 #endif
2741   {
2742     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2743   }
2744
2745 #if OMPT_SUPPORT && OMPT_OPTIONAL
2746   // This is the case, if called from omp_init_lock_with_hint:
2747   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2748   if (!codeptr)
2749     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2750   if (ompt_enabled.ompt_callback_mutex_released) {
2751     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2752         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2753   }
2754 #endif
2755
2756 #else // KMP_USE_DYNAMIC_LOCK
2757
2758   kmp_user_lock_p lck;
2759
2760   /* Can't use serial interval since not block structured */
2761   /* release the lock */
2762
2763   if ((__kmp_user_lock_kind == lk_tas) &&
2764       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2765 #if KMP_OS_LINUX &&                                                            \
2766     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2767 // "fast" path implemented to fix customer performance issue
2768 #if USE_ITT_BUILD
2769     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2770 #endif /* USE_ITT_BUILD */
2771     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2772     KMP_MB();
2773
2774 #if OMPT_SUPPORT && OMPT_OPTIONAL
2775     // This is the case, if called from omp_init_lock_with_hint:
2776     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2777     if (!codeptr)
2778       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2779     if (ompt_enabled.ompt_callback_mutex_released) {
2780       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2781           ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2782     }
2783 #endif
2784
2785     return;
2786 #else
2787     lck = (kmp_user_lock_p)user_lock;
2788 #endif
2789   }
2790 #if KMP_USE_FUTEX
2791   else if ((__kmp_user_lock_kind == lk_futex) &&
2792            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2793     lck = (kmp_user_lock_p)user_lock;
2794   }
2795 #endif
2796   else {
2797     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2798   }
2799
2800 #if USE_ITT_BUILD
2801   __kmp_itt_lock_releasing(lck);
2802 #endif /* USE_ITT_BUILD */
2803
2804   RELEASE_LOCK(lck, gtid);
2805
2806 #if OMPT_SUPPORT && OMPT_OPTIONAL
2807   // This is the case, if called from omp_init_lock_with_hint:
2808   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2809   if (!codeptr)
2810     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2811   if (ompt_enabled.ompt_callback_mutex_released) {
2812     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2813         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2814   }
2815 #endif
2816
2817 #endif // KMP_USE_DYNAMIC_LOCK
2818 }
2819
2820 /* release the lock */
2821 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2822 #if KMP_USE_DYNAMIC_LOCK
2823
2824 #if USE_ITT_BUILD
2825   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2826 #endif
2827   int release_status =
2828       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2829   (void) release_status;
2830
2831 #if OMPT_SUPPORT && OMPT_OPTIONAL
2832   // This is the case, if called from omp_init_lock_with_hint:
2833   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2834   if (!codeptr)
2835     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2836   if (ompt_enabled.enabled) {
2837     if (release_status == KMP_LOCK_RELEASED) {
2838       if (ompt_enabled.ompt_callback_mutex_released) {
2839         // release_lock_last
2840         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2841             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2842       }
2843     } else if (ompt_enabled.ompt_callback_nest_lock) {
2844       // release_lock_prev
2845       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2846           ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2847     }
2848   }
2849 #endif
2850
2851 #else // KMP_USE_DYNAMIC_LOCK
2852
2853   kmp_user_lock_p lck;
2854
2855   /* Can't use serial interval since not block structured */
2856
2857   if ((__kmp_user_lock_kind == lk_tas) &&
2858       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2859        OMP_NEST_LOCK_T_SIZE)) {
2860 #if KMP_OS_LINUX &&                                                            \
2861     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2862     // "fast" path implemented to fix customer performance issue
2863     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2864 #if USE_ITT_BUILD
2865     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2866 #endif /* USE_ITT_BUILD */
2867
2868 #if OMPT_SUPPORT && OMPT_OPTIONAL
2869     int release_status = KMP_LOCK_STILL_HELD;
2870 #endif
2871
2872     if (--(tl->lk.depth_locked) == 0) {
2873       TCW_4(tl->lk.poll, 0);
2874 #if OMPT_SUPPORT && OMPT_OPTIONAL
2875       release_status = KMP_LOCK_RELEASED;
2876 #endif
2877     }
2878     KMP_MB();
2879
2880 #if OMPT_SUPPORT && OMPT_OPTIONAL
2881     // This is the case, if called from omp_init_lock_with_hint:
2882     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2883     if (!codeptr)
2884       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2885     if (ompt_enabled.enabled) {
2886       if (release_status == KMP_LOCK_RELEASED) {
2887         if (ompt_enabled.ompt_callback_mutex_released) {
2888           // release_lock_last
2889           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2890               ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2891         }
2892       } else if (ompt_enabled.ompt_callback_nest_lock) {
2893         // release_lock_previous
2894         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2895             ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr);
2896       }
2897     }
2898 #endif
2899
2900     return;
2901 #else
2902     lck = (kmp_user_lock_p)user_lock;
2903 #endif
2904   }
2905 #if KMP_USE_FUTEX
2906   else if ((__kmp_user_lock_kind == lk_futex) &&
2907            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2908             OMP_NEST_LOCK_T_SIZE)) {
2909     lck = (kmp_user_lock_p)user_lock;
2910   }
2911 #endif
2912   else {
2913     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2914   }
2915
2916 #if USE_ITT_BUILD
2917   __kmp_itt_lock_releasing(lck);
2918 #endif /* USE_ITT_BUILD */
2919
2920   int release_status;
2921   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2922 #if OMPT_SUPPORT && OMPT_OPTIONAL
2923   // This is the case, if called from omp_init_lock_with_hint:
2924   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2925   if (!codeptr)
2926     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2927   if (ompt_enabled.enabled) {
2928     if (release_status == KMP_LOCK_RELEASED) {
2929       if (ompt_enabled.ompt_callback_mutex_released) {
2930         // release_lock_last
2931         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2932             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2933       }
2934     } else if (ompt_enabled.ompt_callback_nest_lock) {
2935       // release_lock_previous
2936       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2937           ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr);
2938     }
2939   }
2940 #endif
2941
2942 #endif // KMP_USE_DYNAMIC_LOCK
2943 }
2944
2945 /* try to acquire the lock */
2946 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2947   KMP_COUNT_BLOCK(OMP_test_lock);
2948
2949 #if KMP_USE_DYNAMIC_LOCK
2950   int rc;
2951   int tag = KMP_EXTRACT_D_TAG(user_lock);
2952 #if USE_ITT_BUILD
2953   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2954 #endif
2955 #if OMPT_SUPPORT && OMPT_OPTIONAL
2956   // This is the case, if called from omp_init_lock_with_hint:
2957   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2958   if (!codeptr)
2959     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2960   if (ompt_enabled.ompt_callback_mutex_acquire) {
2961     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2962         ompt_mutex_lock, omp_lock_hint_none,
2963         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2964         codeptr);
2965   }
2966 #endif
2967 #if KMP_USE_INLINED_TAS
2968   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2969     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2970   } else
2971 #elif KMP_USE_INLINED_FUTEX
2972   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2973     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2974   } else
2975 #endif
2976   {
2977     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2978   }
2979   if (rc) {
2980 #if USE_ITT_BUILD
2981     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2982 #endif
2983 #if OMPT_SUPPORT && OMPT_OPTIONAL
2984     if (ompt_enabled.ompt_callback_mutex_acquired) {
2985       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2986           ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2987     }
2988 #endif
2989     return FTN_TRUE;
2990   } else {
2991 #if USE_ITT_BUILD
2992     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2993 #endif
2994     return FTN_FALSE;
2995   }
2996
2997 #else // KMP_USE_DYNAMIC_LOCK
2998
2999   kmp_user_lock_p lck;
3000   int rc;
3001
3002   if ((__kmp_user_lock_kind == lk_tas) &&
3003       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3004     lck = (kmp_user_lock_p)user_lock;
3005   }
3006 #if KMP_USE_FUTEX
3007   else if ((__kmp_user_lock_kind == lk_futex) &&
3008            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3009     lck = (kmp_user_lock_p)user_lock;
3010   }
3011 #endif
3012   else {
3013     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3014   }
3015
3016 #if USE_ITT_BUILD
3017   __kmp_itt_lock_acquiring(lck);
3018 #endif /* USE_ITT_BUILD */
3019 #if OMPT_SUPPORT && OMPT_OPTIONAL
3020   // This is the case, if called from omp_init_lock_with_hint:
3021   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3022   if (!codeptr)
3023     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3024   if (ompt_enabled.ompt_callback_mutex_acquire) {
3025     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3026         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3027         (ompt_wait_id_t)lck, codeptr);
3028   }
3029 #endif
3030
3031   rc = TEST_LOCK(lck, gtid);
3032 #if USE_ITT_BUILD
3033   if (rc) {
3034     __kmp_itt_lock_acquired(lck);
3035   } else {
3036     __kmp_itt_lock_cancelled(lck);
3037   }
3038 #endif /* USE_ITT_BUILD */
3039 #if OMPT_SUPPORT && OMPT_OPTIONAL
3040   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3041     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3042         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
3043   }
3044 #endif
3045
3046   return (rc ? FTN_TRUE : FTN_FALSE);
3047
3048 /* Can't use serial interval since not block structured */
3049
3050 #endif // KMP_USE_DYNAMIC_LOCK
3051 }
3052
3053 /* try to acquire the lock */
3054 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3055 #if KMP_USE_DYNAMIC_LOCK
3056   int rc;
3057 #if USE_ITT_BUILD
3058   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3059 #endif
3060 #if OMPT_SUPPORT && OMPT_OPTIONAL
3061   // This is the case, if called from omp_init_lock_with_hint:
3062   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3063   if (!codeptr)
3064     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3065   if (ompt_enabled.ompt_callback_mutex_acquire) {
3066     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3067         ompt_mutex_nest_lock, omp_lock_hint_none,
3068         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
3069         codeptr);
3070   }
3071 #endif
3072   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3073 #if USE_ITT_BUILD
3074   if (rc) {
3075     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3076   } else {
3077     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3078   }
3079 #endif
3080 #if OMPT_SUPPORT && OMPT_OPTIONAL
3081   if (ompt_enabled.enabled && rc) {
3082     if (rc == 1) {
3083       if (ompt_enabled.ompt_callback_mutex_acquired) {
3084         // lock_first
3085         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3086             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3087       }
3088     } else {
3089       if (ompt_enabled.ompt_callback_nest_lock) {
3090         // lock_next
3091         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3092             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3093       }
3094     }
3095   }
3096 #endif
3097   return rc;
3098
3099 #else // KMP_USE_DYNAMIC_LOCK
3100
3101   kmp_user_lock_p lck;
3102   int rc;
3103
3104   if ((__kmp_user_lock_kind == lk_tas) &&
3105       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3106        OMP_NEST_LOCK_T_SIZE)) {
3107     lck = (kmp_user_lock_p)user_lock;
3108   }
3109 #if KMP_USE_FUTEX
3110   else if ((__kmp_user_lock_kind == lk_futex) &&
3111            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3112             OMP_NEST_LOCK_T_SIZE)) {
3113     lck = (kmp_user_lock_p)user_lock;
3114   }
3115 #endif
3116   else {
3117     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3118   }
3119
3120 #if USE_ITT_BUILD
3121   __kmp_itt_lock_acquiring(lck);
3122 #endif /* USE_ITT_BUILD */
3123
3124 #if OMPT_SUPPORT && OMPT_OPTIONAL
3125   // This is the case, if called from omp_init_lock_with_hint:
3126   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3127   if (!codeptr)
3128     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3129   if (ompt_enabled.enabled) &&
3130         ompt_enabled.ompt_callback_mutex_acquire) {
3131       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3132           ompt_mutex_nest_lock, omp_lock_hint_none,
3133           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
3134     }
3135 #endif
3136
3137   rc = TEST_NESTED_LOCK(lck, gtid);
3138 #if USE_ITT_BUILD
3139   if (rc) {
3140     __kmp_itt_lock_acquired(lck);
3141   } else {
3142     __kmp_itt_lock_cancelled(lck);
3143   }
3144 #endif /* USE_ITT_BUILD */
3145 #if OMPT_SUPPORT && OMPT_OPTIONAL
3146   if (ompt_enabled.enabled && rc) {
3147     if (rc == 1) {
3148       if (ompt_enabled.ompt_callback_mutex_acquired) {
3149         // lock_first
3150         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3151             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3152       }
3153     } else {
3154       if (ompt_enabled.ompt_callback_nest_lock) {
3155         // lock_next
3156         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3157             ompt_mutex_scope_begin, (ompt_wait_id_t)lck, codeptr);
3158       }
3159     }
3160   }
3161 #endif
3162   return rc;
3163
3164 /* Can't use serial interval since not block structured */
3165
3166 #endif // KMP_USE_DYNAMIC_LOCK
3167 }
3168
3169 // Interface to fast scalable reduce methods routines
3170
// The selected reduction method is kept in a per-thread structure
// (th_local.packed_reduction_method) so that it can be used across functions:
// it is set here and read back in the __kmpc_end_reduce* functions.
// An alternative would be to re-determine the method a second time inside the
// __kmpc_end_reduce* functions (that would require a new prototype).
// AT: which solution is better?

// Store rmethod as the packed reduction method of the thread identified by
// gtid.
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

// Read the packed reduction method stored for the thread identified by gtid.
#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3181
3182 // description of the packed_reduction_method variable: look at the macros in
3183 // kmp.h
3184
3185 // used in a critical section reduce block
3186 static __forceinline void
3187 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3188                                           kmp_critical_name *crit) {
3189
3190   // this lock was visible to a customer and to the threading profile tool as a
3191   // serial overhead span (although it's used for an internal purpose only)
3192   //            why was it visible in previous implementation?
3193   //            should we keep it visible in new reduce block?
3194   kmp_user_lock_p lck;
3195
3196 #if KMP_USE_DYNAMIC_LOCK
3197
3198   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3199   // Check if it is initialized.
3200   if (*lk == 0) {
3201     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3202       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3203                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3204     } else {
3205       __kmp_init_indirect_csptr(crit, loc, global_tid,
3206                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3207     }
3208   }
3209   // Branch for accessing the actual lock object and set operation. This
3210   // branching is inevitable since this lock initialization does not follow the
3211   // normal dispatch path (lock table is not used).
3212   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3213     lck = (kmp_user_lock_p)lk;
3214     KMP_DEBUG_ASSERT(lck != NULL);
3215     if (__kmp_env_consistency_check) {
3216       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3217     }
3218     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3219   } else {
3220     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3221     lck = ilk->lock;
3222     KMP_DEBUG_ASSERT(lck != NULL);
3223     if (__kmp_env_consistency_check) {
3224       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3225     }
3226     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3227   }
3228
3229 #else // KMP_USE_DYNAMIC_LOCK
3230
3231   // We know that the fast reduction code is only emitted by Intel compilers
3232   // with 32 byte critical sections. If there isn't enough space, then we
3233   // have to use a pointer.
3234   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3235     lck = (kmp_user_lock_p)crit;
3236   } else {
3237     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3238   }
3239   KMP_DEBUG_ASSERT(lck != NULL);
3240
3241   if (__kmp_env_consistency_check)
3242     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3243
3244   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3245
3246 #endif // KMP_USE_DYNAMIC_LOCK
3247 }
3248
3249 // used in a critical section reduce block
3250 static __forceinline void
3251 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3252                                         kmp_critical_name *crit) {
3253
3254   kmp_user_lock_p lck;
3255
3256 #if KMP_USE_DYNAMIC_LOCK
3257
3258   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3259     lck = (kmp_user_lock_p)crit;
3260     if (__kmp_env_consistency_check)
3261       __kmp_pop_sync(global_tid, ct_critical, loc);
3262     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3263   } else {
3264     kmp_indirect_lock_t *ilk =
3265         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3266     if (__kmp_env_consistency_check)
3267       __kmp_pop_sync(global_tid, ct_critical, loc);
3268     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3269   }
3270
3271 #else // KMP_USE_DYNAMIC_LOCK
3272
3273   // We know that the fast reduction code is only emitted by Intel compilers
3274   // with 32 byte critical sections. If there isn't enough space, then we have
3275   // to use a pointer.
3276   if (__kmp_base_user_lock_size > 32) {
3277     lck = *((kmp_user_lock_p *)crit);
3278     KMP_ASSERT(lck != NULL);
3279   } else {
3280     lck = (kmp_user_lock_p)crit;
3281   }
3282
3283   if (__kmp_env_consistency_check)
3284     __kmp_pop_sync(global_tid, ct_critical, loc);
3285
3286   __kmp_release_user_lock_with_checks(lck, global_tid);
3287
3288 #endif // KMP_USE_DYNAMIC_LOCK
3289 } // __kmp_end_critical_section_reduce_block
3290
3291 #if OMP_40_ENABLED
3292 static __forceinline int
3293 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3294                                      int *task_state) {
3295   kmp_team_t *team;
3296
3297   // Check if we are inside the teams construct?
3298   if (th->th.th_teams_microtask) {
3299     *team_p = team = th->th.th_team;
3300     if (team->t.t_level == th->th.th_teams_level) {
3301       // This is reduction at teams construct.
3302       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3303       // Let's swap teams temporarily for the reduction.
3304       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3305       th->th.th_team = team->t.t_parent;
3306       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3307       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3308       *task_state = th->th.th_task_state;
3309       th->th.th_task_state = 0;
3310
3311       return 1;
3312     }
3313   }
3314   return 0;
3315 }
3316
3317 static __forceinline void
3318 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3319   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3320   th->th.th_info.ds.ds_tid = 0;
3321   th->th.th_team = team;
3322   th->th.th_team_nproc = team->t.t_nproc;
3323   th->th.th_task_team = team->t.t_task_team[task_state];
3324   th->th.th_task_state = task_state;
3325 }
3326 #endif
3327
3328 /* 2.a.i. Reduce Block without a terminating barrier */
3329 /*!
3330 @ingroup SYNCHRONIZATION
3331 @param loc source location information
3332 @param global_tid global thread number
3333 @param num_vars number of items (variables) to be reduced
3334 @param reduce_size size of data in bytes to be reduced
3335 @param reduce_data pointer to data to be reduced
3336 @param reduce_func callback function providing reduction operation on two
3337 operands and returning result of reduction in lhs_data
3338 @param lck pointer to the unique lock data structure
3339 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3340 threads if atomic reduction needed
3341
3342 The nowait version is used for a reduce clause with the nowait argument.
3343 */
3344 kmp_int32
3345 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3346                      size_t reduce_size, void *reduce_data,
3347                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3348                      kmp_critical_name *lck) {
3349
3350   KMP_COUNT_BLOCK(REDUCE_nowait);
3351   int retval = 0;
3352   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3353 #if OMP_40_ENABLED
3354   kmp_info_t *th;
3355   kmp_team_t *team;
3356   int teams_swapped = 0, task_state;
3357 #endif
3358   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3359
3360   // why do we need this initialization here at all?
3361   // Reduction clause can not be used as a stand-alone directive.
3362
3363   // do not call __kmp_serial_initialize(), it will be called by
3364   // __kmp_parallel_initialize() if needed
3365   // possible detection of false-positive race by the threadchecker ???
3366   if (!TCR_4(__kmp_init_parallel))
3367     __kmp_parallel_initialize();
3368
3369 // check correctness of reduce block nesting
3370 #if KMP_USE_DYNAMIC_LOCK
3371   if (__kmp_env_consistency_check)
3372     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3373 #else
3374   if (__kmp_env_consistency_check)
3375     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3376 #endif
3377
3378 #if OMP_40_ENABLED
3379   th = __kmp_thread_from_gtid(global_tid);
3380   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3381 #endif // OMP_40_ENABLED
3382
3383   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3384   // the value should be kept in a variable
3385   // the variable should be either a construct-specific or thread-specific
3386   // property, not a team specific property
3387   //     (a thread can reach the next reduce block on the next construct, reduce
3388   //     method may differ on the next construct)
3389   // an ident_t "loc" parameter could be used as a construct-specific property
3390   // (what if loc == 0?)
3391   //     (if both construct-specific and team-specific variables were shared,
3392   //     then unness extra syncs should be needed)
3393   // a thread-specific variable is better regarding two issues above (next
3394   // construct and extra syncs)
3395   // a thread-specific "th_local.reduction_method" variable is used currently
3396   // each thread executes 'determine' and 'set' lines (no need to execute by one
3397   // thread, to avoid unness extra syncs)
3398
3399   packed_reduction_method = __kmp_determine_reduction_method(
3400       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3401   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3402
3403   if (packed_reduction_method == critical_reduce_block) {
3404
3405     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3406     retval = 1;
3407
3408   } else if (packed_reduction_method == empty_reduce_block) {
3409
3410     // usage: if team size == 1, no synchronization is required ( Intel
3411     // platforms only )
3412     retval = 1;
3413
3414   } else if (packed_reduction_method == atomic_reduce_block) {
3415
3416     retval = 2;
3417
3418     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3419     // won't be called by the code gen)
3420     //     (it's not quite good, because the checking block has been closed by
3421     //     this 'pop',
3422     //      but atomic operation has not been executed yet, will be executed
3423     //      slightly later, literally on next instruction)
3424     if (__kmp_env_consistency_check)
3425       __kmp_pop_sync(global_tid, ct_reduce, loc);
3426
3427   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3428                                    tree_reduce_block)) {
3429
3430 // AT: performance issue: a real barrier here
3431 // AT:     (if master goes slow, other threads are blocked here waiting for the
3432 // master to come and release them)
3433 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3434 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3435 // be confusing to a customer)
3436 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3437 // might go faster and be more in line with sense of NOWAIT
3438 // AT: TO DO: do epcc test and compare times
3439
3440 // this barrier should be invisible to a customer and to the threading profile
3441 // tool (it's neither a terminating barrier nor customer's code, it's
3442 // used for an internal purpose)
3443 #if OMPT_SUPPORT
3444     // JP: can this barrier potentially leed to task scheduling?
3445     // JP: as long as there is a barrier in the implementation, OMPT should and
3446     // will provide the barrier events
3447     //         so we set-up the necessary frame/return addresses.
3448     ompt_frame_t *ompt_frame;
3449     if (ompt_enabled.enabled) {
3450       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3451       if (ompt_frame->enter_frame.ptr == NULL)
3452         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3453       OMPT_STORE_RETURN_ADDRESS(global_tid);
3454     }
3455 #endif
3456 #if USE_ITT_NOTIFY
3457     __kmp_threads[global_tid]->th.th_ident = loc;
3458 #endif
3459     retval =
3460         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3461                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3462     retval = (retval != 0) ? (0) : (1);
3463 #if OMPT_SUPPORT && OMPT_OPTIONAL
3464     if (ompt_enabled.enabled) {
3465       ompt_frame->enter_frame = ompt_data_none;
3466     }
3467 #endif
3468
3469     // all other workers except master should do this pop here
3470     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3471     if (__kmp_env_consistency_check) {
3472       if (retval == 0) {
3473         __kmp_pop_sync(global_tid, ct_reduce, loc);
3474       }
3475     }
3476
3477   } else {
3478
3479     // should never reach this block
3480     KMP_ASSERT(0); // "unexpected method"
3481   }
3482 #if OMP_40_ENABLED
3483   if (teams_swapped) {
3484     __kmp_restore_swapped_teams(th, team, task_state);
3485   }
3486 #endif
3487   KA_TRACE(
3488       10,
3489       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3490        global_tid, packed_reduction_method, retval));
3491
3492   return retval;
3493 }
3494
3495 /*!
3496 @ingroup SYNCHRONIZATION
3497 @param loc source location information
3498 @param global_tid global thread id.
3499 @param lck pointer to the unique lock data structure
3500
3501 Finish the execution of a reduce nowait.
3502 */
3503 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3504                               kmp_critical_name *lck) {
3505
3506   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3507
3508   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3509
3510   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3511
3512   if (packed_reduction_method == critical_reduce_block) {
3513
3514     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3515
3516   } else if (packed_reduction_method == empty_reduce_block) {
3517
3518     // usage: if team size == 1, no synchronization is required ( on Intel
3519     // platforms only )
3520
3521   } else if (packed_reduction_method == atomic_reduce_block) {
3522
3523     // neither master nor other workers should get here
3524     //     (code gen does not generate this call in case 2: atomic reduce block)
3525     // actually it's better to remove this elseif at all;
3526     // after removal this value will checked by the 'else' and will assert
3527
3528   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3529                                    tree_reduce_block)) {
3530
3531     // only master gets here
3532
3533   } else {
3534
3535     // should never reach this block
3536     KMP_ASSERT(0); // "unexpected method"
3537   }
3538
3539   if (__kmp_env_consistency_check)
3540     __kmp_pop_sync(global_tid, ct_reduce, loc);
3541
3542   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3543                 global_tid, packed_reduction_method));
3544
3545   return;
3546 }
3547
3548 /* 2.a.ii. Reduce Block with a terminating barrier */
3549
3550 /*!
3551 @ingroup SYNCHRONIZATION
3552 @param loc source location information
3553 @param global_tid global thread number
3554 @param num_vars number of items (variables) to be reduced
3555 @param reduce_size size of data in bytes to be reduced
3556 @param reduce_data pointer to data to be reduced
3557 @param reduce_func callback function providing reduction operation on two
3558 operands and returning result of reduction in lhs_data
3559 @param lck pointer to the unique lock data structure
3560 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3561 threads if atomic reduction needed
3562
3563 A blocking reduce that includes an implicit barrier.
3564 */
3565 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3566                         size_t reduce_size, void *reduce_data,
3567                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3568                         kmp_critical_name *lck) {
3569   KMP_COUNT_BLOCK(REDUCE_wait);
3570   int retval = 0;
3571   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3572 #if OMP_40_ENABLED
3573   kmp_info_t *th;
3574   kmp_team_t *team;
3575   int teams_swapped = 0, task_state;
3576 #endif
3577
3578   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3579
3580   // why do we need this initialization here at all?
3581   // Reduction clause can not be a stand-alone directive.
3582
3583   // do not call __kmp_serial_initialize(), it will be called by
3584   // __kmp_parallel_initialize() if needed
3585   // possible detection of false-positive race by the threadchecker ???
3586   if (!TCR_4(__kmp_init_parallel))
3587     __kmp_parallel_initialize();
3588
3589 // check correctness of reduce block nesting
3590 #if KMP_USE_DYNAMIC_LOCK
3591   if (__kmp_env_consistency_check)
3592     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3593 #else
3594   if (__kmp_env_consistency_check)
3595     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3596 #endif
3597
3598 #if OMP_40_ENABLED
3599   th = __kmp_thread_from_gtid(global_tid);
3600   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3601 #endif // OMP_40_ENABLED
3602
3603   packed_reduction_method = __kmp_determine_reduction_method(
3604       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3605   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3606
3607   if (packed_reduction_method == critical_reduce_block) {
3608
3609     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3610     retval = 1;
3611
3612   } else if (packed_reduction_method == empty_reduce_block) {
3613
3614     // usage: if team size == 1, no synchronization is required ( Intel
3615     // platforms only )
3616     retval = 1;
3617
3618   } else if (packed_reduction_method == atomic_reduce_block) {
3619
3620     retval = 2;
3621
3622   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3623                                    tree_reduce_block)) {
3624
3625 // case tree_reduce_block:
3626 // this barrier should be visible to a customer and to the threading profile
3627 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3628 #if OMPT_SUPPORT
3629     ompt_frame_t *ompt_frame;
3630     if (ompt_enabled.enabled) {
3631       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3632       if (ompt_frame->enter_frame.ptr == NULL)
3633         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3634       OMPT_STORE_RETURN_ADDRESS(global_tid);
3635     }
3636 #endif
3637 #if USE_ITT_NOTIFY
3638     __kmp_threads[global_tid]->th.th_ident =
3639         loc; // needed for correct notification of frames
3640 #endif
3641     retval =
3642         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3643                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3644     retval = (retval != 0) ? (0) : (1);
3645 #if OMPT_SUPPORT && OMPT_OPTIONAL
3646     if (ompt_enabled.enabled) {
3647       ompt_frame->enter_frame = ompt_data_none;
3648     }
3649 #endif
3650
3651     // all other workers except master should do this pop here
3652     // ( none of other workers except master will enter __kmpc_end_reduce() )
3653     if (__kmp_env_consistency_check) {
3654       if (retval == 0) { // 0: all other workers; 1: master
3655         __kmp_pop_sync(global_tid, ct_reduce, loc);
3656       }
3657     }
3658
3659   } else {
3660
3661     // should never reach this block
3662     KMP_ASSERT(0); // "unexpected method"
3663   }
3664 #if OMP_40_ENABLED
3665   if (teams_swapped) {
3666     __kmp_restore_swapped_teams(th, team, task_state);
3667   }
3668 #endif
3669
3670   KA_TRACE(10,
3671            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3672             global_tid, packed_reduction_method, retval));
3673
3674   return retval;
3675 }
3676
3677 /*!
3678 @ingroup SYNCHRONIZATION
3679 @param loc source location information
3680 @param global_tid global thread id.
3681 @param lck pointer to the unique lock data structure
3682
3683 Finish the execution of a blocking reduce.
3684 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3685 start function.
3686 */
3687 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3688                        kmp_critical_name *lck) {
3689
3690   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3691 #if OMP_40_ENABLED
3692   kmp_info_t *th;
3693   kmp_team_t *team;
3694   int teams_swapped = 0, task_state;
3695 #endif
3696
3697   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3698
3699 #if OMP_40_ENABLED
3700   th = __kmp_thread_from_gtid(global_tid);
3701   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3702 #endif // OMP_40_ENABLED
3703
3704   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3705
3706   // this barrier should be visible to a customer and to the threading profile
3707   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3708
3709   if (packed_reduction_method == critical_reduce_block) {
3710
3711     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3712
3713 // TODO: implicit barrier: should be exposed
3714 #if OMPT_SUPPORT
3715     ompt_frame_t *ompt_frame;
3716     if (ompt_enabled.enabled) {
3717       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3718       if (ompt_frame->enter_frame.ptr == NULL)
3719         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3720       OMPT_STORE_RETURN_ADDRESS(global_tid);
3721     }
3722 #endif
3723 #if USE_ITT_NOTIFY
3724     __kmp_threads[global_tid]->th.th_ident = loc;
3725 #endif
3726     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3727 #if OMPT_SUPPORT && OMPT_OPTIONAL
3728     if (ompt_enabled.enabled) {
3729       ompt_frame->enter_frame = ompt_data_none;
3730     }
3731 #endif
3732
3733   } else if (packed_reduction_method == empty_reduce_block) {
3734
3735 // usage: if team size==1, no synchronization is required (Intel platforms only)
3736
3737 // TODO: implicit barrier: should be exposed
3738 #if OMPT_SUPPORT
3739     ompt_frame_t *ompt_frame;
3740     if (ompt_enabled.enabled) {
3741       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3742       if (ompt_frame->enter_frame.ptr == NULL)
3743         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3744       OMPT_STORE_RETURN_ADDRESS(global_tid);
3745     }
3746 #endif
3747 #if USE_ITT_NOTIFY
3748     __kmp_threads[global_tid]->th.th_ident = loc;
3749 #endif
3750     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3751 #if OMPT_SUPPORT && OMPT_OPTIONAL
3752     if (ompt_enabled.enabled) {
3753       ompt_frame->enter_frame = ompt_data_none;
3754     }
3755 #endif
3756
3757   } else if (packed_reduction_method == atomic_reduce_block) {
3758
3759 #if OMPT_SUPPORT
3760     ompt_frame_t *ompt_frame;
3761     if (ompt_enabled.enabled) {
3762       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3763       if (ompt_frame->enter_frame.ptr == NULL)
3764         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3765       OMPT_STORE_RETURN_ADDRESS(global_tid);
3766     }
3767 #endif
3768 // TODO: implicit barrier: should be exposed
3769 #if USE_ITT_NOTIFY
3770     __kmp_threads[global_tid]->th.th_ident = loc;
3771 #endif
3772     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3773 #if OMPT_SUPPORT && OMPT_OPTIONAL
3774     if (ompt_enabled.enabled) {
3775       ompt_frame->enter_frame = ompt_data_none;
3776     }
3777 #endif
3778
3779   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3780                                    tree_reduce_block)) {
3781
3782     // only master executes here (master releases all other workers)
3783     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3784                             global_tid);
3785
3786   } else {
3787
3788     // should never reach this block
3789     KMP_ASSERT(0); // "unexpected method"
3790   }
3791 #if OMP_40_ENABLED
3792   if (teams_swapped) {
3793     __kmp_restore_swapped_teams(th, team, task_state);
3794   }
3795 #endif
3796
3797   if (__kmp_env_consistency_check)
3798     __kmp_pop_sync(global_tid, ct_reduce, loc);
3799
3800   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3801                 global_tid, packed_reduction_method));
3802
3803   return;
3804 }
3805
3806 #undef __KMP_GET_REDUCTION_METHOD
3807 #undef __KMP_SET_REDUCTION_METHOD
3808
3809 /* end of interface to fast scalable reduce routines */
3810
3811 kmp_uint64 __kmpc_get_taskid() {
3812
3813   kmp_int32 gtid;
3814   kmp_info_t *thread;
3815
3816   gtid = __kmp_get_gtid();
3817   if (gtid < 0) {
3818     return 0;
3819   }
3820   thread = __kmp_thread_from_gtid(gtid);
3821   return thread->th.th_current_task->td_task_id;
3822
3823 } // __kmpc_get_taskid
3824
3825 kmp_uint64 __kmpc_get_parent_taskid() {
3826
3827   kmp_int32 gtid;
3828   kmp_info_t *thread;
3829   kmp_taskdata_t *parent_task;
3830
3831   gtid = __kmp_get_gtid();
3832   if (gtid < 0) {
3833     return 0;
3834   }
3835   thread = __kmp_thread_from_gtid(gtid);
3836   parent_task = thread->th.th_current_task->td_parent;
3837   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3838
3839 } // __kmpc_get_parent_taskid
3840
3841 #if OMP_45_ENABLED
3842 /*!
3843 @ingroup WORK_SHARING
3844 @param loc  source location information.
3845 @param gtid  global thread number.
3846 @param num_dims  number of associated doacross loops.
3847 @param dims  info on loops bounds.
3848
3849 Initialize doacross loop information.
3850 Expect compiler send us inclusive bounds,
3851 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3852 */
3853 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3854                           const struct kmp_dim *dims) {
3855   int j, idx;
3856   kmp_int64 last, trace_count;
3857   kmp_info_t *th = __kmp_threads[gtid];
3858   kmp_team_t *team = th->th.th_team;
3859   kmp_uint32 *flags;
3860   kmp_disp_t *pr_buf = th->th.th_dispatch;
3861   dispatch_shared_info_t *sh_buf;
3862
3863   KA_TRACE(
3864       20,
3865       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3866        gtid, num_dims, !team->t.t_serialized));
3867   KMP_DEBUG_ASSERT(dims != NULL);
3868   KMP_DEBUG_ASSERT(num_dims > 0);
3869
3870   if (team->t.t_serialized) {
3871     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3872     return; // no dependencies if team is serialized
3873   }
3874   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3875   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3876   // the next loop
3877   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3878
3879   // Save bounds info into allocated private buffer
3880   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3881   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3882       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3883   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3884   pr_buf->th_doacross_info[0] =
3885       (kmp_int64)num_dims; // first element is number of dimensions
3886   // Save also address of num_done in order to access it later without knowing
3887   // the buffer index
3888   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3889   pr_buf->th_doacross_info[2] = dims[0].lo;
3890   pr_buf->th_doacross_info[3] = dims[0].up;
3891   pr_buf->th_doacross_info[4] = dims[0].st;
3892   last = 5;
3893   for (j = 1; j < num_dims; ++j) {
3894     kmp_int64
3895         range_length; // To keep ranges of all dimensions but the first dims[0]
3896     if (dims[j].st == 1) { // most common case
3897       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3898       range_length = dims[j].up - dims[j].lo + 1;
3899     } else {
3900       if (dims[j].st > 0) {
3901         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3902         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3903       } else { // negative increment
3904         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3905         range_length =
3906             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3907       }
3908     }
3909     pr_buf->th_doacross_info[last++] = range_length;
3910     pr_buf->th_doacross_info[last++] = dims[j].lo;
3911     pr_buf->th_doacross_info[last++] = dims[j].up;
3912     pr_buf->th_doacross_info[last++] = dims[j].st;
3913   }
3914
3915   // Compute total trip count.
3916   // Start with range of dims[0] which we don't need to keep in the buffer.
3917   if (dims[0].st == 1) { // most common case
3918     trace_count = dims[0].up - dims[0].lo + 1;
3919   } else if (dims[0].st > 0) {
3920     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3921     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3922   } else { // negative increment
3923     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3924     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3925   }
3926   for (j = 1; j < num_dims; ++j) {
3927     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3928   }
3929   KMP_DEBUG_ASSERT(trace_count > 0);
3930
3931   // Check if shared buffer is not occupied by other loop (idx -
3932   // __kmp_dispatch_num_buffers)
3933   if (idx != sh_buf->doacross_buf_idx) {
3934     // Shared buffer is occupied, wait for it to be free
3935     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3936                        __kmp_eq_4, NULL);
3937   }
3938 #if KMP_32_BIT_ARCH
3939   // Check if we are the first thread. After the CAS the first thread gets 0,
3940   // others get 1 if initialization is in progress, allocated pointer otherwise.
3941   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3942   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3943       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3944 #else
3945   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3946       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3947 #endif
3948   if (flags == NULL) {
3949     // we are the first thread, allocate the array of flags
3950     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3951     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3952     KMP_MB();
3953     sh_buf->doacross_flags = flags;
3954   } else if (flags == (kmp_uint32 *)1) {
3955 #if KMP_32_BIT_ARCH
3956     // initialization is still in progress, need to wait
3957     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3958 #else
3959     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3960 #endif
3961       KMP_YIELD(TRUE);
3962     KMP_MB();
3963   } else {
3964     KMP_MB();
3965   }
3966   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3967   pr_buf->th_doacross_flags =
3968       sh_buf->doacross_flags; // save private copy in order to not
3969   // touch shared buffer on each iteration
3970   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3971 }
3972
3973 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3974   kmp_int32 shft, num_dims, i;
3975   kmp_uint32 flag;
3976   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3977   kmp_info_t *th = __kmp_threads[gtid];
3978   kmp_team_t *team = th->th.th_team;
3979   kmp_disp_t *pr_buf;
3980   kmp_int64 lo, up, st;
3981
3982   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3983   if (team->t.t_serialized) {
3984     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3985     return; // no dependencies if team is serialized
3986   }
3987
3988   // calculate sequential iteration number and check out-of-bounds condition
3989   pr_buf = th->th.th_dispatch;
3990   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3991   num_dims = pr_buf->th_doacross_info[0];
3992   lo = pr_buf->th_doacross_info[2];
3993   up = pr_buf->th_doacross_info[3];
3994   st = pr_buf->th_doacross_info[4];
3995   if (st == 1) { // most common case
3996     if (vec[0] < lo || vec[0] > up) {
3997       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3998                     "bounds [%lld,%lld]\n",
3999                     gtid, vec[0], lo, up));
4000       return;
4001     }
4002     iter_number = vec[0] - lo;
4003   } else if (st > 0) {
4004     if (vec[0] < lo || vec[0] > up) {
4005       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4006                     "bounds [%lld,%lld]\n",
4007                     gtid, vec[0], lo, up));
4008       return;
4009     }
4010     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4011   } else { // negative increment
4012     if (vec[0] > lo || vec[0] < up) {
4013       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4014                     "bounds [%lld,%lld]\n",
4015                     gtid, vec[0], lo, up));
4016       return;
4017     }
4018     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4019   }
4020   for (i = 1; i < num_dims; ++i) {
4021     kmp_int64 iter, ln;
4022     kmp_int32 j = i * 4;
4023     ln = pr_buf->th_doacross_info[j + 1];
4024     lo = pr_buf->th_doacross_info[j + 2];
4025     up = pr_buf->th_doacross_info[j + 3];
4026     st = pr_buf->th_doacross_info[j + 4];
4027     if (st == 1) {
4028       if (vec[i] < lo || vec[i] > up) {
4029         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4030                       "bounds [%lld,%lld]\n",
4031                       gtid, vec[i], lo, up));
4032         return;
4033       }
4034       iter = vec[i] - lo;
4035     } else if (st > 0) {
4036       if (vec[i] < lo || vec[i] > up) {
4037         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4038                       "bounds [%lld,%lld]\n",
4039                       gtid, vec[i], lo, up));
4040         return;
4041       }
4042       iter = (kmp_uint64)(vec[i] - lo) / st;
4043     } else { // st < 0
4044       if (vec[i] > lo || vec[i] < up) {
4045         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4046                       "bounds [%lld,%lld]\n",
4047                       gtid, vec[i], lo, up));
4048         return;
4049       }
4050       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4051     }
4052     iter_number = iter + ln * iter_number;
4053   }
4054   shft = iter_number % 32; // use 32-bit granularity
4055   iter_number >>= 5; // divided by 32
4056   flag = 1 << shft;
4057   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4058     KMP_YIELD(TRUE);
4059   }
4060   KMP_MB();
4061   KA_TRACE(20,
4062            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4063             gtid, (iter_number << 5) + shft));
4064 }
4065
4066 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4067   kmp_int32 shft, num_dims, i;
4068   kmp_uint32 flag;
4069   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4070   kmp_info_t *th = __kmp_threads[gtid];
4071   kmp_team_t *team = th->th.th_team;
4072   kmp_disp_t *pr_buf;
4073   kmp_int64 lo, st;
4074
4075   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4076   if (team->t.t_serialized) {
4077     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4078     return; // no dependencies if team is serialized
4079   }
4080
4081   // calculate sequential iteration number (same as in "wait" but no
4082   // out-of-bounds checks)
4083   pr_buf = th->th.th_dispatch;
4084   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4085   num_dims = pr_buf->th_doacross_info[0];
4086   lo = pr_buf->th_doacross_info[2];
4087   st = pr_buf->th_doacross_info[4];
4088   if (st == 1) { // most common case
4089     iter_number = vec[0] - lo;
4090   } else if (st > 0) {
4091     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4092   } else { // negative increment
4093     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4094   }
4095   for (i = 1; i < num_dims; ++i) {
4096     kmp_int64 iter, ln;
4097     kmp_int32 j = i * 4;
4098     ln = pr_buf->th_doacross_info[j + 1];
4099     lo = pr_buf->th_doacross_info[j + 2];
4100     st = pr_buf->th_doacross_info[j + 4];
4101     if (st == 1) {
4102       iter = vec[i] - lo;
4103     } else if (st > 0) {
4104       iter = (kmp_uint64)(vec[i] - lo) / st;
4105     } else { // st < 0
4106       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4107     }
4108     iter_number = iter + ln * iter_number;
4109   }
4110   shft = iter_number % 32; // use 32-bit granularity
4111   iter_number >>= 5; // divided by 32
4112   flag = 1 << shft;
4113   KMP_MB();
4114   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4115     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4116   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4117                 (iter_number << 5) + shft));
4118 }
4119
4120 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4121   kmp_int32 num_done;
4122   kmp_info_t *th = __kmp_threads[gtid];
4123   kmp_team_t *team = th->th.th_team;
4124   kmp_disp_t *pr_buf = th->th.th_dispatch;
4125
4126   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4127   if (team->t.t_serialized) {
4128     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4129     return; // nothing to do
4130   }
4131   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4132   if (num_done == th->th.th_team_nproc) {
4133     // we are the last thread, need to free shared resources
4134     int idx = pr_buf->th_doacross_buf_idx - 1;
4135     dispatch_shared_info_t *sh_buf =
4136         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4137     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4138                      (kmp_int64)&sh_buf->doacross_num_done);
4139     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4140     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4141     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4142     sh_buf->doacross_flags = NULL;
4143     sh_buf->doacross_num_done = 0;
4144     sh_buf->doacross_buf_idx +=
4145         __kmp_dispatch_num_buffers; // free buffer for future re-use
4146   }
4147   // free private resources (need to keep buffer index forever)
4148   pr_buf->th_doacross_flags = NULL;
4149   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4150   pr_buf->th_doacross_info = NULL;
4151   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4152 }
4153 #endif
4154
4155 #if OMP_50_ENABLED
4156 int __kmpc_get_target_offload(void) {
4157   if (!__kmp_init_serial) {
4158     __kmp_serial_initialize();
4159   }
4160   return __kmp_target_offload;
4161 }
4162 #endif // OMP_50_ENABLED
4163
4164 // end of file //