2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
5 //===----------------------------------------------------------------------===//
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 //===----------------------------------------------------------------------===//
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
16 #include "kmp_error.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
23 #define MAX_MESSAGE 512
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
29 * @ingroup STARTUP_SHUTDOWN
30 * @param loc in source location information
31 * @param flags in for future use (currently ignored)
33 * Initialize the runtime library. This call is optional; if it is not made then
34 * it will be implicitly called by attempts to use other library functions.
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37 // By default __kmpc_begin() is a no-op.
39 if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40 __kmp_str_match_true(env)) {
41 __kmp_middle_initialize();
42 KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43 } else if (__kmp_ignore_mppbeg() == FALSE) {
44 // By default __kmp_ignore_mppbeg() returns TRUE.
45 __kmp_internal_begin();
46 KC_TRACE(10, ("__kmpc_begin: called\n"));
51 * @ingroup STARTUP_SHUTDOWN
52 * @param loc source location information
54 * Shut down the runtime library. This is also optional; even if called it will
55 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to zero.
58 void __kmpc_end(ident_t *loc) {
59 // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end()
60 // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
61 // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62 // returns FALSE and __kmpc_end() will unregister this root (which can cause
63 // library shutdown).
64 if (__kmp_ignore_mppend() == FALSE) {
65 KC_TRACE(10, ("__kmpc_end: called\n"));
66 KA_TRACE(30, ("__kmpc_end\n"));
68 __kmp_internal_end_thread(-1);
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71 // Normal exit process on Windows does not allow worker threads of the final
72 // parallel region to finish reporting their events, so shutting down the
73 // library here fixes the issue, at least for the cases where __kmpc_end() is
74 // called.
75 if (ompt_enabled.enabled)
76 __kmp_internal_end_library(__kmp_gtid_get_specific());
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
85 This function can be called in any context.
87 If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99 kmp_int32 gtid = __kmp_entry_gtid();
101 KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
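/* To illustrate the paragraph above (all names and the null loc are
   illustrative, assuming the declarations from kmp.h are visible; loc is only
   used for tracing in this entry point): two plain pthreads entering the
   runtime receive distinct global thread ids, while omp_get_thread_num()
   reports 0 for both outside any parallel region.

     #include <omp.h>
     #include <pthread.h>
     #include <stdio.h>

     static void *worker(void *arg) {
       kmp_int32 gtid = __kmpc_global_thread_num(NULL); // runtime-wide id
       printf("gtid=%d omp_tid=%d\n", (int)gtid, omp_get_thread_num());
       return NULL;
     }

     int main(void) {
       pthread_t t1, t2;
       pthread_create(&t1, NULL, worker, NULL);
       pthread_create(&t2, NULL, worker, NULL);
       pthread_join(t1, NULL);
       pthread_join(t2, NULL);
       return 0;
     }
*/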
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly, the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high); this call counts all such threads even if
118 they are not waiting for work.
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 return TCR_4(__kmp_all_nth);
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel construct.
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135 return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 * @ingroup DEPRECATED
151 * @param loc location description
153 * This function need not be called. It always returns TRUE.
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
166 if (__kmp_par_range == 0) {
169 semi2 = loc->psource;
173 semi2 = strchr(semi2, ';');
177 semi2 = strchr(semi2 + 1, ';');
181 if (__kmp_par_range_filename[0]) {
182 const char *name = semi2 - 1;
183 while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
186 if ((*name == '/') || (*name == ';')) {
189 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190 return __kmp_par_range < 0;
193 semi3 = strchr(semi2 + 1, ';');
194 if (__kmp_par_range_routine[0]) {
195 if ((semi3 != NULL) && (semi3 > semi2) &&
196 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197 return __kmp_par_range < 0;
200 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202 return __kmp_par_range > 0;
204 return __kmp_par_range < 0;
208 #endif /* KMP_DEBUG */
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if not.
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218 return __kmp_entry_thread()->th.th_root->r.r_active;
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231 kmp_int32 num_threads) {
232 KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233 global_tid, num_threads));
235 __kmp_push_num_threads(loc, global_tid, num_threads);
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239 KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241 /* the num_threads are automatically popped */
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245 kmp_int32 proc_bind) {
246 KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
249 __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
254 @param loc source location information
255 @param argc total number of arguments in the ellipsis
256 @param microtask pointer to callback routine consisting of outlined parallel construct
258 @param ... pointers to shared variables that aren't global
260 Do the actual fork and call the microtask in the relevant number of threads.
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263 int gtid = __kmp_entry_gtid();
265 #if (KMP_STATS_ENABLED)
266 // If we were in a serial region, then stop the serial timer, record
267 // the event, and start parallel region timer
268 stats_state_e previous_state = KMP_GET_THREAD_STATE();
269 if (previous_state == stats_state_e::SERIAL_REGION) {
270 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
272 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
274 int inParallel = __kmpc_in_parallel(loc);
276 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
278 KMP_COUNT_BLOCK(OMP_PARALLEL);
282 // perhaps saving thr_state alone would be enough here
285 va_start(ap, microtask);
288 ompt_frame_t *ompt_frame;
289 if (ompt_enabled.enabled) {
290 kmp_info_t *master_th = __kmp_threads[gtid];
291 kmp_team_t *parent_team = master_th->th.th_team;
292 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
294 ompt_frame = &(lwt->ompt_task_info.frame);
296 int tid = __kmp_tid_from_gtid(gtid);
298 parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
300 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301 OMPT_STORE_RETURN_ADDRESS(gtid);
305 #if INCLUDE_SSC_MARKS
308 __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310 VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
312 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
318 #if INCLUDE_SSC_MARKS
321 __kmp_join_call(loc, gtid
331 #if KMP_STATS_ENABLED
332 if (previous_state == stats_state_e::SERIAL_REGION) {
333 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
335 KMP_POP_PARTITIONED_TIMER();
337 #endif // KMP_STATS_ENABLED
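/* A sketch of how a compiler might lower a simple parallel region onto this
   entry point (the outlined routine, its name, and user_code are
   illustrative, assuming the declarations from kmp.h):

     // Source:  int x = 0;
     //          #pragma omp parallel shared(x)
     //            body(&x);

     static void outlined(kmp_int32 *gtid, kmp_int32 *bound_tid, int *x) {
       body(x); // the region body, executed by every thread in the team
     }

     void user_code(ident_t *loc) {
       int x = 0;
       // argc == 1: one shared argument is passed through the ellipsis.
       __kmpc_fork_call(loc, 1, (kmpc_micro)outlined, &x);
     }
*/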
342 @param loc source location information
343 @param global_tid global thread number
344 @param num_teams number of teams requested for the teams construct
345 @param num_threads number of threads per team requested for the teams construct
347 Set the number of teams to be used by the teams construct.
348 This call is only required if the teams construct has a `num_teams` clause
349 or a `thread_limit` clause (or both).
351 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
352 kmp_int32 num_teams, kmp_int32 num_threads) {
354 ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
355 global_tid, num_teams, num_threads));
357 __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
362 @param loc source location information
363 @param argc total number of arguments in the ellipsis
364 @param microtask pointer to callback routine consisting of outlined teams construct
366 @param ... pointers to shared variables that aren't global
368 Do the actual fork and call the microtask in the relevant number of threads.
370 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
372 int gtid = __kmp_entry_gtid();
373 kmp_info_t *this_thr = __kmp_threads[gtid];
375 va_start(ap, microtask);
377 #if KMP_STATS_ENABLED
378 KMP_COUNT_BLOCK(OMP_TEAMS);
379 stats_state_e previous_state = KMP_GET_THREAD_STATE();
380 if (previous_state == stats_state_e::SERIAL_REGION) {
381 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
383 KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
387 // remember teams entry point and nesting level
388 this_thr->th.th_teams_microtask = microtask;
389 this_thr->th.th_teams_level =
390 this_thr->th.th_team->t.t_level; // AC: can be >0 on host
393 kmp_team_t *parent_team = this_thr->th.th_team;
394 int tid = __kmp_tid_from_gtid(gtid);
395 if (ompt_enabled.enabled) {
396 parent_team->t.t_implicit_task_taskdata[tid]
397 .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
399 OMPT_STORE_RETURN_ADDRESS(gtid);
402 // check if __kmpc_push_num_teams was called; if not, set the default number of teams
404 if (this_thr->th.th_teams_size.nteams == 0) {
405 __kmp_push_num_teams(loc, gtid, 0, 0);
407 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
408 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
409 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
411 __kmp_fork_call(loc, gtid, fork_context_intel, argc,
412 VOLATILE_CAST(microtask_t)
413 __kmp_teams_master, // "wrapped" task
414 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
415 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
421 __kmp_join_call(loc, gtid
428 // Pop current CG root off list
429 KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
430 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
431 this_thr->th.th_cg_roots = tmp->up;
432 KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
433 " to node %p. cg_nthreads was %d\n",
434 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
435 KMP_DEBUG_ASSERT(tmp->cg_nthreads);
436 int i = tmp->cg_nthreads--;
437 if (i == 1) { // check if we are the last thread in CG (not always the case)
440 // Restore current task's thread_limit from CG root
441 KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
442 this_thr->th.th_current_task->td_icvs.thread_limit =
443 this_thr->th.th_cg_roots->cg_thread_limit;
445 this_thr->th.th_teams_microtask = NULL;
446 this_thr->th.th_teams_level = 0;
447 *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
449 #if KMP_STATS_ENABLED
450 if (previous_state == stats_state_e::SERIAL_REGION) {
451 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
453 KMP_POP_PARTITIONED_TIMER();
455 #endif // KMP_STATS_ENABLED
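/* A sketch of the corresponding lowering for a teams construct with clauses,
   following the same conventions as the __kmpc_fork_call example above
   (names illustrative):

     // Source: #pragma omp teams num_teams(2) thread_limit(8)

     static void teams_outlined(kmp_int32 *gtid, kmp_int32 *bound_tid) {
       teams_body();
     }

     void user_code(ident_t *loc) {
       kmp_int32 gtid = __kmpc_global_thread_num(loc);
       __kmpc_push_num_teams(loc, gtid, 2, 8); // num_teams=2, thread_limit=8
       __kmpc_fork_teams(loc, 0, (kmpc_micro)teams_outlined);
     }
*/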
458 // I don't think this function should ever have been exported.
459 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
460 // openmp code ever called it, but it's been exported from the RTL for so
461 // long that I'm afraid to remove the definition.
462 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
466 @param loc source location information
467 @param global_tid global thread number
469 Enter a serialized parallel construct. This interface is used to handle a
470 conditional parallel region, like this,
471 @code
472 #pragma omp parallel if (condition)
473 @endcode
474 when the condition is false.
476 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
477 // The implementation is now in kmp_runtime.cpp so that it can share static
478 // functions with kmp_fork_call since the tasks to be done are similar in
481 OMPT_STORE_RETURN_ADDRESS(global_tid);
483 __kmp_serialized_parallel(loc, global_tid);
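/* A sketch of the conditional lowering this entry point exists for, reusing
   the outlined routine from the __kmpc_fork_call example (illustrative):

     if (condition) {
       __kmpc_fork_call(loc, 1, (kmpc_micro)outlined, &x);
     } else {
       kmp_int32 gtid = __kmpc_global_thread_num(loc);
       __kmpc_serialized_parallel(loc, gtid);
       kmp_int32 bound_tid = 0;
       outlined(&gtid, &bound_tid, &x); // run the region body inline
       __kmpc_end_serialized_parallel(loc, gtid);
     }
*/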
488 @param loc source location information
489 @param global_tid global thread number
491 Leave a serialized parallel construct.
493 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494 kmp_internal_control_t *top;
495 kmp_info_t *this_thr;
496 kmp_team_t *serial_team;
499 ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
501 /* skip all this code for autopar serialized loops since it results in
502 unacceptable overhead */
503 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
507 if (!TCR_4(__kmp_init_parallel))
508 __kmp_parallel_initialize();
510 __kmp_resume_if_soft_paused();
512 this_thr = __kmp_threads[global_tid];
513 serial_team = this_thr->th.th_serial_team;
515 kmp_task_team_t *task_team = this_thr->th.th_task_team;
516 // we need to wait for the proxy tasks before finishing the thread
517 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
518 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
521 KMP_DEBUG_ASSERT(serial_team);
522 KMP_ASSERT(serial_team->t.t_serialized);
523 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
524 KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
525 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
526 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
529 if (ompt_enabled.enabled &&
530 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
531 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
532 if (ompt_enabled.ompt_callback_implicit_task) {
533 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
534 ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
535 OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
538 // reset/clear the task id only after unlinking the task
539 ompt_data_t *parent_task_data;
540 __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
542 if (ompt_enabled.ompt_callback_parallel_end) {
543 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
544 &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
545 ompt_parallel_invoker_program | ompt_parallel_team,
546 OMPT_LOAD_RETURN_ADDRESS(global_tid));
548 __ompt_lw_taskteam_unlink(this_thr);
549 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
553 /* If necessary, pop the internal control stack values and replace the team values */
555 top = serial_team->t.t_control_stack_top;
556 if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
557 copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
558 serial_team->t.t_control_stack_top = top->next;
562 // if( serial_team -> t.t_serialized > 1 )
563 serial_team->t.t_level--;
565 /* pop dispatch buffers stack */
566 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
568 dispatch_private_info_t *disp_buffer =
569 serial_team->t.t_dispatch->th_disp_buffer;
570 serial_team->t.t_dispatch->th_disp_buffer =
571 serial_team->t.t_dispatch->th_disp_buffer->next;
572 __kmp_free(disp_buffer);
574 this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
576 --serial_team->t.t_serialized;
577 if (serial_team->t.t_serialized == 0) {
579 /* return to the parallel section */
581 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
582 if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
583 __kmp_clear_x87_fpu_status_word();
584 __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
585 __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
587 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
589 this_thr->th.th_team = serial_team->t.t_parent;
590 this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
592 /* restore values cached in the thread */
593 this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
594 this_thr->th.th_team_master =
595 serial_team->t.t_parent->t.t_threads[0]; /* JPH */
596 this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
598 /* TODO the below shouldn't need to be adjusted for serialized teams */
599 this_thr->th.th_dispatch =
600 &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
602 __kmp_pop_current_task_from_thread(this_thr);
604 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
605 this_thr->th.th_current_task->td_flags.executing = 1;
607 if (__kmp_tasking_mode != tskm_immediate_exec) {
608 // Copy the task team from the new child / old parent team to the thread.
609 this_thr->th.th_task_team =
610 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
612 ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
614 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
617 if (__kmp_tasking_mode != tskm_immediate_exec) {
618 KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
619 "depth of serial team %p to %d\n",
620 global_tid, serial_team, serial_team->t.t_serialized));
624 if (__kmp_env_consistency_check)
625 __kmp_pop_parallel(global_tid, NULL);
627 if (ompt_enabled.enabled)
628 this_thr->th.ompt_thread_info.state =
629 ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
630 : ompt_state_work_parallel);
635 @ingroup SYNCHRONIZATION
636 @param loc source location information.
638 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though,
639 depending on the memory ordering convention obeyed by the compiler,
640 even that may not be necessary).
642 void __kmpc_flush(ident_t *loc) {
643 KC_TRACE(10, ("__kmpc_flush: called\n"));
645 /* need an explicit __mf() here since the library uses volatile instead of fences */
646 KMP_MB(); /* Flush all pending memory write invalidates. */
648 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
650 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
651 // We shouldn't need it, though, since the ABI rules require that
652 // * If the compiler generates NGO stores it also generates the fence
653 // * If users hand-code NGO stores they should insert the fence
654 // therefore no incomplete unordered stores should be visible.
657 // This is to address non-temporal store instructions (sfence needed).
658 // The clflush instruction is also covered (mfence needed).
659 // Probably the non-temporal load movntdqa instruction should also be
660 // addressed.
661 // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
662 if (!__kmp_cpuinfo.initialized) {
663 __kmp_query_cpuid(&__kmp_cpuinfo);
665 if (!__kmp_cpuinfo.sse2) {
666 // CPU cannot execute SSE2 instructions.
670 #elif KMP_COMPILER_MSVC
671 MemoryBarrier();
672 #else
673 __sync_synchronize();
674 #endif // KMP_COMPILER_ICC
677 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
679 // Nothing to see here; move along.
681 // Nothing needed here (we have a real MB above).
683 // The flushing thread needs to yield here; this prevents a
684 // busy-waiting thread from saturating the pipeline. flush is
685 // often used in loops like this:
687 // #pragma omp flush(flag)
689 // and adding the yield here is good for at least a 10x speedup
690 // when running >2 threads per core (on the NAS LU benchmark).
694 #error Unknown or unsupported architecture
697 #if OMPT_SUPPORT && OMPT_OPTIONAL
698 if (ompt_enabled.ompt_callback_flush) {
699 ompt_callbacks.ompt_callback(ompt_callback_flush)(
700 __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
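/* Lowering sketch: a flush directive becomes a single call, and a flush with
   a list compiles to the same entry point since the runtime issues a full
   fence regardless:

     // #pragma omp flush        ->  __kmpc_flush(loc);
     // #pragma omp flush(flag)  ->  __kmpc_flush(loc);
*/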
705 /* -------------------------------------------------------------------------- */
707 @ingroup SYNCHRONIZATION
708 @param loc source location information
709 @param global_tid thread id.
711 Execute a barrier.
713 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
714 KMP_COUNT_BLOCK(OMP_BARRIER);
715 KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
717 if (!TCR_4(__kmp_init_parallel))
718 __kmp_parallel_initialize();
720 __kmp_resume_if_soft_paused();
722 if (__kmp_env_consistency_check) {
724 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
726 __kmp_check_barrier(global_tid, ct_barrier, loc);
730 ompt_frame_t *ompt_frame;
731 if (ompt_enabled.enabled) {
732 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
733 if (ompt_frame->enter_frame.ptr == NULL)
734 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
735 OMPT_STORE_RETURN_ADDRESS(global_tid);
738 __kmp_threads[global_tid]->th.th_ident = loc;
739 // TODO: explicit barrier_wait_id:
740 // this function is called when 'barrier' directive is present or
741 // implicit barrier at the end of a worksharing construct.
742 // 1) better to add a per-thread barrier counter to a thread data structure
743 // 2) set to 0 when a new team is created
744 // 3) no sync is required
746 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
747 #if OMPT_SUPPORT && OMPT_OPTIONAL
748 if (ompt_enabled.enabled) {
749 ompt_frame->enter_frame = ompt_data_none;
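/* Lowering sketch (gtid typically cached by the compiler; names
   illustrative):

     // #pragma omp barrier
     __kmpc_barrier(loc, gtid);
*/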
754 /* The BARRIER for a MASTER section is always explicit */
756 @ingroup WORK_SHARING
757 @param loc source location information.
758 @param global_tid global thread number.
759 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
761 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
764 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
766 if (!TCR_4(__kmp_init_parallel))
767 __kmp_parallel_initialize();
769 __kmp_resume_if_soft_paused();
771 if (KMP_MASTER_GTID(global_tid)) {
772 KMP_COUNT_BLOCK(OMP_MASTER);
773 KMP_PUSH_PARTITIONED_TIMER(OMP_master);
777 #if OMPT_SUPPORT && OMPT_OPTIONAL
779 if (ompt_enabled.ompt_callback_master) {
780 kmp_info_t *this_thr = __kmp_threads[global_tid];
781 kmp_team_t *team = this_thr->th.th_team;
783 int tid = __kmp_tid_from_gtid(global_tid);
784 ompt_callbacks.ompt_callback(ompt_callback_master)(
785 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
786 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
787 OMPT_GET_RETURN_ADDRESS(0));
792 if (__kmp_env_consistency_check) {
793 #if KMP_USE_DYNAMIC_LOCK
795 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
797 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
800 __kmp_push_sync(global_tid, ct_master, loc, NULL);
802 __kmp_check_sync(global_tid, ct_master, loc, NULL);
810 @ingroup WORK_SHARING
811 @param loc source location information.
812 @param global_tid global thread number.
814 Mark the end of a <tt>master</tt> region. This should only be called by the
815 thread that executes the <tt>master</tt> region.
817 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
818 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
820 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
821 KMP_POP_PARTITIONED_TIMER();
823 #if OMPT_SUPPORT && OMPT_OPTIONAL
824 kmp_info_t *this_thr = __kmp_threads[global_tid];
825 kmp_team_t *team = this_thr->th.th_team;
826 if (ompt_enabled.ompt_callback_master) {
827 int tid = __kmp_tid_from_gtid(global_tid);
828 ompt_callbacks.ompt_callback(ompt_callback_master)(
829 ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
830 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
831 OMPT_GET_RETURN_ADDRESS(0));
835 if (__kmp_env_consistency_check) {
836 if (global_tid < 0)
837 KMP_WARNING(ThreadIdentInvalid);
839 if (KMP_MASTER_GTID(global_tid))
840 __kmp_pop_sync(global_tid, ct_master, loc);
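/* Lowering sketch for a master region: the compiler guards the body with the
   return value, so only the executing thread calls the "end" routine (names
   illustrative):

     if (__kmpc_master(loc, gtid)) {
       master_only_body();
       __kmpc_end_master(loc, gtid);
     }
*/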
845 @ingroup WORK_SHARING
846 @param loc source location information.
847 @param gtid global thread number.
849 Start execution of an <tt>ordered</tt> construct.
851 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
854 KMP_DEBUG_ASSERT(__kmp_init_serial);
856 KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
858 if (!TCR_4(__kmp_init_parallel))
859 __kmp_parallel_initialize();
861 __kmp_resume_if_soft_paused();
864 __kmp_itt_ordered_prep(gtid);
865 // TODO: ordered_wait_id
866 #endif /* USE_ITT_BUILD */
868 th = __kmp_threads[gtid];
870 #if OMPT_SUPPORT && OMPT_OPTIONAL
874 if (ompt_enabled.enabled) {
875 OMPT_STORE_RETURN_ADDRESS(gtid);
876 team = __kmp_team_from_gtid(gtid);
877 lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
878 /* OMPT state update */
879 th->th.ompt_thread_info.wait_id = lck;
880 th->th.ompt_thread_info.state = ompt_state_wait_ordered;
882 /* OMPT event callback */
883 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
884 if (ompt_enabled.ompt_callback_mutex_acquire) {
885 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
886 ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
892 if (th->th.th_dispatch->th_deo_fcn != 0)
893 (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
894 else
895 __kmp_parallel_deo(&gtid, &cid, loc);
897 #if OMPT_SUPPORT && OMPT_OPTIONAL
898 if (ompt_enabled.enabled) {
899 /* OMPT state update */
900 th->th.ompt_thread_info.state = ompt_state_work_parallel;
901 th->th.ompt_thread_info.wait_id = 0;
903 /* OMPT event callback */
904 if (ompt_enabled.ompt_callback_mutex_acquired) {
905 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
906 ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
912 __kmp_itt_ordered_start(gtid);
913 #endif /* USE_ITT_BUILD */
917 @ingroup WORK_SHARING
918 @param loc source location information.
919 @param gtid global thread number.
921 End execution of an <tt>ordered</tt> construct.
923 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
927 KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
930 __kmp_itt_ordered_end(gtid);
931 // TODO: ordered_wait_id
932 #endif /* USE_ITT_BUILD */
934 th = __kmp_threads[gtid];
936 if (th->th.th_dispatch->th_dxo_fcn != 0)
937 (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
938 else
939 __kmp_parallel_dxo(&gtid, &cid, loc);
941 #if OMPT_SUPPORT && OMPT_OPTIONAL
942 OMPT_STORE_RETURN_ADDRESS(gtid);
943 if (ompt_enabled.ompt_callback_mutex_released) {
944 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
946 (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
947 ->t.t_ordered.dt.t_value,
948 OMPT_LOAD_RETURN_ADDRESS(gtid));
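/* Lowering sketch for an ordered region inside a loop whose schedule was set
   up with an ordered-aware init/dispatch call (names illustrative):

     __kmpc_ordered(loc, gtid);
     cross_iteration_ordered_body();
     __kmpc_end_ordered(loc, gtid);
*/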
953 #if KMP_USE_DYNAMIC_LOCK
955 static __forceinline void
956 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
957 kmp_int32 gtid, kmp_indirect_locktag_t tag) {
958 // Pointer to the allocated indirect lock is written to crit, while indexing
959 // is ignored.
961 kmp_indirect_lock_t **lck;
962 lck = (kmp_indirect_lock_t **)crit;
963 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
964 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
965 KMP_SET_I_LOCK_LOCATION(ilk, loc);
966 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
968 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
970 __kmp_itt_critical_creating(ilk->lock, loc);
972 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
975 __kmp_itt_critical_destroyed(ilk->lock);
977 // We don't really need to destroy the unclaimed lock here since it will be
978 // cleaned up at program exit.
979 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
981 KMP_DEBUG_ASSERT(*lck != NULL);
984 // Fast-path acquire tas lock
985 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
987 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
988 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
989 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
990 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
991 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
993 KMP_FSYNC_PREPARE(l); \
994 KMP_INIT_YIELD(spins); \
995 kmp_backoff_t backoff = __kmp_spin_backoff_params; \
997 if (TCR_4(__kmp_nth) > \
998 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
1001 KMP_YIELD_SPIN(spins); \
1003 __kmp_spin_backoff(&backoff); \
1005 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
1006 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \
1008 KMP_FSYNC_ACQUIRED(l); \
1011 // Fast-path test tas lock
1012 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
1014 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
1015 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
1016 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
1017 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
1018 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
1021 // Fast-path release tas lock
1022 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \
1023 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
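/* For reference, a stand-alone C11 rendering of the same fast path: a relaxed
   load screens the lock word before the acquire compare-exchange, which keeps
   spinning threads from ping-ponging the cache line. The sched_yield() is a
   stand-in for the KMP_YIELD_SPIN/backoff machinery above; all names are
   illustrative.

     #include <sched.h>
     #include <stdatomic.h>

     typedef struct { atomic_int poll; } tas_lock_t; // 0 == free

     static void tas_acquire(tas_lock_t *l, int gtid) {
       const int tas_free = 0;
       const int tas_busy = gtid + 1; // nonzero owner tag, as above
       for (;;) {
         int expected = tas_free;
         if (atomic_load_explicit(&l->poll, memory_order_relaxed) ==
                 tas_free &&
             atomic_compare_exchange_strong_explicit(
                 &l->poll, &expected, tas_busy, memory_order_acquire,
                 memory_order_relaxed))
           return; // lock acquired
         sched_yield();
       }
     }

     static void tas_release(tas_lock_t *l) {
       atomic_store_explicit(&l->poll, 0, memory_order_release);
     }
*/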
1027 #include <sys/syscall.h>
1030 #define FUTEX_WAIT 0
1033 #define FUTEX_WAKE 1
1036 // Fast-path acquire futex lock
1037 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
1039 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1040 kmp_int32 gtid_code = (gtid + 1) << 1; \
1042 KMP_FSYNC_PREPARE(ftx); \
1043 kmp_int32 poll_val; \
1044 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
1045 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1046 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
1047 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
1049 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
1050 poll_val | \
1051 KMP_LOCK_BUSY(1, futex))) { \
1054 poll_val |= KMP_LOCK_BUSY(1, futex); \
1057 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
1058 NULL, NULL, 0)) != 0) { \
1063 KMP_FSYNC_ACQUIRED(ftx); \
1066 // Fast-path test futex lock
1067 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
1069 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1070 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1071 KMP_LOCK_BUSY((gtid + 1) << 1, futex))) { \
1072 KMP_FSYNC_ACQUIRED(ftx); \
1079 // Fast-path release futex lock
1080 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
1082 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1084 KMP_FSYNC_RELEASING(ftx); \
1085 kmp_int32 poll_val = \
1086 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
1087 if (KMP_LOCK_STRIP(poll_val) & 1) { \
1088 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
1089 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1092 KMP_YIELD_OVERSUB(); \
1095 #endif // KMP_USE_FUTEX
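/* For reference, a minimal stand-alone futex mutex in the same spirit, using
   the classic three-state scheme (0 = free, 1 = held, 2 = held with possible
   waiters). The runtime's encoding differs: it packs the owner gtid into the
   word and uses the low bit as the waiter flag.

     #include <linux/futex.h>
     #include <stdatomic.h>
     #include <sys/syscall.h>
     #include <unistd.h>

     static atomic_int lk; // 0 free, 1 held, 2 held + waiters

     static void futex_lock(void) {
       int c = 0;
       if (atomic_compare_exchange_strong(&lk, &c, 1))
         return; // uncontended fast path, no syscall
       if (c != 2)
         c = atomic_exchange(&lk, 2); // announce that a waiter exists
       while (c != 0) {
         syscall(SYS_futex, &lk, FUTEX_WAIT, 2, NULL, NULL, 0);
         c = atomic_exchange(&lk, 2);
       }
     }

     static void futex_unlock(void) {
       if (atomic_exchange(&lk, 0) == 2) // a waiter may be sleeping
         syscall(SYS_futex, &lk, FUTEX_WAKE, 1, NULL, NULL, 0);
     }
*/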
1097 #else // KMP_USE_DYNAMIC_LOCK
1099 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1102 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1104 // Because of the double-check, the following load doesn't need to be volatile
1105 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1110 // Allocate & initialize the lock.
1111 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1112 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1113 __kmp_init_user_lock_with_checks(lck);
1114 __kmp_set_user_lock_location(lck, loc);
1116 __kmp_itt_critical_creating(lck);
1117 // __kmp_itt_critical_creating() should be called *before* the first usage
1118 // of the underlying lock. It is the only place where we can guarantee it. There
1119 // is a chance the lock will be destroyed with no usage, but it is not a
1120 // problem, because this is not a real event seen by the user but rather the
1121 // setting of a name for the object (lock). See more details in kmp_itt.h.
1122 #endif /* USE_ITT_BUILD */
1124 // Use a cmpxchg instruction to slam the start of the critical section with
1125 // the lock pointer. If another thread beat us to it, deallocate the lock,
1126 // and use the lock that the other thread allocated.
1127 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1130 // Deallocate the lock and reload the value.
1132 __kmp_itt_critical_destroyed(lck);
1133 // Let ITT know the lock is destroyed and the same memory location may be reused
1134 // for another purpose.
1135 #endif /* USE_ITT_BUILD */
1136 __kmp_destroy_user_lock_with_checks(lck);
1137 __kmp_user_lock_free(&idx, gtid, lck);
1138 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1139 KMP_DEBUG_ASSERT(lck != NULL);
1145 #endif // KMP_USE_DYNAMIC_LOCK
1148 @ingroup WORK_SHARING
1149 @param loc source location information.
1150 @param global_tid global thread number.
1151 @param crit identity of the critical section. This could be a pointer to a lock
1152 associated with the critical section, or some other suitably unique value.
1154 Enter code protected by a `critical` construct.
1155 This function blocks until the executing thread can enter the critical section.
1157 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1158 kmp_critical_name *crit) {
1159 #if KMP_USE_DYNAMIC_LOCK
1160 #if OMPT_SUPPORT && OMPT_OPTIONAL
1161 OMPT_STORE_RETURN_ADDRESS(global_tid);
1162 #endif // OMPT_SUPPORT
1163 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1165 KMP_COUNT_BLOCK(OMP_CRITICAL);
1166 #if OMPT_SUPPORT && OMPT_OPTIONAL
1167 ompt_state_t prev_state = ompt_state_undefined;
1168 ompt_thread_info_t ti;
1170 kmp_user_lock_p lck;
1172 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1174 // TODO: add THR_OVHD_STATE
1176 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1177 KMP_CHECK_USER_LOCK_INIT();
1179 if ((__kmp_user_lock_kind == lk_tas) &&
1180 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1181 lck = (kmp_user_lock_p)crit;
1184 else if ((__kmp_user_lock_kind == lk_futex) &&
1185 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1186 lck = (kmp_user_lock_p)crit;
1189 else { // ticket, queuing or drdpa
1190 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1193 if (__kmp_env_consistency_check)
1194 __kmp_push_sync(global_tid, ct_critical, loc, lck);
1196 // Since the critical directive binds to all threads, not just the current
1197 // team, we have to check this even if we are in a serialized team.
1198 // Also, even if we are the uber thread, we still have to acquire the lock,
1199 // as we have to contend with sibling threads.
1202 __kmp_itt_critical_acquiring(lck);
1203 #endif /* USE_ITT_BUILD */
1204 #if OMPT_SUPPORT && OMPT_OPTIONAL
1205 OMPT_STORE_RETURN_ADDRESS(global_tid);
1206 void *codeptr_ra = NULL;
1207 if (ompt_enabled.enabled) {
1208 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1209 /* OMPT state update */
1210 prev_state = ti.state;
1211 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1212 ti.state = ompt_state_wait_critical;
1214 /* OMPT event callback */
1215 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1216 if (ompt_enabled.ompt_callback_mutex_acquire) {
1217 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1218 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1219 (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1223 // Value of 'crit' should be good for using as a critical_id of the critical
1224 // section directive.
1225 __kmp_acquire_user_lock_with_checks(lck, global_tid);
1228 __kmp_itt_critical_acquired(lck);
1229 #endif /* USE_ITT_BUILD */
1230 #if OMPT_SUPPORT && OMPT_OPTIONAL
1231 if (ompt_enabled.enabled) {
1232 /* OMPT state update */
1233 ti.state = prev_state;
1236 /* OMPT event callback */
1237 if (ompt_enabled.ompt_callback_mutex_acquired) {
1238 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1239 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1243 KMP_POP_PARTITIONED_TIMER();
1245 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1246 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1247 #endif // KMP_USE_DYNAMIC_LOCK
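/* Lowering sketch for a named critical section: the compiler emits one
   zero-initialized kmp_critical_name per name and brackets the protected body
   with the enter/exit calls (names illustrative):

     // #pragma omp critical (io)
     static kmp_critical_name crit_io; // per-name lock storage, zeroed

     void user_code(ident_t *loc, kmp_int32 gtid) {
       __kmpc_critical(loc, gtid, &crit_io);
       protected_body();
       __kmpc_end_critical(loc, gtid, &crit_io);
     }
*/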
1250 #if KMP_USE_DYNAMIC_LOCK
1252 // Converts the given hint to an internal lock implementation
1253 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1255 #define KMP_TSX_LOCK(seq) lockseq_##seq
1257 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1260 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1261 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1263 #define KMP_CPUINFO_RTM 0
1266 // Hints that do not require further logic
1267 if (hint & kmp_lock_hint_hle)
1268 return KMP_TSX_LOCK(hle);
1269 if (hint & kmp_lock_hint_rtm)
1270 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1271 if (hint & kmp_lock_hint_adaptive)
1272 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1274 // Rule out conflicting hints first by returning the default lock
1275 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1276 return __kmp_user_lock_seq;
1277 if ((hint & omp_lock_hint_speculative) &&
1278 (hint & omp_lock_hint_nonspeculative))
1279 return __kmp_user_lock_seq;
1281 // Do not even consider speculation when it appears to be contended
1282 if (hint & omp_lock_hint_contended)
1283 return lockseq_queuing;
1285 // Uncontended lock without speculation
1286 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1287 return lockseq_tas;
1289 // HLE lock for speculation
1290 if (hint & omp_lock_hint_speculative)
1291 return KMP_TSX_LOCK(hle);
1293 return __kmp_user_lock_seq;
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297 #if KMP_USE_DYNAMIC_LOCK
1298 static kmp_mutex_impl_t
1299 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1301 switch (KMP_EXTRACT_D_TAG(user_lock)) {
1306 return kmp_mutex_impl_queuing;
1309 return kmp_mutex_impl_spin;
1312 return kmp_mutex_impl_speculative;
1315 return kmp_mutex_impl_none;
1317 ilock = KMP_LOOKUP_I_LOCK(user_lock);
1320 switch (ilock->type) {
1322 case locktag_adaptive:
1324 return kmp_mutex_impl_speculative;
1326 case locktag_nested_tas:
1327 return kmp_mutex_impl_spin;
1329 case locktag_nested_futex:
1331 case locktag_ticket:
1332 case locktag_queuing:
1334 case locktag_nested_ticket:
1335 case locktag_nested_queuing:
1336 case locktag_nested_drdpa:
1337 return kmp_mutex_impl_queuing;
1339 return kmp_mutex_impl_none;
1343 // For locks without dynamic binding
1344 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1345 switch (__kmp_user_lock_kind) {
1347 return kmp_mutex_impl_spin;
1354 return kmp_mutex_impl_queuing;
1359 return kmp_mutex_impl_speculative;
1362 return kmp_mutex_impl_none;
1365 #endif // KMP_USE_DYNAMIC_LOCK
1366 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1369 @ingroup WORK_SHARING
1370 @param loc source location information.
1371 @param global_tid global thread number.
1372 @param crit identity of the critical section. This could be a pointer to a lock
1373 associated with the critical section, or some other suitably unique value.
1374 @param hint the lock hint.
1376 Enter code protected by a `critical` construct with a hint. The hint value is
1377 used to suggest a lock implementation. This function blocks until the executing
1378 thread can enter the critical section unless the hint suggests use of
1379 speculative execution and the hardware supports it.
1381 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1382 kmp_critical_name *crit, uint32_t hint) {
1383 KMP_COUNT_BLOCK(OMP_CRITICAL);
1384 kmp_user_lock_p lck;
1385 #if OMPT_SUPPORT && OMPT_OPTIONAL
1386 ompt_state_t prev_state = ompt_state_undefined;
1387 ompt_thread_info_t ti;
1388 // This is the case if called from __kmpc_critical:
1389 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1390 if (!codeptr)
1391 codeptr = OMPT_GET_RETURN_ADDRESS(0);
1394 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1396 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1397 // Check if it is initialized.
1398 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1400 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1401 if (KMP_IS_D_LOCK(lckseq)) {
1402 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1403 KMP_GET_D_TAG(lckseq));
1405 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1408 // Branch for accessing the actual lock object and set operation. This
1409 // branching is inevitable since this lock initialization does not follow the
1410 // normal dispatch path (lock table is not used).
1411 if (KMP_EXTRACT_D_TAG(lk) != 0) {
1412 lck = (kmp_user_lock_p)lk;
1413 if (__kmp_env_consistency_check) {
1414 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1415 __kmp_map_hint_to_lock(hint));
1418 __kmp_itt_critical_acquiring(lck);
1420 #if OMPT_SUPPORT && OMPT_OPTIONAL
1421 if (ompt_enabled.enabled) {
1422 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1423 /* OMPT state update */
1424 prev_state = ti.state;
1425 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1426 ti.state = ompt_state_wait_critical;
1428 /* OMPT event callback */
1429 if (ompt_enabled.ompt_callback_mutex_acquire) {
1430 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1431 ompt_mutex_critical, (unsigned int)hint,
1432 __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1437 #if KMP_USE_INLINED_TAS
1438 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1439 KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1441 #elif KMP_USE_INLINED_FUTEX
1442 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1443 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1447 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1450 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1452 if (__kmp_env_consistency_check) {
1453 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1454 __kmp_map_hint_to_lock(hint));
1457 __kmp_itt_critical_acquiring(lck);
1459 #if OMPT_SUPPORT && OMPT_OPTIONAL
1460 if (ompt_enabled.enabled) {
1461 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1462 /* OMPT state update */
1463 prev_state = ti.state;
1464 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1465 ti.state = ompt_state_wait_critical;
1467 /* OMPT event callback */
1468 if (ompt_enabled.ompt_callback_mutex_acquire) {
1469 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1470 ompt_mutex_critical, (unsigned int)hint,
1471 __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1476 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1478 KMP_POP_PARTITIONED_TIMER();
1481 __kmp_itt_critical_acquired(lck);
1482 #endif /* USE_ITT_BUILD */
1483 #if OMPT_SUPPORT && OMPT_OPTIONAL
1484 if (ompt_enabled.enabled) {
1485 /* OMPT state update */
1486 ti.state = prev_state;
1489 /* OMPT event callback */
1490 if (ompt_enabled.ompt_callback_mutex_acquired) {
1491 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1492 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1497 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1498 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1499 } // __kmpc_critical_with_hint
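/* Lowering sketch for the hinted form; the constant comes straight from the
   source-level hint clause (names illustrative):

     // #pragma omp critical (io) hint(omp_lock_hint_speculative)
     __kmpc_critical_with_hint(loc, gtid, &crit_io, omp_lock_hint_speculative);
     protected_body();
     __kmpc_end_critical(loc, gtid, &crit_io);
*/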
1501 #endif // KMP_USE_DYNAMIC_LOCK
1504 @ingroup WORK_SHARING
1505 @param loc source location information.
1506 @param global_tid global thread number.
1507 @param crit identity of the critical section. This could be a pointer to a lock
1508 associated with the critical section, or some other suitably unique value.
1510 Leave a critical section, releasing any lock that was held during its execution.
1512 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1513 kmp_critical_name *crit) {
1514 kmp_user_lock_p lck;
1516 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1518 #if KMP_USE_DYNAMIC_LOCK
1519 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1520 lck = (kmp_user_lock_p)crit;
1521 KMP_ASSERT(lck != NULL);
1522 if (__kmp_env_consistency_check) {
1523 __kmp_pop_sync(global_tid, ct_critical, loc);
1526 __kmp_itt_critical_releasing(lck);
1528 #if KMP_USE_INLINED_TAS
1529 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1530 KMP_RELEASE_TAS_LOCK(lck, global_tid);
1532 #elif KMP_USE_INLINED_FUTEX
1533 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1534 KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1538 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1541 kmp_indirect_lock_t *ilk =
1542 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1543 KMP_ASSERT(ilk != NULL);
1545 if (__kmp_env_consistency_check) {
1546 __kmp_pop_sync(global_tid, ct_critical, loc);
1549 __kmp_itt_critical_releasing(lck);
1551 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1554 #else // KMP_USE_DYNAMIC_LOCK
1556 if ((__kmp_user_lock_kind == lk_tas) &&
1557 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1558 lck = (kmp_user_lock_p)crit;
1561 else if ((__kmp_user_lock_kind == lk_futex) &&
1562 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1563 lck = (kmp_user_lock_p)crit;
1566 else { // ticket, queuing or drdpa
1567 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1570 KMP_ASSERT(lck != NULL);
1572 if (__kmp_env_consistency_check)
1573 __kmp_pop_sync(global_tid, ct_critical, loc);
1576 __kmp_itt_critical_releasing(lck);
1577 #endif /* USE_ITT_BUILD */
1578 // Value of 'crit' should be good for using as a critical_id of the critical
1579 // section directive.
1580 __kmp_release_user_lock_with_checks(lck, global_tid);
1582 #endif // KMP_USE_DYNAMIC_LOCK
1584 #if OMPT_SUPPORT && OMPT_OPTIONAL
1585 /* OMPT release event triggers after lock is released; place here to trigger
1586 * for all #if branches */
1587 OMPT_STORE_RETURN_ADDRESS(global_tid);
1588 if (ompt_enabled.ompt_callback_mutex_released) {
1589 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1590 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1591 OMPT_LOAD_RETURN_ADDRESS(0));
1595 KMP_POP_PARTITIONED_TIMER();
1596 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1600 @ingroup SYNCHRONIZATION
1601 @param loc source location information
1602 @param global_tid thread id.
1603 @return one if the thread should execute the master block, zero otherwise
1605 Start execution of a combined barrier and master. The barrier is executed inside
1606 this function.
1608 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1611 KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1613 if (!TCR_4(__kmp_init_parallel))
1614 __kmp_parallel_initialize();
1616 __kmp_resume_if_soft_paused();
1618 if (__kmp_env_consistency_check)
1619 __kmp_check_barrier(global_tid, ct_barrier, loc);
1622 ompt_frame_t *ompt_frame;
1623 if (ompt_enabled.enabled) {
1624 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1625 if (ompt_frame->enter_frame.ptr == NULL)
1626 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1627 OMPT_STORE_RETURN_ADDRESS(global_tid);
1631 __kmp_threads[global_tid]->th.th_ident = loc;
1633 status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1634 #if OMPT_SUPPORT && OMPT_OPTIONAL
1635 if (ompt_enabled.enabled) {
1636 ompt_frame->enter_frame = ompt_data_none;
1640 return (status != 0) ? 0 : 1;
1644 @ingroup SYNCHRONIZATION
1645 @param loc source location information
1646 @param global_tid thread id.
1648 Complete the execution of a combined barrier and master. This function should
1649 only be called at the completion of the <tt>master</tt> code. Other threads will
1650 still be waiting at the barrier and this call releases them.
1652 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1653 KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1655 __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1659 @ingroup SYNCHRONIZATION
1660 @param loc source location information
1661 @param global_tid thread id.
1662 @return one if the thread should execute the master block, zero otherwise
1664 Start execution of a combined barrier and master(nowait) construct.
1665 The barrier is executed inside this function.
1666 There is no equivalent "end" function, since the
1667 end-of-master actions are performed inside this function itself.
1668 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1671 KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1673 if (!TCR_4(__kmp_init_parallel))
1674 __kmp_parallel_initialize();
1676 __kmp_resume_if_soft_paused();
1678 if (__kmp_env_consistency_check) {
1680 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1682 __kmp_check_barrier(global_tid, ct_barrier, loc);
1686 ompt_frame_t *ompt_frame;
1687 if (ompt_enabled.enabled) {
1688 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1689 if (ompt_frame->enter_frame.ptr == NULL)
1690 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1691 OMPT_STORE_RETURN_ADDRESS(global_tid);
1695 __kmp_threads[global_tid]->th.th_ident = loc;
1697 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1698 #if OMPT_SUPPORT && OMPT_OPTIONAL
1699 if (ompt_enabled.enabled) {
1700 ompt_frame->enter_frame = ompt_data_none;
1704 ret = __kmpc_master(loc, global_tid);
1706 if (__kmp_env_consistency_check) {
1707 /* there's no __kmpc_end_master called; so the (stats) */
1708 /* actions of __kmpc_end_master are done here */
1710 if (global_tid < 0) {
1711 KMP_WARNING(ThreadIdentInvalid);
1714 /* only one thread should do the pop since only */
1715 /* one did the push (see __kmpc_master()) */
1717 __kmp_pop_sync(global_tid, ct_master, loc);
1724 /* The BARRIER for a SINGLE process section is always explicit */
1726 @ingroup WORK_SHARING
1727 @param loc source location information
1728 @param global_tid global thread number
1729 @return One if this thread should execute the single construct, zero otherwise.
1731 Test whether to execute a <tt>single</tt> construct.
1732 There are no implicit barriers in the two "single" calls; rather, the compiler
1733 should introduce an explicit barrier if it is required.
1736 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1737 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1740 // We are going to execute the single statement, so we should count it.
1741 KMP_COUNT_BLOCK(OMP_SINGLE);
1742 KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1745 #if OMPT_SUPPORT && OMPT_OPTIONAL
1746 kmp_info_t *this_thr = __kmp_threads[global_tid];
1747 kmp_team_t *team = this_thr->th.th_team;
1748 int tid = __kmp_tid_from_gtid(global_tid);
1750 if (ompt_enabled.enabled) {
1752 if (ompt_enabled.ompt_callback_work) {
1753 ompt_callbacks.ompt_callback(ompt_callback_work)(
1754 ompt_work_single_executor, ompt_scope_begin,
1755 &(team->t.ompt_team_info.parallel_data),
1756 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1757 1, OMPT_GET_RETURN_ADDRESS(0));
1760 if (ompt_enabled.ompt_callback_work) {
1761 ompt_callbacks.ompt_callback(ompt_callback_work)(
1762 ompt_work_single_other, ompt_scope_begin,
1763 &(team->t.ompt_team_info.parallel_data),
1764 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1765 1, OMPT_GET_RETURN_ADDRESS(0));
1766 ompt_callbacks.ompt_callback(ompt_callback_work)(
1767 ompt_work_single_other, ompt_scope_end,
1768 &(team->t.ompt_team_info.parallel_data),
1769 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1770 1, OMPT_GET_RETURN_ADDRESS(0));
1780 @ingroup WORK_SHARING
1781 @param loc source location information
1782 @param global_tid global thread number
1784 Mark the end of a <tt>single</tt> construct. This function should
1785 only be called by the thread that executed the block of code protected
1786 by the `single` construct.
1788 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1789 __kmp_exit_single(global_tid);
1790 KMP_POP_PARTITIONED_TIMER();
1792 #if OMPT_SUPPORT && OMPT_OPTIONAL
1793 kmp_info_t *this_thr = __kmp_threads[global_tid];
1794 kmp_team_t *team = this_thr->th.th_team;
1795 int tid = __kmp_tid_from_gtid(global_tid);
1797 if (ompt_enabled.ompt_callback_work) {
1798 ompt_callbacks.ompt_callback(ompt_callback_work)(
1799 ompt_work_single_executor, ompt_scope_end,
1800 &(team->t.ompt_team_info.parallel_data),
1801 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1802 OMPT_GET_RETURN_ADDRESS(0));
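/* Lowering sketch for a single construct; per the note above, the barrier is
   emitted separately by the compiler when no nowait clause is present (names
   illustrative):

     if (__kmpc_single(loc, gtid)) {
       single_body(); // executed by exactly one thread
       __kmpc_end_single(loc, gtid);
     }
     __kmpc_barrier(loc, gtid); // omitted for "single nowait"
*/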
1808 @ingroup WORK_SHARING
1809 @param loc Source location
1810 @param global_tid Global thread id
1812 Mark the end of a statically scheduled loop.
1814 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1815 KMP_POP_PARTITIONED_TIMER();
1816 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1818 #if OMPT_SUPPORT && OMPT_OPTIONAL
1819 if (ompt_enabled.ompt_callback_work) {
1820 ompt_work_t ompt_work_type = ompt_work_loop;
1821 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1822 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1823 // Determine workshare type
1825 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1826 ompt_work_type = ompt_work_loop;
1827 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1828 ompt_work_type = ompt_work_sections;
1829 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1830 ompt_work_type = ompt_work_distribute;
1832 // use default set above.
1833 // a warning about this case is provided in __kmpc_for_static_init
1835 KMP_DEBUG_ASSERT(ompt_work_type);
1837 ompt_callbacks.ompt_callback(ompt_callback_work)(
1838 ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1839 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1842 if (__kmp_env_consistency_check)
1843 __kmp_pop_workshare(global_tid, ct_pdo, loc);
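/* Lowering sketch for a statically scheduled loop over 0..99, using the
   matching init entry point defined elsewhere in the runtime (schedule
   constant and names illustrative):

     kmp_int32 lower = 0, upper = 99, stride = 1, last = 0;
     __kmpc_for_static_init_4(loc, gtid, kmp_sch_static, &last, &lower,
                              &upper, &stride, 1, 0); // incr=1, chunk=0
     for (kmp_int32 i = lower; i <= upper; ++i)
       loop_body(i);
     __kmpc_for_static_fini(loc, gtid);
     __kmpc_barrier(loc, gtid); // unless nowait was specified
*/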
1846 // User routines which take C-style arguments (call by value),
1847 // unlike the equivalent Fortran routines, which take their arguments by reference
1849 void ompc_set_num_threads(int arg) {
1850 // !!!!! TODO: check the per-task binding
1851 __kmp_set_num_threads(arg, __kmp_entry_gtid());
1854 void ompc_set_dynamic(int flag) {
1857 /* For the thread-private implementation of the internal controls */
1858 thread = __kmp_entry_thread();
1860 __kmp_save_internal_controls(thread);
1862 set__dynamic(thread, flag ? TRUE : FALSE);
1865 void ompc_set_nested(int flag) {
1868 /* For the thread-private internal controls implementation */
1869 thread = __kmp_entry_thread();
1871 __kmp_save_internal_controls(thread);
1873 set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1876 void ompc_set_max_active_levels(int max_active_levels) {
1878 /* we want per-task implementation of this internal control */
1880 /* For the per-thread internal controls implementation */
1881 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1884 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1885 // !!!!! TODO: check the per-task binding
1886 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1889 int ompc_get_ancestor_thread_num(int level) {
1890 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1893 int ompc_get_team_size(int level) {
1894 return __kmp_get_team_size(__kmp_entry_gtid(), level);
1897 /* OpenMP 5.0 Affinity Format API */
1899 void ompc_set_affinity_format(char const *format) {
1900 if (!__kmp_init_serial) {
1901 __kmp_serial_initialize();
1903 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1904 format, KMP_STRLEN(format) + 1);
1907 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1909 if (!__kmp_init_serial) {
1910 __kmp_serial_initialize();
1912 format_size = KMP_STRLEN(__kmp_affinity_format);
1913 if (buffer && size) {
1914 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1920 void ompc_display_affinity(char const *format) {
1922 if (!TCR_4(__kmp_init_middle)) {
1923 __kmp_middle_initialize();
1925 gtid = __kmp_get_gtid();
1926 __kmp_aux_display_affinity(gtid, format);
1929 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1930 char const *format) {
1932 size_t num_required;
1933 kmp_str_buf_t capture_buf;
1934 if (!TCR_4(__kmp_init_middle)) {
1935 __kmp_middle_initialize();
1937 gtid = __kmp_get_gtid();
1938 __kmp_str_buf_init(&capture_buf);
1939 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1940 if (buffer && buf_size) {
1941 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1942 capture_buf.used + 1);
1944 __kmp_str_buf_free(&capture_buf);
1945 return num_required;
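// A hedged usage sketch (user code, not part of this file): the value
// returned by ompc_capture_affinity / omp_capture_affinity is the number of
// characters required to hold the whole affinity string, so a caller can size
// a buffer and retry. Variable names below are hypothetical.
//
//   char buf[64];
//   size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
//   if (needed >= sizeof(buf)) {
//     // allocate needed + 1 bytes and call omp_capture_affinity again
//   }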
1948 void kmpc_set_stacksize(int arg) {
1949 // __kmp_aux_set_stacksize initializes the library if needed
1950 __kmp_aux_set_stacksize(arg);
1953 void kmpc_set_stacksize_s(size_t arg) {
1954 // __kmp_aux_set_stacksize initializes the library if needed
1955 __kmp_aux_set_stacksize(arg);
1958 void kmpc_set_blocktime(int arg) {
1959 int gtid, tid;
1960 kmp_info_t *thread;
1962 gtid = __kmp_entry_gtid();
1963 tid = __kmp_tid_from_gtid(gtid);
1964 thread = __kmp_thread_from_gtid(gtid);
1966 __kmp_aux_set_blocktime(arg, thread, tid);
1969 void kmpc_set_library(int arg) {
1970 // __kmp_user_set_library initializes the library if needed
1971 __kmp_user_set_library((enum library_type)arg);
1974 void kmpc_set_defaults(char const *str) {
1975 // __kmp_aux_set_defaults initializes the library if needed
1976 __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1979 void kmpc_set_disp_num_buffers(int arg) {
1980 // ignore after initialization because some teams have already
1981 // allocated dispatch buffers
1982 if (__kmp_init_serial == 0 && arg > 0)
1983 __kmp_dispatch_num_buffers = arg;
1986 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1987 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1988 return -1;
1989 #else
1990 if (!TCR_4(__kmp_init_middle)) {
1991 __kmp_middle_initialize();
1993 return __kmp_aux_set_affinity_mask_proc(proc, mask);
1994 #endif
1997 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1998 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1999 return -1;
2000 #else
2001 if (!TCR_4(__kmp_init_middle)) {
2002 __kmp_middle_initialize();
2004 return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2005 #endif
2008 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2009 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2010 return -1;
2011 #else
2012 if (!TCR_4(__kmp_init_middle)) {
2013 __kmp_middle_initialize();
2015 return __kmp_aux_get_affinity_mask_proc(proc, mask);
2016 #endif
2019 /* -------------------------------------------------------------------------- */
2021 @ingroup THREADPRIVATE
2022 @param loc source location information
2023 @param gtid global thread number
2024 @param cpy_size size of the cpy_data buffer
2025 @param cpy_data pointer to data to be copied
2026 @param cpy_func helper function to call for copying data
2027 @param didit flag variable: 1=single thread; 0=not single thread
2029 __kmpc_copyprivate implements the interface for the private data broadcast
2030 needed for the copyprivate clause associated with a single region in an
2031 OpenMP<sup>*</sup> program (both C and Fortran).
2032 All threads participating in the parallel region call this routine.
2033 One of the threads (called the single thread) should have the <tt>didit</tt>
2034 variable set to 1 and all other threads should have that variable set to 0.
2035 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2037 The OpenMP specification forbids the use of nowait on the single region when a
2038 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2039 barrier internally to avoid race conditions, so the code generation for the
2040 single region should avoid generating a barrier after the call to @ref
2041 __kmpc_copyprivate.
2043 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2044 The <tt>loc</tt> parameter is a pointer to source location information.
2046 Internal implementation: The single thread will first copy its descriptor
2047 address (cpy_data) to a team-private location, then the other threads will each
2048 call the function pointed to by the parameter cpy_func, which carries out the
2049 copy by copying the data using the cpy_data buffer.
2051 The cpy_func routine used for the copy and the contents of the data area defined
2052 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2053 to be done. For instance, the cpy_data buffer can hold the actual data to be
2054 copied or it may hold a list of pointers to the data. The cpy_func routine must
2055 interpret the cpy_data buffer appropriately.
2057 The interface to cpy_func is as follows:
2058 @code
2059 void cpy_func( void *destination, void *source )
2060 @endcode
2061 where void *destination is the cpy_data pointer for the thread being copied to
2062 and void *source is the cpy_data pointer for the thread being copied from.
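As an illustration (a hypothetical sketch, not runtime code), a cpy_func that
broadcasts a single int held directly in each thread's cpy_data buffer could
be:
@code
// destination: this thread's buffer; source: the single thread's buffer.
static void my_copy_func(void *destination, void *source) {
  *(int *)destination = *(int *)source;
}
@endcode
Each thread would then call
<tt>__kmpc_copyprivate(loc, gtid, sizeof(int), &my_data, my_copy_func, didit)</tt>
with <tt>didit</tt> set to 1 only on the thread that executed the single region;
<tt>my_data</tt> and <tt>my_copy_func</tt> are assumed names.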
2064 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2065 void *cpy_data, void (*cpy_func)(void *, void *),
2066 kmp_int32 didit) {
2067 void **data_ptr;
2069 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2073 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2075 if (__kmp_env_consistency_check) {
2076 if (loc == 0) {
2077 KMP_WARNING(ConstructIdentInvalid);
2081 // ToDo: Optimize the following two barriers into some kind of split barrier
2083 if (didit)
2084 *data_ptr = cpy_data;
2087 ompt_frame_t *ompt_frame;
2088 if (ompt_enabled.enabled) {
2089 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2090 if (ompt_frame->enter_frame.ptr == NULL)
2091 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2092 OMPT_STORE_RETURN_ADDRESS(gtid);
2095 /* This barrier is not a barrier region boundary */
2097 __kmp_threads[gtid]->th.th_ident = loc;
2099 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2101 if (!didit)
2102 (*cpy_func)(cpy_data, *data_ptr);
2104 // Consider next barrier a user-visible barrier for barrier region boundaries
2105 // Nesting checks are already handled by the single construct checks
2108 if (ompt_enabled.enabled) {
2109 OMPT_STORE_RETURN_ADDRESS(gtid);
2113 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2114 // tasks can overwrite the location)
2116 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2117 #if OMPT_SUPPORT && OMPT_OPTIONAL
2118 if (ompt_enabled.enabled) {
2119 ompt_frame->enter_frame = ompt_data_none;
2124 /* -------------------------------------------------------------------------- */
2126 #define INIT_LOCK __kmp_init_user_lock_with_checks
2127 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2128 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2129 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2130 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2131 #define ACQUIRE_NESTED_LOCK_TIMED \
2132 __kmp_acquire_nested_user_lock_with_checks_timed
2133 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2134 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2135 #define TEST_LOCK __kmp_test_user_lock_with_checks
2136 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2137 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2138 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2140 // TODO: Make check abort messages use location info & pass it into
2141 // with_checks routines
2143 #if KMP_USE_DYNAMIC_LOCK
2145 // internal lock initializer
2146 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2147 kmp_dyna_lockseq_t seq) {
2148 if (KMP_IS_D_LOCK(seq)) {
2149 KMP_INIT_D_LOCK(lock, seq);
2151 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2153 } else {
2154 KMP_INIT_I_LOCK(lock, seq);
2156 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2157 __kmp_itt_lock_creating(ilk->lock, loc);
2162 // internal nest lock initializer
2163 static __forceinline void
2164 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2165 kmp_dyna_lockseq_t seq) {
2167 // Don't have nested lock implementation for speculative locks
2168 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2169 seq = __kmp_user_lock_seq;
2171 switch (seq) {
2172 case lockseq_tas:
2173 seq = lockseq_nested_tas;
2174 break;
2176 case lockseq_futex:
2177 seq = lockseq_nested_futex;
2178 break;
2180 case lockseq_ticket:
2181 seq = lockseq_nested_ticket;
2182 break;
2183 case lockseq_queuing:
2184 seq = lockseq_nested_queuing;
2185 break;
2186 case lockseq_drdpa:
2187 seq = lockseq_nested_drdpa;
2188 break;
2189 default:
2190 seq = lockseq_nested_queuing;
2191 }
2192 KMP_INIT_I_LOCK(lock, seq);
2194 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2195 __kmp_itt_lock_creating(ilk->lock, loc);
2199 /* initialize the lock with a hint */
2200 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2201 uintptr_t hint) {
2202 KMP_DEBUG_ASSERT(__kmp_init_serial);
2203 if (__kmp_env_consistency_check && user_lock == NULL) {
2204 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2207 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2209 #if OMPT_SUPPORT && OMPT_OPTIONAL
2210 // This is the case if called from omp_init_lock_with_hint:
2211 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2212 if (!codeptr)
2213 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2214 if (ompt_enabled.ompt_callback_lock_init) {
2215 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2216 ompt_mutex_lock, (omp_lock_hint_t)hint,
2217 __ompt_get_mutex_impl_type(user_lock),
2218 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2223 /* initialize the nested lock with a hint */
2224 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2225 void **user_lock, uintptr_t hint) {
2226 KMP_DEBUG_ASSERT(__kmp_init_serial);
2227 if (__kmp_env_consistency_check && user_lock == NULL) {
2228 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2231 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2233 #if OMPT_SUPPORT && OMPT_OPTIONAL
2234 // This is the case if called from omp_init_lock_with_hint:
2235 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2236 if (!codeptr)
2237 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2238 if (ompt_enabled.ompt_callback_lock_init) {
2239 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2240 ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2241 __ompt_get_mutex_impl_type(user_lock),
2242 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2247 #endif // KMP_USE_DYNAMIC_LOCK
2249 /* initialize the lock */
2250 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2251 #if KMP_USE_DYNAMIC_LOCK
2253 KMP_DEBUG_ASSERT(__kmp_init_serial);
2254 if (__kmp_env_consistency_check && user_lock == NULL) {
2255 KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2257 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2259 #if OMPT_SUPPORT && OMPT_OPTIONAL
2260 // This is the case if called from omp_init_lock_with_hint:
2261 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2262 if (!codeptr)
2263 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2264 if (ompt_enabled.ompt_callback_lock_init) {
2265 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2266 ompt_mutex_lock, omp_lock_hint_none,
2267 __ompt_get_mutex_impl_type(user_lock),
2268 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2272 #else // KMP_USE_DYNAMIC_LOCK
2274 static char const *const func = "omp_init_lock";
2275 kmp_user_lock_p lck;
2276 KMP_DEBUG_ASSERT(__kmp_init_serial);
2278 if (__kmp_env_consistency_check) {
2279 if (user_lock == NULL) {
2280 KMP_FATAL(LockIsUninitialized, func);
2284 KMP_CHECK_USER_LOCK_INIT();
2286 if ((__kmp_user_lock_kind == lk_tas) &&
2287 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2288 lck = (kmp_user_lock_p)user_lock;
2291 else if ((__kmp_user_lock_kind == lk_futex) &&
2292 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2293 lck = (kmp_user_lock_p)user_lock;
2296 else {
2297 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2299 INIT_LOCK(lck);
2300 __kmp_set_user_lock_location(lck, loc);
2302 #if OMPT_SUPPORT && OMPT_OPTIONAL
2303 // This is the case if called from omp_init_lock_with_hint:
2304 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2305 if (!codeptr)
2306 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2307 if (ompt_enabled.ompt_callback_lock_init) {
2308 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2309 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2310 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2315 __kmp_itt_lock_creating(lck);
2316 #endif /* USE_ITT_BUILD */
2318 #endif // KMP_USE_DYNAMIC_LOCK
2319 } // __kmpc_init_lock
2321 /* initialize the nested lock */
2322 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2323 #if KMP_USE_DYNAMIC_LOCK
2325 KMP_DEBUG_ASSERT(__kmp_init_serial);
2326 if (__kmp_env_consistency_check && user_lock == NULL) {
2327 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2329 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2331 #if OMPT_SUPPORT && OMPT_OPTIONAL
2332 // This is the case if called from omp_init_lock_with_hint:
2333 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2334 if (!codeptr)
2335 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2336 if (ompt_enabled.ompt_callback_lock_init) {
2337 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2338 ompt_mutex_nest_lock, omp_lock_hint_none,
2339 __ompt_get_mutex_impl_type(user_lock),
2340 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2344 #else // KMP_USE_DYNAMIC_LOCK
2346 static char const *const func = "omp_init_nest_lock";
2347 kmp_user_lock_p lck;
2348 KMP_DEBUG_ASSERT(__kmp_init_serial);
2350 if (__kmp_env_consistency_check) {
2351 if (user_lock == NULL) {
2352 KMP_FATAL(LockIsUninitialized, func);
2356 KMP_CHECK_USER_LOCK_INIT();
2358 if ((__kmp_user_lock_kind == lk_tas) &&
2359 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2360 OMP_NEST_LOCK_T_SIZE)) {
2361 lck = (kmp_user_lock_p)user_lock;
2364 else if ((__kmp_user_lock_kind == lk_futex) &&
2365 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2366 OMP_NEST_LOCK_T_SIZE)) {
2367 lck = (kmp_user_lock_p)user_lock;
2370 else {
2371 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2374 INIT_NESTED_LOCK(lck);
2375 __kmp_set_user_lock_location(lck, loc);
2377 #if OMPT_SUPPORT && OMPT_OPTIONAL
2378 // This is the case if called from omp_init_lock_with_hint:
2379 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2380 if (!codeptr)
2381 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2382 if (ompt_enabled.ompt_callback_lock_init) {
2383 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2384 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2385 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2390 __kmp_itt_lock_creating(lck);
2391 #endif /* USE_ITT_BUILD */
2393 #endif // KMP_USE_DYNAMIC_LOCK
2394 } // __kmpc_init_nest_lock
2395 /* destroy the lock */
2396 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2397 #if KMP_USE_DYNAMIC_LOCK
2400 kmp_user_lock_p lck;
2401 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2402 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2403 } else {
2404 lck = (kmp_user_lock_p)user_lock;
2406 __kmp_itt_lock_destroyed(lck);
2408 #if OMPT_SUPPORT && OMPT_OPTIONAL
2409 // This is the case if called from omp_init_lock_with_hint:
2410 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2411 if (!codeptr)
2412 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2413 if (ompt_enabled.ompt_callback_lock_destroy) {
2414 kmp_user_lock_p lck;
2415 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2416 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2417 } else {
2418 lck = (kmp_user_lock_p)user_lock;
2420 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2421 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2424 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2425 #else // KMP_USE_DYNAMIC_LOCK
2426 kmp_user_lock_p lck;
2428 if ((__kmp_user_lock_kind == lk_tas) &&
2429 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2430 lck = (kmp_user_lock_p)user_lock;
2433 else if ((__kmp_user_lock_kind == lk_futex) &&
2434 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2435 lck = (kmp_user_lock_p)user_lock;
2438 else {
2439 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2442 #if OMPT_SUPPORT && OMPT_OPTIONAL
2443 // This is the case if called from omp_init_lock_with_hint:
2444 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2445 if (!codeptr)
2446 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2447 if (ompt_enabled.ompt_callback_lock_destroy) {
2448 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2449 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2454 __kmp_itt_lock_destroyed(lck);
2455 #endif /* USE_ITT_BUILD */
2456 DESTROY_LOCK(lck);
2458 if ((__kmp_user_lock_kind == lk_tas) &&
2459 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2463 else if ((__kmp_user_lock_kind == lk_futex) &&
2464 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2468 else {
2469 __kmp_user_lock_free(user_lock, gtid, lck);
2471 #endif // KMP_USE_DYNAMIC_LOCK
2472 } // __kmpc_destroy_lock
2474 /* destroy the nested lock */
2475 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2476 #if KMP_USE_DYNAMIC_LOCK
2479 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2480 __kmp_itt_lock_destroyed(ilk->lock);
2482 #if OMPT_SUPPORT && OMPT_OPTIONAL
2483 // This is the case if called from omp_init_lock_with_hint:
2484 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2485 if (!codeptr)
2486 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2487 if (ompt_enabled.ompt_callback_lock_destroy) {
2488 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2489 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2492 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2494 #else // KMP_USE_DYNAMIC_LOCK
2496 kmp_user_lock_p lck;
2498 if ((__kmp_user_lock_kind == lk_tas) &&
2499 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2500 OMP_NEST_LOCK_T_SIZE)) {
2501 lck = (kmp_user_lock_p)user_lock;
2504 else if ((__kmp_user_lock_kind == lk_futex) &&
2505 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2506 OMP_NEST_LOCK_T_SIZE)) {
2507 lck = (kmp_user_lock_p)user_lock;
2510 else {
2511 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2514 #if OMPT_SUPPORT && OMPT_OPTIONAL
2515 // This is the case if called from omp_init_lock_with_hint:
2516 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2517 if (!codeptr)
2518 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2519 if (ompt_enabled.ompt_callback_lock_destroy) {
2520 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2521 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2526 __kmp_itt_lock_destroyed(lck);
2527 #endif /* USE_ITT_BUILD */
2529 DESTROY_NESTED_LOCK(lck);
2531 if ((__kmp_user_lock_kind == lk_tas) &&
2532 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2533 OMP_NEST_LOCK_T_SIZE)) {
2537 else if ((__kmp_user_lock_kind == lk_futex) &&
2538 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2539 OMP_NEST_LOCK_T_SIZE)) {
2543 else {
2544 __kmp_user_lock_free(user_lock, gtid, lck);
2546 #endif // KMP_USE_DYNAMIC_LOCK
2547 } // __kmpc_destroy_nest_lock
2549 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2550 KMP_COUNT_BLOCK(OMP_set_lock);
2551 #if KMP_USE_DYNAMIC_LOCK
2552 int tag = KMP_EXTRACT_D_TAG(user_lock);
2554 __kmp_itt_lock_acquiring(
2555 (kmp_user_lock_p)
2556 user_lock); // itt function will get to the right lock object.
2558 #if OMPT_SUPPORT && OMPT_OPTIONAL
2559 // This is the case if called from omp_init_lock_with_hint:
2560 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2561 if (!codeptr)
2562 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2563 if (ompt_enabled.ompt_callback_mutex_acquire) {
2564 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2565 ompt_mutex_lock, omp_lock_hint_none,
2566 __ompt_get_mutex_impl_type(user_lock),
2567 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2570 #if KMP_USE_INLINED_TAS
2571 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2572 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2574 #elif KMP_USE_INLINED_FUTEX
2575 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2576 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2577 } else
2578 #endif
2579 {
2580 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2583 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2585 #if OMPT_SUPPORT && OMPT_OPTIONAL
2586 if (ompt_enabled.ompt_callback_mutex_acquired) {
2587 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2588 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2592 #else // KMP_USE_DYNAMIC_LOCK
2594 kmp_user_lock_p lck;
2596 if ((__kmp_user_lock_kind == lk_tas) &&
2597 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2598 lck = (kmp_user_lock_p)user_lock;
2601 else if ((__kmp_user_lock_kind == lk_futex) &&
2602 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2603 lck = (kmp_user_lock_p)user_lock;
2606 else {
2607 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2611 __kmp_itt_lock_acquiring(lck);
2612 #endif /* USE_ITT_BUILD */
2613 #if OMPT_SUPPORT && OMPT_OPTIONAL
2614 // This is the case if called from omp_init_lock_with_hint:
2615 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2616 if (!codeptr)
2617 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2618 if (ompt_enabled.ompt_callback_mutex_acquire) {
2619 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2620 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2621 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2625 ACQUIRE_LOCK(lck, gtid);
2628 __kmp_itt_lock_acquired(lck);
2629 #endif /* USE_ITT_BUILD */
2631 #if OMPT_SUPPORT && OMPT_OPTIONAL
2632 if (ompt_enabled.ompt_callback_mutex_acquired) {
2633 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2634 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2638 #endif // KMP_USE_DYNAMIC_LOCK
2641 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2642 #if KMP_USE_DYNAMIC_LOCK
2645 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2647 #if OMPT_SUPPORT && OMPT_OPTIONAL
2648 // This is the case if called from omp_init_lock_with_hint:
2649 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2650 if (!codeptr)
2651 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2652 if (ompt_enabled.enabled) {
2653 if (ompt_enabled.ompt_callback_mutex_acquire) {
2654 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2655 ompt_mutex_nest_lock, omp_lock_hint_none,
2656 __ompt_get_mutex_impl_type(user_lock),
2657 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2661 int acquire_status =
2662 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2663 (void) acquire_status;
2665 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2668 #if OMPT_SUPPORT && OMPT_OPTIONAL
2669 if (ompt_enabled.enabled) {
2670 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2671 if (ompt_enabled.ompt_callback_mutex_acquired) {
2673 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2674 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2675 codeptr);
2677 } else {
2678 if (ompt_enabled.ompt_callback_nest_lock) {
2680 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2681 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2687 #else // KMP_USE_DYNAMIC_LOCK
2689 kmp_user_lock_p lck;
2691 if ((__kmp_user_lock_kind == lk_tas) &&
2692 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2693 OMP_NEST_LOCK_T_SIZE)) {
2694 lck = (kmp_user_lock_p)user_lock;
2697 else if ((__kmp_user_lock_kind == lk_futex) &&
2698 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2699 OMP_NEST_LOCK_T_SIZE)) {
2700 lck = (kmp_user_lock_p)user_lock;
2703 else {
2704 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2708 __kmp_itt_lock_acquiring(lck);
2709 #endif /* USE_ITT_BUILD */
2710 #if OMPT_SUPPORT && OMPT_OPTIONAL
2711 // This is the case if called from omp_init_lock_with_hint:
2712 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2713 if (!codeptr)
2714 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2715 if (ompt_enabled.enabled) {
2716 if (ompt_enabled.ompt_callback_mutex_acquire) {
2717 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2718 ompt_mutex_nest_lock, omp_lock_hint_none,
2719 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2720 codeptr);
2724 int acquire_status;
2725 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2728 __kmp_itt_lock_acquired(lck);
2729 #endif /* USE_ITT_BUILD */
2731 #if OMPT_SUPPORT && OMPT_OPTIONAL
2732 if (ompt_enabled.enabled) {
2733 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2734 if (ompt_enabled.ompt_callback_mutex_acquired) {
2736 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2737 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2739 } else {
2740 if (ompt_enabled.ompt_callback_nest_lock) {
2742 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2743 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2749 #endif // KMP_USE_DYNAMIC_LOCK
2752 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2753 #if KMP_USE_DYNAMIC_LOCK
2755 int tag = KMP_EXTRACT_D_TAG(user_lock);
2757 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2759 #if KMP_USE_INLINED_TAS
2760 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2761 KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2763 #elif KMP_USE_INLINED_FUTEX
2764 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2765 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2766 } else
2767 #endif
2768 {
2769 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2772 #if OMPT_SUPPORT && OMPT_OPTIONAL
2773 // This is the case if called from omp_init_lock_with_hint:
2774 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2775 if (!codeptr)
2776 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2777 if (ompt_enabled.ompt_callback_mutex_released) {
2778 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2779 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2783 #else // KMP_USE_DYNAMIC_LOCK
2785 kmp_user_lock_p lck;
2787 /* Can't use serial interval since not block structured */
2788 /* release the lock */
2790 if ((__kmp_user_lock_kind == lk_tas) &&
2791 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2792 #if KMP_OS_LINUX && \
2793 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2794 // "fast" path implemented to fix customer performance issue
2796 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2797 #endif /* USE_ITT_BUILD */
2798 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2799 KMP_MB();
2801 #if OMPT_SUPPORT && OMPT_OPTIONAL
2802 // This is the case if called from omp_init_lock_with_hint:
2803 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2804 if (!codeptr)
2805 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2806 if (ompt_enabled.ompt_callback_mutex_released) {
2807 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2808 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2812 return;
2813 #else
2814 lck = (kmp_user_lock_p)user_lock;
2818 else if ((__kmp_user_lock_kind == lk_futex) &&
2819 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2820 lck = (kmp_user_lock_p)user_lock;
2823 else {
2824 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2828 __kmp_itt_lock_releasing(lck);
2829 #endif /* USE_ITT_BUILD */
2831 RELEASE_LOCK(lck, gtid);
2833 #if OMPT_SUPPORT && OMPT_OPTIONAL
2834 // This is the case if called from omp_init_lock_with_hint:
2835 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2836 if (!codeptr)
2837 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2838 if (ompt_enabled.ompt_callback_mutex_released) {
2839 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2840 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2844 #endif // KMP_USE_DYNAMIC_LOCK
2847 /* release the nested lock */
2848 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2849 #if KMP_USE_DYNAMIC_LOCK
2852 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2854 int release_status =
2855 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2856 (void) release_status;
2858 #if OMPT_SUPPORT && OMPT_OPTIONAL
2859 // This is the case if called from omp_init_lock_with_hint:
2860 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2861 if (!codeptr)
2862 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2863 if (ompt_enabled.enabled) {
2864 if (release_status == KMP_LOCK_RELEASED) {
2865 if (ompt_enabled.ompt_callback_mutex_released) {
2866 // release_lock_last
2867 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2868 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2869 codeptr);
2871 } else if (ompt_enabled.ompt_callback_nest_lock) {
2872 // release_lock_prev
2873 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2874 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2879 #else // KMP_USE_DYNAMIC_LOCK
2881 kmp_user_lock_p lck;
2883 /* Can't use serial interval since not block structured */
2885 if ((__kmp_user_lock_kind == lk_tas) &&
2886 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2887 OMP_NEST_LOCK_T_SIZE)) {
2888 #if KMP_OS_LINUX && \
2889 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2890 // "fast" path implemented to fix customer performance issue
2891 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2893 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2894 #endif /* USE_ITT_BUILD */
2896 #if OMPT_SUPPORT && OMPT_OPTIONAL
2897 int release_status = KMP_LOCK_STILL_HELD;
2900 if (--(tl->lk.depth_locked) == 0) {
2901 TCW_4(tl->lk.poll, 0);
2902 #if OMPT_SUPPORT && OMPT_OPTIONAL
2903 release_status = KMP_LOCK_RELEASED;
2908 #if OMPT_SUPPORT && OMPT_OPTIONAL
2909 // This is the case if called from omp_init_lock_with_hint:
2910 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2911 if (!codeptr)
2912 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2913 if (ompt_enabled.enabled) {
2914 if (release_status == KMP_LOCK_RELEASED) {
2915 if (ompt_enabled.ompt_callback_mutex_released) {
2916 // release_lock_last
2917 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2918 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2920 } else if (ompt_enabled.ompt_callback_nest_lock) {
2921 // release_lock_previous
2922 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2923 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2928 return;
2929 #else
2930 lck = (kmp_user_lock_p)user_lock;
2934 else if ((__kmp_user_lock_kind == lk_futex) &&
2935 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2936 OMP_NEST_LOCK_T_SIZE)) {
2937 lck = (kmp_user_lock_p)user_lock;
2940 else {
2941 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2945 __kmp_itt_lock_releasing(lck);
2946 #endif /* USE_ITT_BUILD */
2948 int release_status;
2949 release_status = RELEASE_NESTED_LOCK(lck, gtid);
2950 #if OMPT_SUPPORT && OMPT_OPTIONAL
2951 // This is the case if called from omp_init_lock_with_hint:
2952 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2953 if (!codeptr)
2954 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2955 if (ompt_enabled.enabled) {
2956 if (release_status == KMP_LOCK_RELEASED) {
2957 if (ompt_enabled.ompt_callback_mutex_released) {
2958 // release_lock_last
2959 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2960 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2962 } else if (ompt_enabled.ompt_callback_nest_lock) {
2963 // release_lock_previous
2964 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2965 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2970 #endif // KMP_USE_DYNAMIC_LOCK
2973 /* try to acquire the lock */
2974 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2975 KMP_COUNT_BLOCK(OMP_test_lock);
2977 #if KMP_USE_DYNAMIC_LOCK
2978 int rc;
2979 int tag = KMP_EXTRACT_D_TAG(user_lock);
2981 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2983 #if OMPT_SUPPORT && OMPT_OPTIONAL
2984 // This is the case if called from omp_init_lock_with_hint:
2985 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2986 if (!codeptr)
2987 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2988 if (ompt_enabled.ompt_callback_mutex_acquire) {
2989 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2990 ompt_mutex_lock, omp_lock_hint_none,
2991 __ompt_get_mutex_impl_type(user_lock),
2992 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2995 #if KMP_USE_INLINED_TAS
2996 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2997 KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2999 #elif KMP_USE_INLINED_FUTEX
3000 if (tag == locktag_futex && !__kmp_env_consistency_check) {
3001 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3002 } else
3003 #endif
3004 {
3005 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3007 if (rc) {
3009 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3011 #if OMPT_SUPPORT && OMPT_OPTIONAL
3012 if (ompt_enabled.ompt_callback_mutex_acquired) {
3013 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3014 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3017 return FTN_TRUE;
3018 } else {
3020 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3022 return FTN_FALSE;
3025 #else // KMP_USE_DYNAMIC_LOCK
3027 kmp_user_lock_p lck;
3030 if ((__kmp_user_lock_kind == lk_tas) &&
3031 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3032 lck = (kmp_user_lock_p)user_lock;
3035 else if ((__kmp_user_lock_kind == lk_futex) &&
3036 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3037 lck = (kmp_user_lock_p)user_lock;
3040 else {
3041 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3045 __kmp_itt_lock_acquiring(lck);
3046 #endif /* USE_ITT_BUILD */
3047 #if OMPT_SUPPORT && OMPT_OPTIONAL
3048 // This is the case if called from omp_init_lock_with_hint:
3049 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3050 if (!codeptr)
3051 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3052 if (ompt_enabled.ompt_callback_mutex_acquire) {
3053 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3054 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3055 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3059 rc = TEST_LOCK(lck, gtid);
3061 if (rc) {
3062 __kmp_itt_lock_acquired(lck);
3063 } else {
3064 __kmp_itt_lock_cancelled(lck);
3066 #endif /* USE_ITT_BUILD */
3067 #if OMPT_SUPPORT && OMPT_OPTIONAL
3068 if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3069 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3070 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3074 return (rc ? FTN_TRUE : FTN_FALSE);
3076 /* Can't use serial interval since not block structured */
3078 #endif // KMP_USE_DYNAMIC_LOCK
3081 /* try to acquire the nested lock */
3082 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3083 #if KMP_USE_DYNAMIC_LOCK
3084 int rc;
3086 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3088 #if OMPT_SUPPORT && OMPT_OPTIONAL
3089 // This is the case if called from omp_init_lock_with_hint:
3090 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3091 if (!codeptr)
3092 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3093 if (ompt_enabled.ompt_callback_mutex_acquire) {
3094 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3095 ompt_mutex_nest_lock, omp_lock_hint_none,
3096 __ompt_get_mutex_impl_type(user_lock),
3097 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3100 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3102 if (rc) {
3103 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3104 } else {
3105 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3108 #if OMPT_SUPPORT && OMPT_OPTIONAL
3109 if (ompt_enabled.enabled && rc) {
3110 if (rc == 1) {
3111 if (ompt_enabled.ompt_callback_mutex_acquired) {
3113 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3114 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3115 codeptr);
3117 } else {
3118 if (ompt_enabled.ompt_callback_nest_lock) {
3120 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3121 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3128 #else // KMP_USE_DYNAMIC_LOCK
3130 kmp_user_lock_p lck;
3133 if ((__kmp_user_lock_kind == lk_tas) &&
3134 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3135 OMP_NEST_LOCK_T_SIZE)) {
3136 lck = (kmp_user_lock_p)user_lock;
3139 else if ((__kmp_user_lock_kind == lk_futex) &&
3140 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3141 OMP_NEST_LOCK_T_SIZE)) {
3142 lck = (kmp_user_lock_p)user_lock;
3145 else {
3146 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3150 __kmp_itt_lock_acquiring(lck);
3151 #endif /* USE_ITT_BUILD */
3153 #if OMPT_SUPPORT && OMPT_OPTIONAL
3154 // This is the case if called from omp_init_lock_with_hint:
3155 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3156 if (!codeptr)
3157 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3158 if (ompt_enabled.enabled &&
3159 ompt_enabled.ompt_callback_mutex_acquire) {
3160 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3161 ompt_mutex_nest_lock, omp_lock_hint_none,
3162 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3163 codeptr);
3167 rc = TEST_NESTED_LOCK(lck, gtid);
3169 if (rc) {
3170 __kmp_itt_lock_acquired(lck);
3171 } else {
3172 __kmp_itt_lock_cancelled(lck);
3174 #endif /* USE_ITT_BUILD */
3175 #if OMPT_SUPPORT && OMPT_OPTIONAL
3176 if (ompt_enabled.enabled && rc) {
3177 if (rc == 1) {
3178 if (ompt_enabled.ompt_callback_mutex_acquired) {
3180 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3181 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3183 } else {
3184 if (ompt_enabled.ompt_callback_nest_lock) {
3186 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3187 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3194 /* Can't use serial interval since not block structured */
3196 #endif // KMP_USE_DYNAMIC_LOCK
3199 // Interface to fast scalable reduce methods routines
3201 // keep the selected method in a thread local structure for cross-function
3202 // usage: will be used in __kmpc_end_reduce* functions;
3203 // another solution: to re-determine the method one more time in
3204 // __kmpc_end_reduce* functions (new prototype required then)
3205 // AT: which solution is better?
3206 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3207 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3209 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3210 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3212 // description of the packed_reduction_method variable: look at the macros in
3213 // kmp.h
3215 // used in a critical section reduce block
3216 static __forceinline void
3217 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3218 kmp_critical_name *crit) {
3220 // this lock was visible to a customer and to the threading profile tool as a
3221 // serial overhead span (although it's used for an internal purpose only)
3222 // why was it visible in previous implementation?
3223 // should we keep it visible in new reduce block?
3224 kmp_user_lock_p lck;
3226 #if KMP_USE_DYNAMIC_LOCK
3228 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3229 // Check if it is initialized.
3230 if (*lk == 0) {
3231 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3232 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3233 KMP_GET_D_TAG(__kmp_user_lock_seq));
3234 } else {
3235 __kmp_init_indirect_csptr(crit, loc, global_tid,
3236 KMP_GET_I_TAG(__kmp_user_lock_seq));
3239 // Branch for accessing the actual lock object and set operation. This
3240 // branching is inevitable since this lock initialization does not follow the
3241 // normal dispatch path (lock table is not used).
3242 if (KMP_EXTRACT_D_TAG(lk) != 0) {
3243 lck = (kmp_user_lock_p)lk;
3244 KMP_DEBUG_ASSERT(lck != NULL);
3245 if (__kmp_env_consistency_check) {
3246 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3248 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3249 } else {
3250 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3251 lck = ilk->lock;
3252 KMP_DEBUG_ASSERT(lck != NULL);
3253 if (__kmp_env_consistency_check) {
3254 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3256 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3259 #else // KMP_USE_DYNAMIC_LOCK
3261 // We know that the fast reduction code is only emitted by Intel compilers
3262 // with 32 byte critical sections. If there isn't enough space, then we
3263 // have to use a pointer.
3264 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3265 lck = (kmp_user_lock_p)crit;
3267 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3269 KMP_DEBUG_ASSERT(lck != NULL);
3271 if (__kmp_env_consistency_check)
3272 __kmp_push_sync(global_tid, ct_critical, loc, lck);
3274 __kmp_acquire_user_lock_with_checks(lck, global_tid);
3276 #endif // KMP_USE_DYNAMIC_LOCK
3279 // used in a critical section reduce block
3280 static __forceinline void
3281 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3282 kmp_critical_name *crit) {
3284 kmp_user_lock_p lck;
3286 #if KMP_USE_DYNAMIC_LOCK
3288 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3289 lck = (kmp_user_lock_p)crit;
3290 if (__kmp_env_consistency_check)
3291 __kmp_pop_sync(global_tid, ct_critical, loc);
3292 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3293 } else {
3294 kmp_indirect_lock_t *ilk =
3295 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3296 if (__kmp_env_consistency_check)
3297 __kmp_pop_sync(global_tid, ct_critical, loc);
3298 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3301 #else // KMP_USE_DYNAMIC_LOCK
3303 // We know that the fast reduction code is only emitted by Intel compilers
3304 // with 32 byte critical sections. If there isn't enough space, then we have
3305 // to use a pointer.
3306 if (__kmp_base_user_lock_size > 32) {
3307 lck = *((kmp_user_lock_p *)crit);
3308 KMP_ASSERT(lck != NULL);
3309 } else {
3310 lck = (kmp_user_lock_p)crit;
3313 if (__kmp_env_consistency_check)
3314 __kmp_pop_sync(global_tid, ct_critical, loc);
3316 __kmp_release_user_lock_with_checks(lck, global_tid);
3318 #endif // KMP_USE_DYNAMIC_LOCK
3319 } // __kmp_end_critical_section_reduce_block
3321 static __forceinline int
3322 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3323 int *task_state) {
3324 kmp_team_t *team;
3326 // Check if we are inside the teams construct.
3327 if (th->th.th_teams_microtask) {
3328 *team_p = team = th->th.th_team;
3329 if (team->t.t_level == th->th.th_teams_level) {
3330 // This is reduction at teams construct.
3331 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3332 // Let's swap teams temporarily for the reduction.
3333 th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3334 th->th.th_team = team->t.t_parent;
3335 th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3336 th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3337 *task_state = th->th.th_task_state;
3338 th->th.th_task_state = 0;
3340 return 1;
3343 return 0;
3346 static __forceinline void
3347 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3348 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3349 th->th.th_info.ds.ds_tid = 0;
3350 th->th.th_team = team;
3351 th->th.th_team_nproc = team->t.t_nproc;
3352 th->th.th_task_team = team->t.t_task_team[task_state];
3353 th->th.th_task_state = task_state;
3356 /* 2.a.i. Reduce Block without a terminating barrier */
3358 @ingroup SYNCHRONIZATION
3359 @param loc source location information
3360 @param global_tid global thread number
3361 @param num_vars number of items (variables) to be reduced
3362 @param reduce_size size of data in bytes to be reduced
3363 @param reduce_data pointer to data to be reduced
3364 @param reduce_func callback function providing reduction operation on two
3365 operands and returning result of reduction in lhs_data
3366 @param lck pointer to the unique lock data structure
3367 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3368 threads if atomic reduction needed
3370 The nowait version is used for a reduce clause with the nowait argument.
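For example (a hypothetical sketch, not part of the library), a reduce_func
for a sum reduction over a single int would combine the right-hand data into
the left-hand buffer:
@code
static void my_reduce_func(void *lhs_data, void *rhs_data) {
  *(int *)lhs_data += *(int *)rhs_data; // result must be left in lhs_data
}
@endcode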
3372 kmp_int32
3373 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3374 size_t reduce_size, void *reduce_data,
3375 void (*reduce_func)(void *lhs_data, void *rhs_data),
3376 kmp_critical_name *lck) {
3378 KMP_COUNT_BLOCK(REDUCE_nowait);
3379 int retval = 0;
3380 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3381 kmp_info_t *th;
3382 kmp_team_t *team;
3383 int teams_swapped = 0, task_state;
3384 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3386 // why do we need this initialization here at all?
3387 // Reduction clause can not be used as a stand-alone directive.
3389 // do not call __kmp_serial_initialize(), it will be called by
3390 // __kmp_parallel_initialize() if needed
3391 // possible detection of false-positive race by the threadchecker ???
3392 if (!TCR_4(__kmp_init_parallel))
3393 __kmp_parallel_initialize();
3395 __kmp_resume_if_soft_paused();
3397 // check correctness of reduce block nesting
3398 #if KMP_USE_DYNAMIC_LOCK
3399 if (__kmp_env_consistency_check)
3400 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3402 if (__kmp_env_consistency_check)
3403 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3406 th = __kmp_thread_from_gtid(global_tid);
3407 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3409 // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3410 // the value should be kept in a variable
3411 // the variable should be either a construct-specific or thread-specific
3412 // property, not a team specific property
3413 // (a thread can reach the next reduce block on the next construct, reduce
3414 // method may differ on the next construct)
3415 // an ident_t "loc" parameter could be used as a construct-specific property
3416 // (what if loc == 0?)
3417 // (if both construct-specific and team-specific variables were shared,
3418 // then unnecessary extra syncs would be needed)
3419 // a thread-specific variable is better regarding two issues above (next
3420 // construct and extra syncs)
3421 // a thread-specific "th_local.reduction_method" variable is used currently
3422 // each thread executes 'determine' and 'set' lines (no need to execute by one
3423 // thread, to avoid unnecessary extra syncs)
3425 packed_reduction_method = __kmp_determine_reduction_method(
3426 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3427 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3429 OMPT_REDUCTION_DECL(th, global_tid);
3430 if (packed_reduction_method == critical_reduce_block) {
3432 OMPT_REDUCTION_BEGIN;
3434 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3435 retval = 1;
3437 } else if (packed_reduction_method == empty_reduce_block) {
3439 OMPT_REDUCTION_BEGIN;
3441 // usage: if team size == 1, no synchronization is required ( Intel
3442 // platforms only )
3443 retval = 1;
3444 OMPT_REDUCTION_END;
3445 } else if (packed_reduction_method == atomic_reduce_block) {
3447 retval = 2;
3449 // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3450 // won't be called by the code gen)
3451 // (it's not quite good, because the checking block has been closed by
3452 // this call,
3453 // but the atomic operation has not been executed yet; it will be executed
3454 // slightly later, literally on the next instruction)
3455 if (__kmp_env_consistency_check)
3456 __kmp_pop_sync(global_tid, ct_reduce, loc);
3458 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3459 tree_reduce_block)) {
3461 // AT: performance issue: a real barrier here
3462 // AT: (if master goes slow, other threads are blocked here waiting for the
3463 // master to come and release them)
3464 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3465 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3466 // be confusing to a customer)
3467 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3468 // might go faster and be more in line with the sense of NOWAIT
3469 // AT: TO DO: do epcc test and compare times
3471 // this barrier should be invisible to a customer and to the threading profile
3472 // tool (it's neither a terminating barrier nor customer's code, it's
3473 // used for an internal purpose)
3475 // JP: can this barrier potentially lead to task scheduling?
3476 // JP: as long as there is a barrier in the implementation, OMPT should and
3477 // will provide the barrier events
3478 // so we set up the necessary frame/return addresses.
3479 ompt_frame_t *ompt_frame;
3480 if (ompt_enabled.enabled) {
3481 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3482 if (ompt_frame->enter_frame.ptr == NULL)
3483 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3484 OMPT_STORE_RETURN_ADDRESS(global_tid);
3488 __kmp_threads[global_tid]->th.th_ident = loc;
3490 retval =
3491 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3492 global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3493 retval = (retval != 0) ? (0) : (1);
3494 #if OMPT_SUPPORT && OMPT_OPTIONAL
3495 if (ompt_enabled.enabled) {
3496 ompt_frame->enter_frame = ompt_data_none;
3500 // all other workers except master should do this pop here
3501 // ( none of other workers will get to __kmpc_end_reduce_nowait() )
3502 if (__kmp_env_consistency_check) {
3503 if (retval == 0) {
3504 __kmp_pop_sync(global_tid, ct_reduce, loc);
3508 } else {
3510 // should never reach this block
3511 KMP_ASSERT(0); // "unexpected method"
3513 if (teams_swapped) {
3514 __kmp_restore_swapped_teams(th, team, task_state);
3518 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3519 global_tid, packed_reduction_method, retval));
3525 @ingroup SYNCHRONIZATION
3526 @param loc source location information
3527 @param global_tid global thread id.
3528 @param lck pointer to the unique lock data structure
3530 Finish the execution of a reduce nowait.
3532 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3533 kmp_critical_name *lck) {
3535 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3537 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3539 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3541 OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3543 if (packed_reduction_method == critical_reduce_block) {
3545 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3546 OMPT_REDUCTION_END;
3548 } else if (packed_reduction_method == empty_reduce_block) {
3550 // usage: if team size == 1, no synchronization is required ( on Intel
3551 // platforms only )
3553 OMPT_REDUCTION_END;
3555 } else if (packed_reduction_method == atomic_reduce_block) {
3557 // neither master nor other workers should get here
3558 // (code gen does not generate this call in case 2: atomic reduce block)
3559 // actually it's better to remove this else-if entirely;
3560 // after removal this value will be checked by the 'else' and will assert
3562 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3563 tree_reduce_block)) {
3565 // only master gets here
3566 // OMPT: tree reduction is annotated in the barrier code
3568 } else {
3570 // should never reach this block
3571 KMP_ASSERT(0); // "unexpected method"
3574 if (__kmp_env_consistency_check)
3575 __kmp_pop_sync(global_tid, ct_reduce, loc);
3577 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3578 global_tid, packed_reduction_method));
3583 /* 2.a.ii. Reduce Block with a terminating barrier */
3586 @ingroup SYNCHRONIZATION
3587 @param loc source location information
3588 @param global_tid global thread number
3589 @param num_vars number of items (variables) to be reduced
3590 @param reduce_size size of data in bytes to be reduced
3591 @param reduce_data pointer to data to be reduced
3592 @param reduce_func callback function providing reduction operation on two
3593 operands and returning result of reduction in lhs_data
3594 @param lck pointer to the unique lock data structure
3595 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3596 threads if atomic reduction needed
3598 A blocking reduce that includes an implicit barrier.
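A hypothetical codegen sketch for a sum reduction, showing how the three
documented return values drive the call protocol (the names sum, my_sum, crit
and my_reduce_func are assumptions, not runtime symbols):
@code
extern int sum;                 // shared reduction variable (assumed)
static kmp_critical_name crit = {0};
int my_sum = 0;                 // thread-private partial sum from the loop body
switch (__kmpc_reduce(loc, gtid, 1, sizeof(int), &my_sum, my_reduce_func,
                      &crit)) {
case 1: // combine into the shared variable, then close the reduction
  sum += my_sum;
  __kmpc_end_reduce(loc, gtid, &crit);
  break;
case 2: // atomic method selected: contribute with an atomic update
  __sync_fetch_and_add(&sum, my_sum); // GCC-style atomic builtin
  break;
default: // 0: nothing further to do on this thread
  break;
}
@endcode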
3600 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3601 size_t reduce_size, void *reduce_data,
3602 void (*reduce_func)(void *lhs_data, void *rhs_data),
3603 kmp_critical_name *lck) {
3604 KMP_COUNT_BLOCK(REDUCE_wait);
3605 int retval = 0;
3606 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3607 kmp_info_t *th;
3608 kmp_team_t *team;
3609 int teams_swapped = 0, task_state;
3611 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3613 // why do we need this initialization here at all?
3614 // Reduction clause can not be a stand-alone directive.
3616 // do not call __kmp_serial_initialize(), it will be called by
3617 // __kmp_parallel_initialize() if needed
3618 // possible detection of false-positive race by the threadchecker ???
3619 if (!TCR_4(__kmp_init_parallel))
3620 __kmp_parallel_initialize();
3622 __kmp_resume_if_soft_paused();
3624 // check correctness of reduce block nesting
3625 #if KMP_USE_DYNAMIC_LOCK
3626 if (__kmp_env_consistency_check)
3627 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3629 if (__kmp_env_consistency_check)
3630 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3633 th = __kmp_thread_from_gtid(global_tid);
3634 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3636 packed_reduction_method = __kmp_determine_reduction_method(
3637 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3638 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3640 OMPT_REDUCTION_DECL(th, global_tid);
3642 if (packed_reduction_method == critical_reduce_block) {
3644 OMPT_REDUCTION_BEGIN;
3645 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3646 retval = 1;
3648 } else if (packed_reduction_method == empty_reduce_block) {
3650 OMPT_REDUCTION_BEGIN;
3651 // usage: if team size == 1, no synchronization is required ( Intel
3652 // platforms only )
3653 retval = 1;
3654 OMPT_REDUCTION_END;
3655 } else if (packed_reduction_method == atomic_reduce_block) {
3657 retval = 2;
3659 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3660 tree_reduce_block)) {
3662 // case tree_reduce_block:
3663 // this barrier should be visible to a customer and to the threading profile
3664 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3666 ompt_frame_t *ompt_frame;
3667 if (ompt_enabled.enabled) {
3668 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3669 if (ompt_frame->enter_frame.ptr == NULL)
3670 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3671 OMPT_STORE_RETURN_ADDRESS(global_tid);
3675 __kmp_threads[global_tid]->th.th_ident =
3676 loc; // needed for correct notification of frames
3678 retval =
3679 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3680 global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3681 retval = (retval != 0) ? (0) : (1);
3682 #if OMPT_SUPPORT && OMPT_OPTIONAL
3683 if (ompt_enabled.enabled) {
3684 ompt_frame->enter_frame = ompt_data_none;
3688 // all other workers except master should do this pop here
3689 // ( none of other workers except master will enter __kmpc_end_reduce() )
3690 if (__kmp_env_consistency_check) {
3691 if (retval == 0) { // 0: all other workers; 1: master
3692 __kmp_pop_sync(global_tid, ct_reduce, loc);
3696 } else {
3698 // should never reach this block
3699 KMP_ASSERT(0); // "unexpected method"
3701 if (teams_swapped) {
3702 __kmp_restore_swapped_teams(th, team, task_state);
3706 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3707 global_tid, packed_reduction_method, retval));
3712 @ingroup SYNCHRONIZATION
3713 @param loc source location information
3714 @param global_tid global thread id.
3715 @param lck pointer to the unique lock data structure
3717 Finish the execution of a blocking reduce.
3718 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3719 start function.
3721 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3722 kmp_critical_name *lck) {
3724 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3725 kmp_info_t *th;
3726 kmp_team_t *team;
3727 int teams_swapped = 0, task_state;
3729 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3731 th = __kmp_thread_from_gtid(global_tid);
3732 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3734 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3736 // this barrier should be visible to a customer and to the threading profile
3737 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3738 OMPT_REDUCTION_DECL(th, global_tid);
3740 if (packed_reduction_method == critical_reduce_block) {
3741 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3743 OMPT_REDUCTION_END;
3745 // TODO: implicit barrier: should be exposed
3747 ompt_frame_t *ompt_frame;
3748 if (ompt_enabled.enabled) {
3749 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3750 if (ompt_frame->enter_frame.ptr == NULL)
3751 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3752 OMPT_STORE_RETURN_ADDRESS(global_tid);
3756 __kmp_threads[global_tid]->th.th_ident = loc;
3758 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3759 #if OMPT_SUPPORT && OMPT_OPTIONAL
3760 if (ompt_enabled.enabled) {
3761 ompt_frame->enter_frame = ompt_data_none;
3765 } else if (packed_reduction_method == empty_reduce_block) {
3767 OMPT_REDUCTION_END;
3769 // usage: if team size==1, no synchronization is required (Intel platforms only)
3771 // TODO: implicit barrier: should be exposed
3773 ompt_frame_t *ompt_frame;
3774 if (ompt_enabled.enabled) {
3775 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3776 if (ompt_frame->enter_frame.ptr == NULL)
3777 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3778 OMPT_STORE_RETURN_ADDRESS(global_tid);
3782 __kmp_threads[global_tid]->th.th_ident = loc;
3784 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3785 #if OMPT_SUPPORT && OMPT_OPTIONAL
3786 if (ompt_enabled.enabled) {
3787 ompt_frame->enter_frame = ompt_data_none;
3791 } else if (packed_reduction_method == atomic_reduce_block) {
3794 ompt_frame_t *ompt_frame;
3795 if (ompt_enabled.enabled) {
3796 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3797 if (ompt_frame->enter_frame.ptr == NULL)
3798 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3799 OMPT_STORE_RETURN_ADDRESS(global_tid);
3802 // TODO: implicit barrier: should be exposed
3804 __kmp_threads[global_tid]->th.th_ident = loc;
3806 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3807 #if OMPT_SUPPORT && OMPT_OPTIONAL
3808 if (ompt_enabled.enabled) {
3809 ompt_frame->enter_frame = ompt_data_none;
3813 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3814 tree_reduce_block)) {
3816 // only master executes here (master releases all other workers)
3817 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3822 // should never reach this block
3823 KMP_ASSERT(0); // "unexpected method"
3825 if (teams_swapped) {
3826 __kmp_restore_swapped_teams(th, team, task_state);
3829 if (__kmp_env_consistency_check)
3830 __kmp_pop_sync(global_tid, ct_reduce, loc);
3832 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3833 global_tid, packed_reduction_method));
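
// Illustrative sketch (not compiled): roughly the shape a compiler gives to
// "#pragma omp parallel for reduction(+ : sum)", showing how the lck passed to
// __kmpc_end_reduce() must pair with the one passed to __kmpc_reduce(). The
// helper names and the use of __sync_fetch_and_add for the atomic path are
// assumptions for this example, not the codegen of any particular compiler.
#if 0
static kmp_critical_name reduce_lock = {0}; // storage for the critical name

static void add_ints(void *lhs, void *rhs) { // combines two partial results
  *(int *)lhs += *(int *)rhs;
}

static void reduce_tail(ident_t *loc, kmp_int32 gtid, int *sum,
                        int local_sum) {
  switch (__kmpc_reduce(loc, gtid, 1, sizeof(int), &local_sum, add_ints,
                        &reduce_lock)) {
  case 1: // this thread combines into the shared variable
    *sum += local_sum;
    __kmpc_end_reduce(loc, gtid, &reduce_lock); // same lck as __kmpc_reduce()
    break;
  case 2: // atomic method: each thread combines atomically, then ends
    __sync_fetch_and_add(sum, local_sum);
    __kmpc_end_reduce(loc, gtid, &reduce_lock);
    break;
  default: // 0: nothing to do; this thread is released by the combiner
    break;
  }
}
#endif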

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid
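
// Illustrative sketch (not compiled): the two accessors above can be sampled
// by a tool or debugger to relate a thread's current task to its parent; both
// return 0 when the calling thread is unknown to the runtime. The helper below
// is hypothetical and assumes <stdio.h>.
#if 0
static void print_task_lineage(void) {
  kmp_uint64 task_id = __kmpc_get_taskid();
  kmp_uint64 parent_id = __kmpc_get_parent_taskid();
  printf("task %llu (parent %llu)\n", (unsigned long long)task_id,
         (unsigned long long)parent_id);
}
#endif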

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loop bounds.

Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check that the shared buffer is not occupied by another loop (the loop
  // that used buffer idx - __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                 __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
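
// Illustrative sketch (not compiled): the init/wait/post/fini sequence a
// compiler emits for a one-dimensional doacross loop. With the inclusive
// encoding described above, "for (i = 2; i < 9; i += 2)" arrives as lo=2,
// up=8, st=2. The wait for the first iteration's sink dependence (i-2 == 0)
// falls outside [2,8] and is dropped by the out-of-bounds check in
// __kmpc_doacross_wait(). Function and variable names are hypothetical.
#if 0
static void lowered_doacross(ident_t *loc, int gtid) {
  struct kmp_dim dim = {/*lo=*/2, /*up=*/8, /*st=*/2};
  __kmpc_doacross_init(loc, gtid, 1, &dim);
  for (kmp_int64 i = 2; i < 9; i += 2) { // iterations owned by this thread
    kmp_int64 vec = i - 2;
    __kmpc_doacross_wait(loc, gtid, &vec); // ordered depend(sink : i - 2)
    // ... loop body ...
    vec = i;
    __kmpc_doacross_post(loc, gtid, &vec); // ordered depend(source)
  }
  __kmpc_doacross_fini(loc, gtid);
}
#endif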

void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) { // most common case
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}
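
// Worked example of the flag addressing used above: sequential iteration 70
// maps to flag word 70 / 32 == 2 and bit 70 % 32 == 6, so the wait spins on
// (th_doacross_flags[2] & (1 << 6)) and the matching post sets exactly that
// bit.
#if 0
kmp_int64 iter_number = 70;
kmp_int32 shft = iter_number % 32; // == 6, bit position within the word
iter_number >>= 5;                 // == 2, index of the 32-bit flag word
kmp_uint32 flag = 1 << shft;       // mask 0x40
#endif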

void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) { // most common case
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  KMP_MB();
  flag = 1 << shft;
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}

/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
}

void omp_free(void *ptr, omp_allocator_handle_t allocator) {
  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
}
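
// Illustrative sketch (not compiled): user-side view of the two wrappers
// above. omp_default_mem_alloc is one of the predefined allocator handles
// declared in omp.h.
#if 0
double *buf =
    (double *)omp_alloc(1000 * sizeof(double), omp_default_mem_alloc);
if (buf != NULL) {
  // ... use buf ...
  omp_free(buf, omp_default_mem_alloc);
}
#endif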

int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}

int __kmpc_pause_resource(kmp_pause_status_t level) {
  if (!__kmp_init_serial) {
    return 1; // Can't pause if runtime is not initialized
  }
  return __kmp_pause_resource(level);
}
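
// Illustrative sketch (not compiled): a nonzero return reports failure, as in
// the uninitialized-runtime early return above. The enumerator kmp_soft_paused
// is assumed here from kmp.h.
#if 0
if (__kmpc_pause_resource(kmp_soft_paused) != 0) {
  // runtime not initialized, or resources could not be paused
}
#endif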