/*
 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "omp.h" /* extern "C" declarations of user-visible routines */
#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#define MAX_MESSAGE 512
// flags will be used in future, e.g. to implement openmp_strict library
// restrictions

/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc   in   source location information
 * @param flags in   for future use (currently ignored)
 *
 * Initialize the runtime library. This call is optional; if it is not made then
 * it will be implicitly called by attempts to use other library functions.
 */
void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is no-op.
  char *env;
  if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
      __kmp_str_match_true(env)) {
    __kmp_middle_initialize();
    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
  } else if (__kmp_ignore_mppbeg() == FALSE) {
    // By default __kmp_ignore_mppbeg() returns TRUE.
    __kmp_internal_begin();
    KC_TRACE(10, ("__kmpc_begin: called\n"));
  }
}
/*!
 * @ingroup STARTUP_SHUTDOWN
 * @param loc source location information
 *
 * Shutdown the runtime library. This is also optional, and even if called will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
 * zero.
 */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes the __kmpc_end()
  // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    __kmp_internal_end_thread(-1);
  }
#if KMP_OS_WINDOWS && OMPT_SUPPORT
  // Normal exit process on Windows does not allow worker threads of the final
  // parallel region to finish reporting their events, so shutting down the
  // library here fixes the issue at least for the cases where __kmpc_end() is
  // placed properly.
  if (ompt_enabled.enabled)
    __kmp_internal_end_library(__kmp_gtid_get_specific());
#endif
}
/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The global thread index of the active thread.

This function can be called in any context.

If the runtime has only been entered at the outermost level from a
single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
that which would be returned by omp_get_thread_num() in the outermost
active parallel construct. (Or zero if there is no active parallel
construct, since the master thread is necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this
will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
OpenMP thread ids returned by omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  kmp_int32 gtid = __kmp_entry_gtid();

  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));

  return gtid;
}
/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP<sup>*</sup> runtime

This function can be called in any context.
It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime maintains
underlying threads even when they are not active (the cost of creating
and destroying OS threads is high), this call counts all such threads even if
they are not waiting for work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  return TCR_4(__kmp_all_nth);
}
/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active parallel
construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  return __kmp_tid_from_gtid(__kmp_entry_gtid());
}
/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
  return __kmp_entry_thread()->th.th_team->t.t_nproc;
}
/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
 */
kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
#ifndef KMP_DEBUG

  return TRUE;

#else

  const char *semi2;
  const char *semi3;
  int line_no;

  if (__kmp_par_range == 0) {
    return TRUE;
  }
  semi2 = loc->psource;
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2 + 1, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  if (__kmp_par_range_filename[0]) {
    const char *name = semi2 - 1;
    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
      name--;
    }
    if ((*name == '/') || (*name == ';')) {
      name++;
    }
    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
      return __kmp_par_range < 0;
    }
  }
  semi3 = strchr(semi2 + 1, ';');
  if (__kmp_par_range_routine[0]) {
    if ((semi3 != NULL) && (semi3 > semi2) &&
        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
      return __kmp_par_range < 0;
    }
  }
  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
      return __kmp_par_range > 0;
    }
    return __kmp_par_range < 0;
  }
  return TRUE;

#endif /* KMP_DEBUG */
}
/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return 1 if this thread is executing inside an active parallel region, zero if
not.
*/
kmp_int32 __kmpc_in_parallel(ident_t *loc) {
  return __kmp_entry_thread()->th.th_root->r.r_active;
}
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_threads number of threads requested for this parallel construct

Set the number of threads to be used by the next fork spawned by this thread.
This call is only required if the parallel construct has a `num_threads` clause.
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  __kmp_push_num_threads(loc, global_tid, num_threads);
}
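// Illustrative sketch (not part of the runtime source): roughly the call
// sequence a compiler might emit for
//   #pragma omp parallel num_threads(4)
// The outlined function name `example_parallel_body` is hypothetical.
#if 0 // documentation-only example
static void example_parallel_body(kmp_int32 *gtid, kmp_int32 *btid,
                                  int *shared) {
  /* body of the parallel region, executed by every thread in the team */
}
static void example_emit_parallel(ident_t *loc, int *shared) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_push_num_threads(loc, gtid, 4); // only needed for num_threads clause
  __kmpc_fork_call(loc, 1, (kmpc_micro)example_parallel_body, shared);
}
#endif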
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}
void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}
/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined parallel
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
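// Illustrative sketch (not part of the runtime source): the microtask passed
// to __kmpc_fork_call() follows the kmpc_micro convention, i.e. the first two
// parameters are pointers to the global and bound thread ids, followed by one
// pointer per shared variable (matching argc). Names are hypothetical.
#if 0 // documentation-only example
static void example_microtask(kmp_int32 *gtid, kmp_int32 *btid, int *a,
                              double *b) {
  /* compiler-outlined body of `#pragma omp parallel shared(a, b)` */
}
static void example_fork(ident_t *loc, int *a, double *b) {
  // argc == 2: two shared-variable pointers follow the microtask.
  __kmpc_fork_call(loc, 2, (kmpc_micro)example_microtask, a, b);
}
#endif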
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}
/*!
@ingroup PARALLEL
@param loc source location information
@param argc total number of arguments in the ellipsis
@param microtask pointer to callback routine consisting of outlined teams
construct
@param ... pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

#if KMP_STATS_ENABLED
  KMP_COUNT_BLOCK(OMP_TEAMS);
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
  }
#endif

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Pop current CG root off list
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
  this_thr->th.th_cg_roots = tmp->up;
  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
                 " to node %p. cg_nthreads was %d\n",
                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
  int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
    __kmp_free(tmp);
  }
  // Restore current task's thread_limit from CG root
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  this_thr->th.th_current_task->td_icvs.thread_limit =
      this_thr->th.th_cg_roots->cg_thread_limit;

  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
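// Illustrative sketch (not part of the runtime source): a possible lowering of
//   #pragma omp teams num_teams(4) thread_limit(8)
// The outlined function name is hypothetical.
#if 0 // documentation-only example
static void example_teams_body(kmp_int32 *gtid, kmp_int32 *btid, int *shared) {
  /* body of the teams region, executed by the master thread of each team */
}
static void example_emit_teams(ident_t *loc, int *shared) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_push_num_teams(loc, gtid, 4, 8); // num_teams and thread_limit
  __kmpc_fork_teams(loc, 1, (kmpc_micro)example_teams_body, shared);
}
#endif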
// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
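// Illustrative sketch (not part of the runtime source): a possible lowering of
//   #pragma omp parallel if (cond)
// When `cond` is false the region body runs serially between
// __kmpc_serialized_parallel() and __kmpc_end_serialized_parallel().
#if 0 // documentation-only example
static void example_emit_conditional_parallel(ident_t *loc, int cond,
                                              kmpc_micro body, int *shared) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  if (cond) {
    __kmpc_fork_call(loc, 1, body, shared);
  } else {
    __kmpc_serialized_parallel(loc, gtid);
    kmp_int32 zero = 0;
    body(&gtid, &zero, shared); // run the outlined body on this thread only
    __kmpc_end_serialized_parallel(loc, gtid);
  }
}
#endif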
/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number

Leave a serialized parallel construct.
*/
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  kmp_task_team_t *task_team = this_thr->th.th_task_team;
  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));

  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif
  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }

  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {
    /* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}
/*!
@ingroup SYNCHRONIZATION
@param loc source location information.

Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
depending on the memory ordering convention obeyed by the compiler
even that may not be necessary).
*/
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates. */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield();
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
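// Illustrative sketch (not part of the runtime source): a busy-wait loop such
// as
//   while (!flag) {
//   #pragma omp flush(flag)
//   }
// lowers to a __kmpc_flush() call per iteration; the flush-set argument is not
// passed down to the runtime.
#if 0 // documentation-only example
static void example_wait_for_flag(ident_t *loc, volatile int *flag) {
  while (!*flag) {
    __kmpc_flush(loc);
  }
}
#endif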
/* -------------------------------------------------------------------------- */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Execute a barrier.
*/
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  // this function is called when 'barrier' directive is present or
  // implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
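// Illustrative sketch (not part of the runtime source): `#pragma omp barrier`
// lowers to a single call, with the thread id obtained from
// __kmpc_global_thread_num() or threaded through from the microtask arguments.
#if 0 // documentation-only example
static void example_emit_barrier(ident_t *loc) {
  __kmpc_barrier(loc, __kmpc_global_thread_num(loc));
}
#endif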
/* The BARRIER for a MASTER section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.

Mark the end of a <tt>master</tt> region. This should only be called by the
thread that executes the <tt>master</tt> region.
*/
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}
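// Illustrative sketch (not part of the runtime source): a possible lowering of
//   #pragma omp master
// Only the thread for which __kmpc_master() returns 1 runs the block and calls
// __kmpc_end_master(); there is no implied barrier.
#if 0 // documentation-only example
static void example_emit_master(ident_t *loc, kmp_int32 gtid) {
  if (__kmpc_master(loc, gtid)) {
    /* master-only code */
    __kmpc_end_master(loc, gtid);
  }
}
#endif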
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

Start execution of an <tt>ordered</tt> construct.
*/
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;

  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
          codeptr_ra);
    }
  }
#endif

  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.

End execution of an <tt>ordered</tt> construct.
*/
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
            ->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}
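// Illustrative sketch (not part of the runtime source): inside a loop compiled
// from `#pragma omp for ordered`, each `#pragma omp ordered` block is
// bracketed by the pair below, which serializes the block in iteration order.
#if 0 // documentation-only example
static void example_emit_ordered(ident_t *loc, kmp_int32 gtid) {
  __kmpc_ordered(loc, gtid);
  /* code that must execute in iteration order */
  __kmpc_end_ordered(loc, gtid);
}
#endif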
#if KMP_USE_DYNAMIC_LOCK

static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}
// Fast-path acquire tas lock
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      do {                                                                     \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
        __kmp_spin_backoff(&backoff);                                          \
      } while (                                                                \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
// Fast-path test tas lock
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }
// Fast-path release tas lock
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
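// Illustrative sketch (not part of the runtime source): how the inlined TAS
// fast paths pair up. KMP_USE_INLINED_TAS callers such as
// __kmpc_critical_with_hint() wrap the acquire/release macros around the
// protected region; the test macro is the non-blocking variant.
#if 0 // documentation-only example
static void example_tas_usage(kmp_user_lock_p lck, kmp_int32 gtid) {
  int rc;
  KMP_TEST_TAS_LOCK(lck, gtid, rc); // try-lock: rc != 0 on success
  if (!rc)
    KMP_ACQUIRE_TAS_LOCK(lck, gtid); // spin with backoff until acquired
  /* ... protected region ... */
  KMP_RELEASE_TAS_LOCK(lck, gtid);
}
#endif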
#if KMP_USE_FUTEX

#include <sys/syscall.h>
#include <unistd.h>
#ifndef FUTEX_WAIT
#define FUTEX_WAIT 0
#endif
#ifndef FUTEX_WAKE
#define FUTEX_WAKE 1
#endif
// Fast-path acquire futex lock
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
// Fast-path test futex lock
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }
// Fast-path release futex lock
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD_OVERSUB();                                                       \
  }

#endif // KMP_USE_FUTEX
#else // KMP_USE_DYNAMIC_LOCK
static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but that is not
// a problem, because this is not a real event seen by the user but rather a
// way of setting a name for the object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer. If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
// Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be reused
// for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Enter code protected by a `critical` construct.
This function blocks until the executing thread can enter the critical section.
*/
void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
                     kmp_critical_name *crit) {
#if KMP_USE_DYNAMIC_LOCK
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT
  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
#else
  KMP_COUNT_BLOCK(OMP_CRITICAL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
#endif
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  // TODO: add THR_OVHD_STATE

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  KMP_CHECK_USER_LOCK_INIT();

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  // since the critical directive binds to all threads, not just the current
  // team we have to check this even if we are in a serialized team.
  // also, even if we are the uber thread, we still have to conduct the lock,
  // as we have to contend with sibling threads.

#if USE_ITT_BUILD
  __kmp_itt_critical_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  void *codeptr_ra = NULL;
  if (ompt_enabled.enabled) {
    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
    /* OMPT state update */
    prev_state = ti.state;
    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
    ti.state = ompt_state_wait_critical;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
    }
  }
#endif
  KMP_POP_PARTITIONED_TIMER();

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
#endif // KMP_USE_DYNAMIC_LOCK
}
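// Illustrative sketch (not part of the runtime source): a possible lowering of
//   #pragma omp critical (name)
// The compiler emits one zero-initialized kmp_critical_name per critical name
// and passes its address to the enter/exit pair.
#if 0 // documentation-only example
static kmp_critical_name example_crit_name; // shared across all call sites
static void example_emit_critical(ident_t *loc, kmp_int32 gtid) {
  __kmpc_critical(loc, gtid, &example_crit_name);
  /* protected code */
  __kmpc_end_critical(loc, gtid, &example_crit_name);
}
#endif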
#if KMP_USE_DYNAMIC_LOCK

// Converts the given hint to an internal lock implementation
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#if KMP_USE_DYNAMIC_LOCK
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}

// For locks without dynamic binding
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex:
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return kmp_mutex_impl_none;
  }
}
#endif // KMP_USE_DYNAMIC_LOCK
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@param hint the lock hint.

Enter code protected by a `critical` construct with a hint. The hint value is
used to suggest a lock implementation. This function blocks until the executing
thread can enter the critical section unless the hint suggests use of
speculative execution and the hardware supports it.
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uint32_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
  // This is the case, if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (codeptr == NULL)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
    }
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);

  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint
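// Illustrative sketch (not part of the runtime source): a possible lowering of
//   #pragma omp critical (name) hint(omp_lock_hint_speculative)
// The hint is forwarded so that __kmp_map_hint_to_lock() can pick a suitable
// lock implementation (e.g. HLE when TSX is available).
#if 0 // documentation-only example
static kmp_critical_name example_hinted_name;
static void example_emit_critical_hint(ident_t *loc, kmp_int32 gtid) {
  __kmpc_critical_with_hint(loc, gtid, &example_hinted_name,
                            omp_lock_hint_speculative);
  /* protected code */
  __kmpc_end_critical(loc, gtid, &example_hinted_name);
}
#endif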
#endif // KMP_USE_DYNAMIC_LOCK
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
        OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  return (status != 0) ? 0 : 1;
}
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the construct requires no further
synchronization after the master block.
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */
    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */
      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}
/* The BARRIER for a SINGLE process section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return One if this thread should execute the single construct, zero otherwise.

Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls, rather the compiler
should introduce an explicit barrier if it is required.
*/
kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number

Mark the end of a <tt>single</tt> construct. This function should
only be called by the thread that executed the block of code protected
by the `single` construct.
*/
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_exit_single(global_tid);
  KMP_POP_PARTITIONED_TIMER();
#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_single_executor, ompt_scope_end,
        &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
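// Illustrative sketch (not part of the runtime source): a possible lowering of
//   #pragma omp single
// without a nowait clause. As noted above, the trailing barrier is emitted
// explicitly by the compiler; __kmpc_single()/__kmpc_end_single() do not imply
// one.
#if 0 // documentation-only example
static void example_emit_single(ident_t *loc, kmp_int32 gtid) {
  if (__kmpc_single(loc, gtid)) {
    /* code executed by the one chosen thread */
    __kmpc_end_single(loc, gtid);
  }
  __kmpc_barrier(loc, gtid); // omitted when the single has nowait
}
#endif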
/*!
@ingroup WORK_SHARING
@param loc Source location
@param global_tid Global thread id

Mark the end of a statically scheduled loop.
*/
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KMP_POP_PARTITIONED_TIMER();
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_work_t ompt_work_type = ompt_work_loop;
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    // Determine workshare type
    if (loc != NULL) {
      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
        ompt_work_type = ompt_work_loop;
      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
        ompt_work_type = ompt_work_sections;
      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
        ompt_work_type = ompt_work_distribute;
      } else {
        // use default set above.
        // a warning about this case is provided in __kmpc_for_static_init
      }
      KMP_DEBUG_ASSERT(ompt_work_type);
    }
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}
// User routines which take C-style arguments (call by value)
// different from the Fortran equivalent routines

void ompc_set_num_threads(int arg) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_num_threads(arg, __kmp_entry_gtid());
}

void ompc_set_dynamic(int flag) {
  kmp_info_t *thread;

  /* For the thread-private implementation of the internal controls */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  set__dynamic(thread, flag ? TRUE : FALSE);
}

void ompc_set_nested(int flag) {
  kmp_info_t *thread;

  /* For the thread-private internal controls implementation */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
}

void ompc_set_max_active_levels(int max_active_levels) {
  /* TO DO */
  /* we want per-task implementation of this internal control */

  /* For the per-thread internal controls implementation */
  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
}

void ompc_set_schedule(omp_sched_t kind, int modifier) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
}

int ompc_get_ancestor_thread_num(int level) {
  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
}

int ompc_get_team_size(int level) {
  return __kmp_get_team_size(__kmp_entry_gtid(), level);
}
1898 /* OpenMP 5.0 Affinity Format API */
1900 void ompc_set_affinity_format(char const *format) {
1901 if (!__kmp_init_serial) {
1902 __kmp_serial_initialize();
1904 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1905 format, KMP_STRLEN(format) + 1);
1908 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1910 if (!__kmp_init_serial) {
1911 __kmp_serial_initialize();
1913 format_size = KMP_STRLEN(__kmp_affinity_format);
1914 if (buffer && size) {
1915 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1921 void ompc_display_affinity(char const *format) {
1923 if (!TCR_4(__kmp_init_middle)) {
1924 __kmp_middle_initialize();
1926 gtid = __kmp_get_gtid();
1927 __kmp_aux_display_affinity(gtid, format);
1930 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1931 char const *format) {
1933 size_t num_required;
1934 kmp_str_buf_t capture_buf;
1935 if (!TCR_4(__kmp_init_middle)) {
1936 __kmp_middle_initialize();
1938 gtid = __kmp_get_gtid();
1939 __kmp_str_buf_init(&capture_buf);
1940 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1941 if (buffer && buf_size) {
1942 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1943 capture_buf.used + 1);
1945 __kmp_str_buf_free(&capture_buf);
1946 return num_required;
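// Illustrative user-level sketch of the OpenMP 5.0 affinity-format API these
// entry points back (the format string and buffer size are arbitrary
// examples, not runtime defaults):
//
//   char buf[128];
//   omp_set_affinity_format("OMP: pid %P tid %n affinity %A");
//   size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
//   if (needed >= sizeof(buf)) {
//     // output was truncated; needed + 1 bytes would hold the whole string
//   }
//   omp_display_affinity(NULL); // NULL format means: use the current format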
1949 void kmpc_set_stacksize(int arg) {
1950 // __kmp_aux_set_stacksize initializes the library if needed
1951 __kmp_aux_set_stacksize(arg);
1954 void kmpc_set_stacksize_s(size_t arg) {
1955 // __kmp_aux_set_stacksize initializes the library if needed
1956 __kmp_aux_set_stacksize(arg);
1959 void kmpc_set_blocktime(int arg) {
1960 int gtid, tid;
1961 kmp_info_t *thread;
1963 gtid = __kmp_entry_gtid();
1964 tid = __kmp_tid_from_gtid(gtid);
1965 thread = __kmp_thread_from_gtid(gtid);
1967 __kmp_aux_set_blocktime(arg, thread, tid);
1970 void kmpc_set_library(int arg) {
1971 // __kmp_user_set_library initializes the library if needed
1972 __kmp_user_set_library((enum library_type)arg);
1975 void kmpc_set_defaults(char const *str) {
1976 // __kmp_aux_set_defaults initializes the library if needed
1977 __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1980 void kmpc_set_disp_num_buffers(int arg) {
1981 // ignore after initialization because some teams have already
1982 // allocated dispatch buffers
1983 if (__kmp_init_serial == 0 && arg > 0)
1984 __kmp_dispatch_num_buffers = arg;
1987 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1988 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1989 return -1;
1990 #else
1991 if (!TCR_4(__kmp_init_middle)) {
1992 __kmp_middle_initialize();
1993 }
1994 return __kmp_aux_set_affinity_mask_proc(proc, mask);
1995 #endif
1998 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1999 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2000 return -1;
2001 #else
2002 if (!TCR_4(__kmp_init_middle)) {
2003 __kmp_middle_initialize();
2004 }
2005 return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2006 #endif
2009 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2010 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2011 return -1;
2012 #else
2013 if (!TCR_4(__kmp_init_middle)) {
2014 __kmp_middle_initialize();
2015 }
2016 return __kmp_aux_get_affinity_mask_proc(proc, mask);
2017 #endif
2020 /* -------------------------------------------------------------------------- */
2022 @ingroup THREADPRIVATE
2023 @param loc source location information
2024 @param gtid global thread number
2025 @param cpy_size size of the cpy_data buffer
2026 @param cpy_data pointer to data to be copied
2027 @param cpy_func helper function to call for copying data
2028 @param didit flag variable: 1=single thread; 0=not single thread
2030 __kmpc_copyprivate implements the interface for the private data broadcast
2031 needed for the copyprivate clause associated with a single region in an
2032 OpenMP<sup>*</sup> program (both C and Fortran).
2033 All threads participating in the parallel region call this routine.
2034 One of the threads (called the single thread) should have the <tt>didit</tt>
2035 variable set to 1 and all other threads should have that variable set to 0.
2036 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2038 The OpenMP specification forbids the use of nowait on the single region when a
2039 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2040 barrier internally to avoid race conditions, so the code generation for the
2041 single region should avoid generating a barrier after the call to @ref
2042 __kmpc_copyprivate.
2044 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2045 The <tt>loc</tt> parameter is a pointer to source location information.
2047 Internal implementation: The single thread will first copy its descriptor
2048 address (cpy_data) to a team-private location, then the other threads will each
2049 call the function pointed to by the parameter cpy_func, which copies the data
2050 from the single thread's cpy_data buffer into their own cpy_data buffers.
2052 The cpy_func routine used for the copy and the contents of the data area defined
2053 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2054 to be done. For instance, the cpy_data buffer can hold the actual data to be
2055 copied or it may hold a list of pointers to the data. The cpy_func routine must
2056 interpret the cpy_data buffer appropriately.
2058 The interface to cpy_func is as follows:
2059 @code
2060 void cpy_func( void *destination, void *source )
2061 @endcode
2062 where void *destination is the cpy_data pointer for the thread being copied to
2063 and void *source is the cpy_data pointer for the thread being copied from.
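As an illustrative sketch (not mandated codegen), broadcasting a single
<tt>int</tt> via copyprivate could be lowered roughly as follows, where
<tt>copy_int</tt> is a hypothetical helper built by the compiler:
@code
static void copy_int(void *dst, void *src) {
  *(int *)dst = *(int *)src; // each thread pulls the single thread's value
}

int v;
kmp_int32 didit = __kmpc_single(loc, gtid); // 1 only in the chosen thread
if (didit) {
  v = 42; // body of the single region produces the value
  __kmpc_end_single(loc, gtid);
}
__kmpc_copyprivate(loc, gtid, sizeof(int), &v, copy_int, didit);
@endcode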
2065 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2066 void *cpy_data, void (*cpy_func)(void *, void *),
2067 kmp_int32 didit) {
2070 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2072 void **data_ptr;
2074 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2076 if (__kmp_env_consistency_check) {
2077 if (loc == 0) {
2078 KMP_WARNING(ConstructIdentInvalid);
2079 }
2080 }
2082 // ToDo: Optimize the following two barriers into some kind of split barrier
2084 if (didit)
2085 *data_ptr = cpy_data;
2088 ompt_frame_t *ompt_frame;
2089 if (ompt_enabled.enabled) {
2090 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2091 if (ompt_frame->enter_frame.ptr == NULL)
2092 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2093 OMPT_STORE_RETURN_ADDRESS(gtid);
2096 /* This barrier is not a barrier region boundary */
2098 __kmp_threads[gtid]->th.th_ident = loc;
2100 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2102 if (!didit)
2103 (*cpy_func)(cpy_data, *data_ptr);
2105 // Consider next barrier a user-visible barrier for barrier region boundaries
2106 // Nesting checks are already handled by the single construct checks
2109 if (ompt_enabled.enabled) {
2110 OMPT_STORE_RETURN_ADDRESS(gtid);
2114 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2115 // tasks can overwrite the location)
2117 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2118 #if OMPT_SUPPORT && OMPT_OPTIONAL
2119 if (ompt_enabled.enabled) {
2120 ompt_frame->enter_frame = ompt_data_none;
2125 /* -------------------------------------------------------------------------- */
2127 #define INIT_LOCK __kmp_init_user_lock_with_checks
2128 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2129 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2130 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2131 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2132 #define ACQUIRE_NESTED_LOCK_TIMED \
2133 __kmp_acquire_nested_user_lock_with_checks_timed
2134 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2135 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2136 #define TEST_LOCK __kmp_test_user_lock_with_checks
2137 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2138 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2139 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2141 // TODO: Make check abort messages use location info & pass it into
2142 // with_checks routines
2144 #if KMP_USE_DYNAMIC_LOCK
2146 // internal lock initializer
2147 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2148 kmp_dyna_lockseq_t seq) {
2149 if (KMP_IS_D_LOCK(seq)) {
2150 KMP_INIT_D_LOCK(lock, seq);
2152 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2154 } else {
2155 KMP_INIT_I_LOCK(lock, seq);
2157 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2158 __kmp_itt_lock_creating(ilk->lock, loc);
2163 // internal nest lock initializer
2164 static __forceinline void
2165 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2166 kmp_dyna_lockseq_t seq) {
2168 // Don't have nested lock implementation for speculative locks
2169 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2170 seq = __kmp_user_lock_seq;
2172 switch (seq) {
2173 case lockseq_tas:
2174 seq = lockseq_nested_tas;
2175 break;
2176 #if KMP_USE_FUTEX
2177 case lockseq_futex:
2178 seq = lockseq_nested_futex;
2179 break;
2180 #endif
2181 case lockseq_ticket:
2182 seq = lockseq_nested_ticket;
2183 break;
2184 case lockseq_queuing:
2185 seq = lockseq_nested_queuing;
2186 break;
2187 case lockseq_drdpa:
2188 seq = lockseq_nested_drdpa;
2189 break;
2190 default:
2191 seq = lockseq_nested_queuing;
2192 }
2193 KMP_INIT_I_LOCK(lock, seq);
2195 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2196 __kmp_itt_lock_creating(ilk->lock, loc);
2200 /* initialize the lock with a hint */
2201 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2202 uintptr_t hint) {
2203 KMP_DEBUG_ASSERT(__kmp_init_serial);
2204 if (__kmp_env_consistency_check && user_lock == NULL) {
2205 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2208 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2210 #if OMPT_SUPPORT && OMPT_OPTIONAL
2211 // This is the case, if called from omp_init_lock_with_hint:
2212 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2213 if (!codeptr)
2214 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2215 if (ompt_enabled.ompt_callback_lock_init) {
2216 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2217 ompt_mutex_lock, (omp_lock_hint_t)hint,
2218 __ompt_get_mutex_impl_type(user_lock),
2219 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2224 /* initialize the lock with a hint */
2225 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2226 void **user_lock, uintptr_t hint) {
2227 KMP_DEBUG_ASSERT(__kmp_init_serial);
2228 if (__kmp_env_consistency_check && user_lock == NULL) {
2229 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2232 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2234 #if OMPT_SUPPORT && OMPT_OPTIONAL
2235 // This is the case, if called from omp_init_lock_with_hint:
2236 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2237 if (!codeptr)
2238 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2239 if (ompt_enabled.ompt_callback_lock_init) {
2240 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2241 ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2242 __ompt_get_mutex_impl_type(user_lock),
2243 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2248 #endif // KMP_USE_DYNAMIC_LOCK
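// User-level sketch of the omp_init_lock_with_hint / omp_init_nest_lock_with_hint
// wrappers implemented above (the hint is advisory; the runtime maps it onto an
// available lock implementation):
//
//   omp_lock_t l;
//   omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
//   omp_set_lock(&l);
//   /* ... critical work ... */
//   omp_unset_lock(&l);
//   omp_destroy_lock(&l);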
2250 /* initialize the lock */
2251 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2252 #if KMP_USE_DYNAMIC_LOCK
2254 KMP_DEBUG_ASSERT(__kmp_init_serial);
2255 if (__kmp_env_consistency_check && user_lock == NULL) {
2256 KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2258 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2260 #if OMPT_SUPPORT && OMPT_OPTIONAL
2261 // This is the case, if called from omp_init_lock_with_hint:
2262 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2263 if (!codeptr)
2264 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2265 if (ompt_enabled.ompt_callback_lock_init) {
2266 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2267 ompt_mutex_lock, omp_lock_hint_none,
2268 __ompt_get_mutex_impl_type(user_lock),
2269 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2273 #else // KMP_USE_DYNAMIC_LOCK
2275 static char const *const func = "omp_init_lock";
2276 kmp_user_lock_p lck;
2277 KMP_DEBUG_ASSERT(__kmp_init_serial);
2279 if (__kmp_env_consistency_check) {
2280 if (user_lock == NULL) {
2281 KMP_FATAL(LockIsUninitialized, func);
2285 KMP_CHECK_USER_LOCK_INIT();
2287 if ((__kmp_user_lock_kind == lk_tas) &&
2288 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2289 lck = (kmp_user_lock_p)user_lock;
2292 else if ((__kmp_user_lock_kind == lk_futex) &&
2293 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2294 lck = (kmp_user_lock_p)user_lock;
2298 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2300 INIT_LOCK(lck);
2301 __kmp_set_user_lock_location(lck, loc);
2303 #if OMPT_SUPPORT && OMPT_OPTIONAL
2304 // This is the case, if called from omp_init_lock_with_hint:
2305 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2306 if (!codeptr)
2307 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2308 if (ompt_enabled.ompt_callback_lock_init) {
2309 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2310 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2311 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2315 #if USE_ITT_BUILD
2316 __kmp_itt_lock_creating(lck);
2317 #endif /* USE_ITT_BUILD */
2319 #endif // KMP_USE_DYNAMIC_LOCK
2320 } // __kmpc_init_lock
2322 /* initialize the lock */
2323 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2324 #if KMP_USE_DYNAMIC_LOCK
2326 KMP_DEBUG_ASSERT(__kmp_init_serial);
2327 if (__kmp_env_consistency_check && user_lock == NULL) {
2328 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2330 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2332 #if OMPT_SUPPORT && OMPT_OPTIONAL
2333 // This is the case, if called from omp_init_lock_with_hint:
2334 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2335 if (!codeptr)
2336 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2337 if (ompt_enabled.ompt_callback_lock_init) {
2338 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2339 ompt_mutex_nest_lock, omp_lock_hint_none,
2340 __ompt_get_mutex_impl_type(user_lock),
2341 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2345 #else // KMP_USE_DYNAMIC_LOCK
2347 static char const *const func = "omp_init_nest_lock";
2348 kmp_user_lock_p lck;
2349 KMP_DEBUG_ASSERT(__kmp_init_serial);
2351 if (__kmp_env_consistency_check) {
2352 if (user_lock == NULL) {
2353 KMP_FATAL(LockIsUninitialized, func);
2357 KMP_CHECK_USER_LOCK_INIT();
2359 if ((__kmp_user_lock_kind == lk_tas) &&
2360 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2361 OMP_NEST_LOCK_T_SIZE)) {
2362 lck = (kmp_user_lock_p)user_lock;
2365 else if ((__kmp_user_lock_kind == lk_futex) &&
2366 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2367 OMP_NEST_LOCK_T_SIZE)) {
2368 lck = (kmp_user_lock_p)user_lock;
2372 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2375 INIT_NESTED_LOCK(lck);
2376 __kmp_set_user_lock_location(lck, loc);
2378 #if OMPT_SUPPORT && OMPT_OPTIONAL
2379 // This is the case, if called from omp_init_lock_with_hint:
2380 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2381 if (!codeptr)
2382 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2383 if (ompt_enabled.ompt_callback_lock_init) {
2384 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2385 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2386 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2390 #if USE_ITT_BUILD
2391 __kmp_itt_lock_creating(lck);
2392 #endif /* USE_ITT_BUILD */
2394 #endif // KMP_USE_DYNAMIC_LOCK
2395 } // __kmpc_init_nest_lock
2397 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2398 #if KMP_USE_DYNAMIC_LOCK
2401 kmp_user_lock_p lck;
2402 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2403 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2404 } else {
2405 lck = (kmp_user_lock_p)user_lock;
2406 }
2407 __kmp_itt_lock_destroyed(lck);
2409 #if OMPT_SUPPORT && OMPT_OPTIONAL
2410 // This is the case, if called from omp_init_lock_with_hint:
2411 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2412 if (!codeptr)
2413 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2414 if (ompt_enabled.ompt_callback_lock_destroy) {
2415 kmp_user_lock_p lck;
2416 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2417 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2418 } else {
2419 lck = (kmp_user_lock_p)user_lock;
2420 }
2421 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2422 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2425 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2427 kmp_user_lock_p lck;
2429 if ((__kmp_user_lock_kind == lk_tas) &&
2430 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2431 lck = (kmp_user_lock_p)user_lock;
2434 else if ((__kmp_user_lock_kind == lk_futex) &&
2435 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2436 lck = (kmp_user_lock_p)user_lock;
2440 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2443 #if OMPT_SUPPORT && OMPT_OPTIONAL
2444 // This is the case, if called from omp_init_lock_with_hint:
2445 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2446 if (!codeptr)
2447 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2448 if (ompt_enabled.ompt_callback_lock_destroy) {
2449 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2450 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2454 #if USE_ITT_BUILD
2455 __kmp_itt_lock_destroyed(lck);
2456 #endif /* USE_ITT_BUILD */
2457 DESTROY_LOCK(lck);
2459 if ((__kmp_user_lock_kind == lk_tas) &&
2460 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2464 else if ((__kmp_user_lock_kind == lk_futex) &&
2465 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2470 __kmp_user_lock_free(user_lock, gtid, lck);
2472 #endif // KMP_USE_DYNAMIC_LOCK
2473 } // __kmpc_destroy_lock
2475 /* destroy the lock */
2476 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2477 #if KMP_USE_DYNAMIC_LOCK
2480 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2481 __kmp_itt_lock_destroyed(ilk->lock);
2483 #if OMPT_SUPPORT && OMPT_OPTIONAL
2484 // This is the case, if called from omp_init_lock_with_hint:
2485 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2486 if (!codeptr)
2487 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2488 if (ompt_enabled.ompt_callback_lock_destroy) {
2489 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2490 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2493 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2495 #else // KMP_USE_DYNAMIC_LOCK
2497 kmp_user_lock_p lck;
2499 if ((__kmp_user_lock_kind == lk_tas) &&
2500 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2501 OMP_NEST_LOCK_T_SIZE)) {
2502 lck = (kmp_user_lock_p)user_lock;
2505 else if ((__kmp_user_lock_kind == lk_futex) &&
2506 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2507 OMP_NEST_LOCK_T_SIZE)) {
2508 lck = (kmp_user_lock_p)user_lock;
2512 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2515 #if OMPT_SUPPORT && OMPT_OPTIONAL
2516 // This is the case, if called from omp_init_lock_with_hint:
2517 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2518 if (!codeptr)
2519 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2520 if (ompt_enabled.ompt_callback_lock_destroy) {
2521 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2522 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2526 #if USE_ITT_BUILD
2527 __kmp_itt_lock_destroyed(lck);
2528 #endif /* USE_ITT_BUILD */
2530 DESTROY_NESTED_LOCK(lck);
2532 if ((__kmp_user_lock_kind == lk_tas) &&
2533 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2534 OMP_NEST_LOCK_T_SIZE)) {
2538 else if ((__kmp_user_lock_kind == lk_futex) &&
2539 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2540 OMP_NEST_LOCK_T_SIZE)) {
2545 __kmp_user_lock_free(user_lock, gtid, lck);
2547 #endif // KMP_USE_DYNAMIC_LOCK
2548 } // __kmpc_destroy_nest_lock
2550 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2551 KMP_COUNT_BLOCK(OMP_set_lock);
2552 #if KMP_USE_DYNAMIC_LOCK
2553 int tag = KMP_EXTRACT_D_TAG(user_lock);
2555 __kmp_itt_lock_acquiring(
2557 user_lock); // itt function will get to the right lock object.
2559 #if OMPT_SUPPORT && OMPT_OPTIONAL
2560 // This is the case, if called from omp_init_lock_with_hint:
2561 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2562 if (!codeptr)
2563 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2564 if (ompt_enabled.ompt_callback_mutex_acquire) {
2565 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2566 ompt_mutex_lock, omp_lock_hint_none,
2567 __ompt_get_mutex_impl_type(user_lock),
2568 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2571 #if KMP_USE_INLINED_TAS
2572 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2573 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2574 } else
2575 #elif KMP_USE_INLINED_FUTEX
2576 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2577 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2578 } else
2579 #endif
2580 {
2581 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2582 }
2584 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2586 #if OMPT_SUPPORT && OMPT_OPTIONAL
2587 if (ompt_enabled.ompt_callback_mutex_acquired) {
2588 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2589 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2593 #else // KMP_USE_DYNAMIC_LOCK
2595 kmp_user_lock_p lck;
2597 if ((__kmp_user_lock_kind == lk_tas) &&
2598 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2599 lck = (kmp_user_lock_p)user_lock;
2602 else if ((__kmp_user_lock_kind == lk_futex) &&
2603 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2604 lck = (kmp_user_lock_p)user_lock;
2608 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2611 #if USE_ITT_BUILD
2612 __kmp_itt_lock_acquiring(lck);
2613 #endif /* USE_ITT_BUILD */
2614 #if OMPT_SUPPORT && OMPT_OPTIONAL
2615 // This is the case, if called from omp_init_lock_with_hint:
2616 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2617 if (!codeptr)
2618 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2619 if (ompt_enabled.ompt_callback_mutex_acquire) {
2620 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2621 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2622 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2626 ACQUIRE_LOCK(lck, gtid);
2628 #if USE_ITT_BUILD
2629 __kmp_itt_lock_acquired(lck);
2630 #endif /* USE_ITT_BUILD */
2632 #if OMPT_SUPPORT && OMPT_OPTIONAL
2633 if (ompt_enabled.ompt_callback_mutex_acquired) {
2634 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2635 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2639 #endif // KMP_USE_DYNAMIC_LOCK
2642 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2643 #if KMP_USE_DYNAMIC_LOCK
2646 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2648 #if OMPT_SUPPORT && OMPT_OPTIONAL
2649 // This is the case, if called from omp_init_lock_with_hint:
2650 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2651 if (!codeptr)
2652 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2653 if (ompt_enabled.enabled) {
2654 if (ompt_enabled.ompt_callback_mutex_acquire) {
2655 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2656 ompt_mutex_nest_lock, omp_lock_hint_none,
2657 __ompt_get_mutex_impl_type(user_lock),
2658 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2662 int acquire_status =
2663 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2664 (void) acquire_status;
2666 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2669 #if OMPT_SUPPORT && OMPT_OPTIONAL
2670 if (ompt_enabled.enabled) {
2671 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2672 if (ompt_enabled.ompt_callback_mutex_acquired) {
2674 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2675 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2676 codeptr);
2678 } else {
2679 if (ompt_enabled.ompt_callback_nest_lock) {
2681 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2682 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2688 #else // KMP_USE_DYNAMIC_LOCK
2690 kmp_user_lock_p lck;
2691 int acquire_status;
2692 if ((__kmp_user_lock_kind == lk_tas) &&
2693 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2694 OMP_NEST_LOCK_T_SIZE)) {
2695 lck = (kmp_user_lock_p)user_lock;
2698 else if ((__kmp_user_lock_kind == lk_futex) &&
2699 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2700 OMP_NEST_LOCK_T_SIZE)) {
2701 lck = (kmp_user_lock_p)user_lock;
2705 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2708 #if USE_ITT_BUILD
2709 __kmp_itt_lock_acquiring(lck);
2710 #endif /* USE_ITT_BUILD */
2711 #if OMPT_SUPPORT && OMPT_OPTIONAL
2712 // This is the case, if called from omp_init_lock_with_hint:
2713 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2714 if (!codeptr)
2715 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2716 if (ompt_enabled.enabled) {
2717 if (ompt_enabled.ompt_callback_mutex_acquire) {
2718 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2719 ompt_mutex_nest_lock, omp_lock_hint_none,
2720 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2721 codeptr);
2726 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2728 #if USE_ITT_BUILD
2729 __kmp_itt_lock_acquired(lck);
2730 #endif /* USE_ITT_BUILD */
2732 #if OMPT_SUPPORT && OMPT_OPTIONAL
2733 if (ompt_enabled.enabled) {
2734 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2735 if (ompt_enabled.ompt_callback_mutex_acquired) {
2737 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2738 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2741 if (ompt_enabled.ompt_callback_nest_lock) {
2743 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2744 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2750 #endif // KMP_USE_DYNAMIC_LOCK
2753 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2754 #if KMP_USE_DYNAMIC_LOCK
2756 int tag = KMP_EXTRACT_D_TAG(user_lock);
2758 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2760 #if KMP_USE_INLINED_TAS
2761 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2762 KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2763 } else
2764 #elif KMP_USE_INLINED_FUTEX
2765 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2766 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2767 } else
2768 #endif
2769 {
2770 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2771 }
2773 #if OMPT_SUPPORT && OMPT_OPTIONAL
2774 // This is the case, if called from omp_init_lock_with_hint:
2775 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2776 if (!codeptr)
2777 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2778 if (ompt_enabled.ompt_callback_mutex_released) {
2779 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2780 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2784 #else // KMP_USE_DYNAMIC_LOCK
2786 kmp_user_lock_p lck;
2788 /* Can't use serial interval since not block structured */
2789 /* release the lock */
2791 if ((__kmp_user_lock_kind == lk_tas) &&
2792 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2793 #if KMP_OS_LINUX && \
2794 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2795 // "fast" path implemented to fix customer performance issue
2796 #if USE_ITT_BUILD
2797 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2798 #endif /* USE_ITT_BUILD */
2799 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2800 KMP_MB();
2802 #if OMPT_SUPPORT && OMPT_OPTIONAL
2803 // This is the case, if called from omp_init_lock_with_hint:
2804 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2805 if (!codeptr)
2806 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2807 if (ompt_enabled.ompt_callback_mutex_released) {
2808 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2809 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2813 return;
2814 #else
2815 lck = (kmp_user_lock_p)user_lock;
2816 #endif
2817 }
2819 else if ((__kmp_user_lock_kind == lk_futex) &&
2820 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2821 lck = (kmp_user_lock_p)user_lock;
2825 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2828 #if USE_ITT_BUILD
2829 __kmp_itt_lock_releasing(lck);
2830 #endif /* USE_ITT_BUILD */
2832 RELEASE_LOCK(lck, gtid);
2834 #if OMPT_SUPPORT && OMPT_OPTIONAL
2835 // This is the case, if called from omp_init_lock_with_hint:
2836 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2837 if (!codeptr)
2838 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2839 if (ompt_enabled.ompt_callback_mutex_released) {
2840 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2841 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2845 #endif // KMP_USE_DYNAMIC_LOCK
2848 /* release the lock */
2849 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2850 #if KMP_USE_DYNAMIC_LOCK
2853 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2855 int release_status =
2856 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2857 (void) release_status;
2859 #if OMPT_SUPPORT && OMPT_OPTIONAL
2860 // This is the case, if called from omp_init_lock_with_hint:
2861 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2862 if (!codeptr)
2863 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2864 if (ompt_enabled.enabled) {
2865 if (release_status == KMP_LOCK_RELEASED) {
2866 if (ompt_enabled.ompt_callback_mutex_released) {
2867 // release_lock_last
2868 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2869 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2870 codeptr);
2871 }
2872 } else if (ompt_enabled.ompt_callback_nest_lock) {
2873 // release_lock_prev
2874 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2875 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2880 #else // KMP_USE_DYNAMIC_LOCK
2882 kmp_user_lock_p lck;
2884 /* Can't use serial interval since not block structured */
2886 if ((__kmp_user_lock_kind == lk_tas) &&
2887 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2888 OMP_NEST_LOCK_T_SIZE)) {
2889 #if KMP_OS_LINUX && \
2890 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2891 // "fast" path implemented to fix customer performance issue
2892 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2893 #if USE_ITT_BUILD
2894 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2895 #endif /* USE_ITT_BUILD */
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898 int release_status = KMP_LOCK_STILL_HELD;
2899 #endif
2901 if (--(tl->lk.depth_locked) == 0) {
2902 TCW_4(tl->lk.poll, 0);
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904 release_status = KMP_LOCK_RELEASED;
2905 #endif
2906 }
2907 KMP_MB();
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910 // This is the case, if called from omp_init_lock_with_hint:
2911 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2912 if (!codeptr)
2913 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2914 if (ompt_enabled.enabled) {
2915 if (release_status == KMP_LOCK_RELEASED) {
2916 if (ompt_enabled.ompt_callback_mutex_released) {
2917 // release_lock_last
2918 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2919 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2921 } else if (ompt_enabled.ompt_callback_nest_lock) {
2922 // release_lock_previous
2923 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2924 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2929 return;
2930 #else
2931 lck = (kmp_user_lock_p)user_lock;
2932 #endif
2933 }
2935 else if ((__kmp_user_lock_kind == lk_futex) &&
2936 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2937 OMP_NEST_LOCK_T_SIZE)) {
2938 lck = (kmp_user_lock_p)user_lock;
2942 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2945 #if USE_ITT_BUILD
2946 __kmp_itt_lock_releasing(lck);
2947 #endif /* USE_ITT_BUILD */
2949 int release_status;
2950 release_status = RELEASE_NESTED_LOCK(lck, gtid);
2951 #if OMPT_SUPPORT && OMPT_OPTIONAL
2952 // This is the case, if called from omp_init_lock_with_hint:
2953 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2954 if (!codeptr)
2955 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2956 if (ompt_enabled.enabled) {
2957 if (release_status == KMP_LOCK_RELEASED) {
2958 if (ompt_enabled.ompt_callback_mutex_released) {
2959 // release_lock_last
2960 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2961 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2963 } else if (ompt_enabled.ompt_callback_nest_lock) {
2964 // release_lock_previous
2965 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2966 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2971 #endif // KMP_USE_DYNAMIC_LOCK
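// Nested-lock event semantics illustrated (sketch): only the release that drops
// the nesting depth to zero reports ompt_callback_mutex_released; releases of
// inner levels report ompt_callback_nest_lock with ompt_scope_end, mirroring
// the acquire side in __kmpc_set_nest_lock.
//
//   omp_nest_lock_t l;
//   omp_init_nest_lock(&l);
//   omp_set_nest_lock(&l);   // depth 1: mutex_acquired
//   omp_set_nest_lock(&l);   // depth 2: nest_lock / ompt_scope_begin
//   omp_unset_nest_lock(&l); // depth 1: nest_lock / ompt_scope_end
//   omp_unset_nest_lock(&l); // depth 0: mutex_released
//   omp_destroy_nest_lock(&l);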
2974 /* try to acquire the lock */
2975 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2976 KMP_COUNT_BLOCK(OMP_test_lock);
2978 #if KMP_USE_DYNAMIC_LOCK
2979 int rc;
2980 int tag = KMP_EXTRACT_D_TAG(user_lock);
2982 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2984 #if OMPT_SUPPORT && OMPT_OPTIONAL
2985 // This is the case, if called from omp_init_lock_with_hint:
2986 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2987 if (!codeptr)
2988 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2989 if (ompt_enabled.ompt_callback_mutex_acquire) {
2990 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2991 ompt_mutex_lock, omp_lock_hint_none,
2992 __ompt_get_mutex_impl_type(user_lock),
2993 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2996 #if KMP_USE_INLINED_TAS
2997 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2998 KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2999 } else
3000 #elif KMP_USE_INLINED_FUTEX
3001 if (tag == locktag_futex && !__kmp_env_consistency_check) {
3002 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3003 } else
3004 #endif
3005 {
3006 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3007 }
3008 if (rc) {
3010 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3012 #if OMPT_SUPPORT && OMPT_OPTIONAL
3013 if (ompt_enabled.ompt_callback_mutex_acquired) {
3014 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3015 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3018 return FTN_TRUE;
3019 } else {
3021 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3023 return FTN_FALSE;
3024 }
3026 #else // KMP_USE_DYNAMIC_LOCK
3028 kmp_user_lock_p lck;
3031 if ((__kmp_user_lock_kind == lk_tas) &&
3032 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3033 lck = (kmp_user_lock_p)user_lock;
3036 else if ((__kmp_user_lock_kind == lk_futex) &&
3037 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3038 lck = (kmp_user_lock_p)user_lock;
3042 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3046 __kmp_itt_lock_acquiring(lck);
3047 #endif /* USE_ITT_BUILD */
3048 #if OMPT_SUPPORT && OMPT_OPTIONAL
3049 // This is the case, if called from omp_init_lock_with_hint:
3050 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3051 if (!codeptr)
3052 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3053 if (ompt_enabled.ompt_callback_mutex_acquire) {
3054 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3055 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3056 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3060 rc = TEST_LOCK(lck, gtid);
3061 #if USE_ITT_BUILD
3062 if (rc) {
3063 __kmp_itt_lock_acquired(lck);
3064 } else {
3065 __kmp_itt_lock_cancelled(lck);
3066 }
3067 #endif /* USE_ITT_BUILD */
3068 #if OMPT_SUPPORT && OMPT_OPTIONAL
3069 if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3070 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3071 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3075 return (rc ? FTN_TRUE : FTN_FALSE);
3077 /* Can't use serial interval since not block structured */
3079 #endif // KMP_USE_DYNAMIC_LOCK
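// User-level pattern behind __kmpc_set_lock / __kmpc_unset_lock /
// __kmpc_test_lock (sketch):
//
//   omp_lock_t l;
//   omp_init_lock(&l);
//   if (omp_test_lock(&l)) { // non-blocking; nonzero only if acquired
//     /* ... critical work ... */
//     omp_unset_lock(&l);
//   }
//   omp_destroy_lock(&l);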
3082 /* try to acquire the lock */
3083 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3084 #if KMP_USE_DYNAMIC_LOCK
3085 int rc;
3087 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3089 #if OMPT_SUPPORT && OMPT_OPTIONAL
3090 // This is the case, if called from omp_init_lock_with_hint:
3091 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3092 if (!codeptr)
3093 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3094 if (ompt_enabled.ompt_callback_mutex_acquire) {
3095 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3096 ompt_mutex_nest_lock, omp_lock_hint_none,
3097 __ompt_get_mutex_impl_type(user_lock),
3098 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3101 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3103 if (rc) {
3104 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3105 } else {
3106 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3107 }
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110 if (ompt_enabled.enabled && rc) {
3112 if (ompt_enabled.ompt_callback_mutex_acquired) {
3114 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3115 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3116 codeptr);
3118 } else {
3119 if (ompt_enabled.ompt_callback_nest_lock) {
3121 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3122 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3127 return rc;
3129 #else // KMP_USE_DYNAMIC_LOCK
3131 kmp_user_lock_p lck;
3132 int rc;
3134 if ((__kmp_user_lock_kind == lk_tas) &&
3135 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3136 OMP_NEST_LOCK_T_SIZE)) {
3137 lck = (kmp_user_lock_p)user_lock;
3140 else if ((__kmp_user_lock_kind == lk_futex) &&
3141 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3142 OMP_NEST_LOCK_T_SIZE)) {
3143 lck = (kmp_user_lock_p)user_lock;
3147 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3150 #if USE_ITT_BUILD
3151 __kmp_itt_lock_acquiring(lck);
3152 #endif /* USE_ITT_BUILD */
3154 #if OMPT_SUPPORT && OMPT_OPTIONAL
3155 // This is the case, if called from omp_init_lock_with_hint:
3156 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3157 if (!codeptr)
3158 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3159 if (ompt_enabled.enabled &&
3160 ompt_enabled.ompt_callback_mutex_acquire) {
3161 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3162 ompt_mutex_nest_lock, omp_lock_hint_none,
3163 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3168 rc = TEST_NESTED_LOCK(lck, gtid);
3169 #if USE_ITT_BUILD
3170 if (rc) {
3171 __kmp_itt_lock_acquired(lck);
3172 } else {
3173 __kmp_itt_lock_cancelled(lck);
3174 }
3175 #endif /* USE_ITT_BUILD */
3176 #if OMPT_SUPPORT && OMPT_OPTIONAL
3177 if (ompt_enabled.enabled && rc) {
3179 if (ompt_enabled.ompt_callback_mutex_acquired) {
3181 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3182 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3185 if (ompt_enabled.ompt_callback_nest_lock) {
3187 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3188 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3192 return rc;
3195 /* Can't use serial interval since not block structured */
3197 #endif // KMP_USE_DYNAMIC_LOCK
3200 // Interface to fast scalable reduce methods routines
3202 // keep the selected method in a thread local structure for cross-function
3203 // usage: will be used in __kmpc_end_reduce* functions;
3204 // another solution: to re-determine the method one more time in
3205 // __kmpc_end_reduce* functions (new prototype required then)
3206 // AT: which solution is better?
3207 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3208 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3210 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3211 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3213 // description of the packed_reduction_method variable: look at the macros in
3214 // kmp.h
3216 // used in a critical section reduce block
3217 static __forceinline void
3218 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3219 kmp_critical_name *crit) {
3221 // this lock was visible to a customer and to the threading profile tool as a
3222 // serial overhead span (although it's used for an internal purpose only)
3223 // why was it visible in previous implementation?
3224 // should we keep it visible in new reduce block?
3225 kmp_user_lock_p lck;
3227 #if KMP_USE_DYNAMIC_LOCK
3229 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3230 // Check if it is initialized.
3231 if (*lk == 0) {
3232 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3233 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3234 KMP_GET_D_TAG(__kmp_user_lock_seq));
3235 } else {
3236 __kmp_init_indirect_csptr(crit, loc, global_tid,
3237 KMP_GET_I_TAG(__kmp_user_lock_seq));
3238 }
3239 }
3240 // Branch for accessing the actual lock object and set operation. This
3241 // branching is inevitable since this lock initialization does not follow the
3242 // normal dispatch path (lock table is not used).
3243 if (KMP_EXTRACT_D_TAG(lk) != 0) {
3244 lck = (kmp_user_lock_p)lk;
3245 KMP_DEBUG_ASSERT(lck != NULL);
3246 if (__kmp_env_consistency_check) {
3247 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3249 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3251 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3252 lck = ilk->lock;
3253 KMP_DEBUG_ASSERT(lck != NULL);
3254 if (__kmp_env_consistency_check) {
3255 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3257 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3260 #else // KMP_USE_DYNAMIC_LOCK
3262 // We know that the fast reduction code is only emitted by Intel compilers
3263 // with 32 byte critical sections. If there isn't enough space, then we
3264 // have to use a pointer.
3265 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3266 lck = (kmp_user_lock_p)crit;
3268 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3270 KMP_DEBUG_ASSERT(lck != NULL);
3272 if (__kmp_env_consistency_check)
3273 __kmp_push_sync(global_tid, ct_critical, loc, lck);
3275 __kmp_acquire_user_lock_with_checks(lck, global_tid);
3277 #endif // KMP_USE_DYNAMIC_LOCK
3280 // used in a critical section reduce block
3281 static __forceinline void
3282 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3283 kmp_critical_name *crit) {
3285 kmp_user_lock_p lck;
3287 #if KMP_USE_DYNAMIC_LOCK
3289 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3290 lck = (kmp_user_lock_p)crit;
3291 if (__kmp_env_consistency_check)
3292 __kmp_pop_sync(global_tid, ct_critical, loc);
3293 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3294 } else {
3295 kmp_indirect_lock_t *ilk =
3296 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3297 if (__kmp_env_consistency_check)
3298 __kmp_pop_sync(global_tid, ct_critical, loc);
3299 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3300 }
3302 #else // KMP_USE_DYNAMIC_LOCK
3304 // We know that the fast reduction code is only emitted by Intel compilers
3305 // with 32 byte critical sections. If there isn't enough space, then we have
3306 // to use a pointer.
3307 if (__kmp_base_user_lock_size > 32) {
3308 lck = *((kmp_user_lock_p *)crit);
3309 KMP_ASSERT(lck != NULL);
3311 lck = (kmp_user_lock_p)crit;
3314 if (__kmp_env_consistency_check)
3315 __kmp_pop_sync(global_tid, ct_critical, loc);
3317 __kmp_release_user_lock_with_checks(lck, global_tid);
3319 #endif // KMP_USE_DYNAMIC_LOCK
3320 } // __kmp_end_critical_section_reduce_block
3322 static __forceinline int
3323 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3324 int *task_state) {
3325 kmp_team_t *team;
3327 // Check if we are inside the teams construct?
3328 if (th->th.th_teams_microtask) {
3329 *team_p = team = th->th.th_team;
3330 if (team->t.t_level == th->th.th_teams_level) {
3331 // This is reduction at teams construct.
3332 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3333 // Let's swap teams temporarily for the reduction.
3334 th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3335 th->th.th_team = team->t.t_parent;
3336 th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3337 th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3338 *task_state = th->th.th_task_state;
3339 th->th.th_task_state = 0;
3340 return 1;
3341 }
3342 }
3343 return 0;
3344 }
3347 static __forceinline void
3348 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3349 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3350 th->th.th_info.ds.ds_tid = 0;
3351 th->th.th_team = team;
3352 th->th.th_team_nproc = team->t.t_nproc;
3353 th->th.th_task_team = team->t.t_task_team[task_state];
3354 th->th.th_task_state = task_state;
3357 /* 2.a.i. Reduce Block without a terminating barrier */
3359 @ingroup SYNCHRONIZATION
3360 @param loc source location information
3361 @param global_tid global thread number
3362 @param num_vars number of items (variables) to be reduced
3363 @param reduce_size size of data in bytes to be reduced
3364 @param reduce_data pointer to data to be reduced
3365 @param reduce_func callback function providing reduction operation on two
3366 operands and returning result of reduction in lhs_data
3367 @param lck pointer to the unique lock data structure
3368 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3369 threads if atomic reduction needed
3371 The nowait version is used for a reduce clause with the nowait argument.
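As an illustrative sketch of the expected usage (my_reduce_func, combine and
atomic_combine are placeholder names, not prescribed API), compiler-generated
code typically drives this interface as follows:
@code
kmp_int32 res = __kmpc_reduce_nowait(loc, gtid, 1, sizeof(priv), &priv,
                                     my_reduce_func, &crit);
if (res == 1) {
  // this thread owns the combined data: finalize, then close the region
  global = combine(global, priv);
  __kmpc_end_reduce_nowait(loc, gtid, &crit);
} else if (res == 2) {
  // no combined data available: every thread updates the result atomically
  atomic_combine(&global, priv);
} // res == 0: nothing further to do on this thread
@endcode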
3372 */
3373 kmp_int32
3374 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3375 size_t reduce_size, void *reduce_data,
3376 void (*reduce_func)(void *lhs_data, void *rhs_data),
3377 kmp_critical_name *lck) {
3379 KMP_COUNT_BLOCK(REDUCE_nowait);
3380 int retval = 0;
3381 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3382 kmp_info_t *th;
3383 kmp_team_t *team;
3384 int teams_swapped = 0, task_state;
3385 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3387 // why do we need this initialization here at all?
3388 // Reduction clause can not be used as a stand-alone directive.
3390 // do not call __kmp_serial_initialize(), it will be called by
3391 // __kmp_parallel_initialize() if needed
3392 // possible detection of false-positive race by the threadchecker ???
3393 if (!TCR_4(__kmp_init_parallel))
3394 __kmp_parallel_initialize();
3396 __kmp_resume_if_soft_paused();
3398 // check correctness of reduce block nesting
3399 #if KMP_USE_DYNAMIC_LOCK
3400 if (__kmp_env_consistency_check)
3401 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3403 if (__kmp_env_consistency_check)
3404 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3407 th = __kmp_thread_from_gtid(global_tid);
3408 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3410 // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3411 // the value should be kept in a variable
3412 // the variable should be either a construct-specific or thread-specific
3413 // property, not a team specific property
3414 // (a thread can reach the next reduce block on the next construct, reduce
3415 // method may differ on the next construct)
3416 // an ident_t "loc" parameter could be used as a construct-specific property
3417 // (what if loc == 0?)
3418 // (if both construct-specific and team-specific variables were shared,
3419 // then unnecessary extra syncs would be needed)
3420 // a thread-specific variable is better regarding two issues above (next
3421 // construct and extra syncs)
3422 // a thread-specific "th_local.reduction_method" variable is used currently
3423 // each thread executes 'determine' and 'set' lines (no need to execute by one
3424 // thread, to avoid unnecessary extra syncs)
3426 packed_reduction_method = __kmp_determine_reduction_method(
3427 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3428 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3430 if (packed_reduction_method == critical_reduce_block) {
3432 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3433 retval = 1;
3435 } else if (packed_reduction_method == empty_reduce_block) {
3437 // usage: if team size == 1, no synchronization is required ( Intel
3438 // platforms only )
3440 retval = 1;
3441 } else if (packed_reduction_method == atomic_reduce_block) {
3443 retval = 2;
3445 // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3446 // won't be called by the code gen)
3447 // (it's not quite good, because the checking block has been closed by
3449 // but atomic operation has not been executed yet, will be executed
3450 // slightly later, literally on next instruction)
3451 if (__kmp_env_consistency_check)
3452 __kmp_pop_sync(global_tid, ct_reduce, loc);
3454 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3455 tree_reduce_block)) {
3457 // AT: performance issue: a real barrier here
3458 // AT: (if master goes slow, other threads are blocked here waiting for the
3459 // master to come and release them)
3460 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3461 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3462 // be confusing to a customer)
3463 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3464 // might go faster and be more in line with sense of NOWAIT
3465 // AT: TO DO: do epcc test and compare times
3467 // this barrier should be invisible to a customer and to the threading profile
3468 // tool (it's neither a terminating barrier nor customer's code, it's
3469 // used for an internal purpose)
3471 // JP: can this barrier potentially lead to task scheduling?
3472 // JP: as long as there is a barrier in the implementation, OMPT should and
3473 // will provide the barrier events
3474 // so we set-up the necessary frame/return addresses.
3475 ompt_frame_t *ompt_frame;
3476 if (ompt_enabled.enabled) {
3477 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3478 if (ompt_frame->enter_frame.ptr == NULL)
3479 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3480 OMPT_STORE_RETURN_ADDRESS(global_tid);
3484 __kmp_threads[global_tid]->th.th_ident = loc;
3486 retval =
3487 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3488 global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3489 retval = (retval != 0) ? (0) : (1);
3490 #if OMPT_SUPPORT && OMPT_OPTIONAL
3491 if (ompt_enabled.enabled) {
3492 ompt_frame->enter_frame = ompt_data_none;
3496 // all other workers except master should do this pop here
3497 // ( none of other workers will get to __kmpc_end_reduce_nowait() )
3498 if (__kmp_env_consistency_check) {
3499 if (retval == 0) {
3500 __kmp_pop_sync(global_tid, ct_reduce, loc);
3501 }
3506 // should never reach this block
3507 KMP_ASSERT(0); // "unexpected method"
3509 if (teams_swapped) {
3510 __kmp_restore_swapped_teams(th, team, task_state);
3514 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3515 global_tid, packed_reduction_method, retval));
3517 return retval;
3518 }
3521 @ingroup SYNCHRONIZATION
3522 @param loc source location information
3523 @param global_tid global thread id.
3524 @param lck pointer to the unique lock data structure
3526 Finish the execution of a reduce nowait.
3528 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3529 kmp_critical_name *lck) {
3531 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3533 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3535 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3537 if (packed_reduction_method == critical_reduce_block) {
3539 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3541 } else if (packed_reduction_method == empty_reduce_block) {
3543 // usage: if team size == 1, no synchronization is required ( on Intel
3544 // platforms only )
3546 } else if (packed_reduction_method == atomic_reduce_block) {
3548 // neither master nor other workers should get here
3549 // (code gen does not generate this call in case 2: atomic reduce block)
3550 // actually it's better to remove this elseif altogether;
3551 // after removal this value will be checked by the 'else' and will assert
3553 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3554 tree_reduce_block)) {
3556 // only master gets here
3560 // should never reach this block
3561 KMP_ASSERT(0); // "unexpected method"
3564 if (__kmp_env_consistency_check)
3565 __kmp_pop_sync(global_tid, ct_reduce, loc);
3567 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3568 global_tid, packed_reduction_method));
3573 /* 2.a.ii. Reduce Block with a terminating barrier */
3576 @ingroup SYNCHRONIZATION
3577 @param loc source location information
3578 @param global_tid global thread number
3579 @param num_vars number of items (variables) to be reduced
3580 @param reduce_size size of data in bytes to be reduced
3581 @param reduce_data pointer to data to be reduced
3582 @param reduce_func callback function providing reduction operation on two
3583 operands and returning result of reduction in lhs_data
3584 @param lck pointer to the unique lock data structure
3585 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3586 threads if atomic reduction needed
3588 A blocking reduce that includes an implicit barrier.
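Usage mirrors @ref __kmpc_reduce_nowait above, except that a thread receiving 1
must call __kmpc_end_reduce, which also takes part in the terminating barrier
(sketch; whether the atomic case also calls __kmpc_end_reduce is a codegen
choice):
@code
kmp_int32 res = __kmpc_reduce(loc, gtid, 1, sizeof(priv), &priv,
                              my_reduce_func, &crit);
if (res == 1) {
  global = combine(global, priv);
  __kmpc_end_reduce(loc, gtid, &crit);
}
@endcode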
3590 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3591 size_t reduce_size, void *reduce_data,
3592 void (*reduce_func)(void *lhs_data, void *rhs_data),
3593 kmp_critical_name *lck) {
3594 KMP_COUNT_BLOCK(REDUCE_wait);
3595 int retval = 0;
3596 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3597 kmp_info_t *th;
3598 kmp_team_t *team;
3599 int teams_swapped = 0, task_state;
3601 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3603 // why do we need this initialization here at all?
3604 // Reduction clause can not be a stand-alone directive.
3606 // do not call __kmp_serial_initialize(), it will be called by
3607 // __kmp_parallel_initialize() if needed
3608 // possible detection of false-positive race by the threadchecker ???
3609 if (!TCR_4(__kmp_init_parallel))
3610 __kmp_parallel_initialize();
3612 __kmp_resume_if_soft_paused();
3614 // check correctness of reduce block nesting
3615 #if KMP_USE_DYNAMIC_LOCK
3616 if (__kmp_env_consistency_check)
3617 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3619 if (__kmp_env_consistency_check)
3620 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3623 th = __kmp_thread_from_gtid(global_tid);
3624 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3626 packed_reduction_method = __kmp_determine_reduction_method(
3627 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3628 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3630 if (packed_reduction_method == critical_reduce_block) {
3632 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3633 retval = 1;
3635 } else if (packed_reduction_method == empty_reduce_block) {
3637 // usage: if team size == 1, no synchronization is required ( Intel
3638 // platforms only )
3640 retval = 1;
3641 } else if (packed_reduction_method == atomic_reduce_block) {
3643 retval = 2;
3645 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3646 tree_reduce_block)) {
3648 // case tree_reduce_block:
3649 // this barrier should be visible to a customer and to the threading profile
3650 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3652 ompt_frame_t *ompt_frame;
3653 if (ompt_enabled.enabled) {
3654 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3655 if (ompt_frame->enter_frame.ptr == NULL)
3656 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3657 OMPT_STORE_RETURN_ADDRESS(global_tid);
3661 __kmp_threads[global_tid]->th.th_ident =
3662 loc; // needed for correct notification of frames
3664 retval =
3665 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3666 global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3667 retval = (retval != 0) ? (0) : (1);
3668 #if OMPT_SUPPORT && OMPT_OPTIONAL
3669 if (ompt_enabled.enabled) {
3670 ompt_frame->enter_frame = ompt_data_none;
3674 // all other workers except master should do this pop here
3675 // ( none of other workers except master will enter __kmpc_end_reduce() )
3676 if (__kmp_env_consistency_check) {
3677 if (retval == 0) { // 0: all other workers; 1: master
3678 __kmp_pop_sync(global_tid, ct_reduce, loc);
3684 // should never reach this block
3685 KMP_ASSERT(0); // "unexpected method"
3687 if (teams_swapped) {
3688 __kmp_restore_swapped_teams(th, team, task_state);
3692 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3693 global_tid, packed_reduction_method, retval));
3695 return retval;
3696 }
3698 @ingroup SYNCHRONIZATION
3699 @param loc source location information
3700 @param global_tid global thread id.
3701 @param lck pointer to the unique lock data structure
3703 Finish the execution of a blocking reduce.
3704 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3705 <tt>__kmpc_reduce</tt> call.
3707 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3708 kmp_critical_name *lck) {
3710 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3711 kmp_info_t *th;
3712 kmp_team_t *team;
3713 int teams_swapped = 0, task_state;
3715 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3717 th = __kmp_thread_from_gtid(global_tid);
3718 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3720 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3722 // this barrier should be visible to a customer and to the threading profile
3723 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3725 if (packed_reduction_method == critical_reduce_block) {
3726 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3728 // TODO: implicit barrier: should be exposed
3730 ompt_frame_t *ompt_frame;
3731 if (ompt_enabled.enabled) {
3732 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3733 if (ompt_frame->enter_frame.ptr == NULL)
3734 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3735 OMPT_STORE_RETURN_ADDRESS(global_tid);
3739 __kmp_threads[global_tid]->th.th_ident = loc;
3741 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3742 #if OMPT_SUPPORT && OMPT_OPTIONAL
3743 if (ompt_enabled.enabled) {
3744 ompt_frame->enter_frame = ompt_data_none;
3748 } else if (packed_reduction_method == empty_reduce_block) {
3750 // usage: if team size==1, no synchronization is required (Intel platforms only)
3752 // TODO: implicit barrier: should be exposed
3754 ompt_frame_t *ompt_frame;
3755 if (ompt_enabled.enabled) {
3756 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3757 if (ompt_frame->enter_frame.ptr == NULL)
3758 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3759 OMPT_STORE_RETURN_ADDRESS(global_tid);
3763 __kmp_threads[global_tid]->th.th_ident = loc;
3765 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3766 #if OMPT_SUPPORT && OMPT_OPTIONAL
3767 if (ompt_enabled.enabled) {
3768 ompt_frame->enter_frame = ompt_data_none;
3772 } else if (packed_reduction_method == atomic_reduce_block) {
3775 ompt_frame_t *ompt_frame;
3776 if (ompt_enabled.enabled) {
3777 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3778 if (ompt_frame->enter_frame.ptr == NULL)
3779 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3780 OMPT_STORE_RETURN_ADDRESS(global_tid);
3783 // TODO: implicit barrier: should be exposed
3785 __kmp_threads[global_tid]->th.th_ident = loc;
3787 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3788 #if OMPT_SUPPORT && OMPT_OPTIONAL
3789 if (ompt_enabled.enabled) {
3790 ompt_frame->enter_frame = ompt_data_none;
3794 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3795 tree_reduce_block)) {
3797 // only master executes here (master releases all other workers)
3798 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3799 global_tid);
3803 // should never reach this block
3804 KMP_ASSERT(0); // "unexpected method"
3806 if (teams_swapped) {
3807 __kmp_restore_swapped_teams(th, team, task_state);
3810 if (__kmp_env_consistency_check)
3811 __kmp_pop_sync(global_tid, ct_reduce, loc);
3813 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3814 global_tid, packed_reduction_method));
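
/* Usage sketch (illustrative only, not part of the library): for a directive
   such as `#pragma omp parallel for reduction(+ : sum)` a compiler typically
   brackets the combining code with a __kmpc_reduce/__kmpc_end_reduce pair.
   The names reduce_func, my_sum and sum below are hypothetical placeholders.

     static kmp_critical_name crit = {0}; // zero-initialized lock storage
     int my_sum = 0; // thread-private partial result, accumulated in the loop
     switch (__kmpc_reduce(loc, gtid, 1, sizeof(my_sum), &my_sum, reduce_func,
                           &crit)) {
     case 1: // this thread combines directly, then signals completion
       sum += my_sum;
       __kmpc_end_reduce(loc, gtid, &crit);
       break;
     case 2: // combine with atomics, then still signal completion
       // (e.g. an atomic add of my_sum into sum)
       __kmpc_end_reduce(loc, gtid, &crit);
       break;
     case 0: // this thread does not participate in the combining
       break;
     }
*/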
#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */
kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid
kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid
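
/* Usage sketch (illustrative only): these queries are intended for tools and
   tracing; both return 0 when called from a thread the runtime does not know
   about.

     printf("task %llu (parent %llu)\n",
            (unsigned long long)__kmpc_get_taskid(),
            (unsigned long long)__kmpc_get_parent_taskid());
*/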
/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loops bounds.

Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }
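
  // Layout sketch (values hypothetical): for a nest collapsed with
  // ordered(2), dims[0]={lo=0,up=9,st=1} and dims[1]={lo=2,up=8,st=2},
  // th_doacross_info now holds
  //   [0]=2 (num_dims), [1]=&doacross_num_done,
  //   [2..4] = 0, 9, 1 (bounds of dim 0),
  //   [5..8] = 4, 2, 8, 2 (range length of dim 1, then its lo, up, st).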
  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);
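
  // E.g. for the hypothetical dims above: dim 0 contributes (9-0)/1+1 = 10
  // iterations and dim 1 contributes 4, so trace_count = 10 * 4 = 40 and the
  // flag array allocated below must provide at least 40 bits.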
  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                 __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
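
/* Usage sketch (illustrative only): for the doc-comment example above,

     #pragma omp for ordered(1)
     for (i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink : i - 2)
       // ... consume result of iteration i - 2 ...
       #pragma omp ordered depend(source)
     }

   a compiler would emit roughly this per-thread sequence, where lb and ub
   are the hypothetical inclusive bounds of this thread's chunk:

     struct kmp_dim dims = {2, 8, 2}; // lo, up, st (inclusive bounds)
     __kmpc_doacross_init(loc, gtid, 1, &dims);
     for (kmp_int64 i = lb; i <= ub; i += 2) {
       kmp_int64 vec = i - 2;
       __kmpc_doacross_wait(loc, gtid, &vec); // returns at once if vec is
                                              // out of bounds (e.g. i == 2)
       // ... loop body ...
       vec = i;
       __kmpc_doacross_post(loc, gtid, &vec); // mark iteration i finished
     }
     __kmpc_doacross_fini(loc, gtid);
*/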
void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
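
  // E.g. with the hypothetical 10 x 4 nest above, the point (3, 1) in loop
  // coordinates maps to iter_number = 1 + 4 * 3 = 13: a row-major
  // linearization with the innermost dimension varying fastest.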
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}
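
/* Flag addressing sketch: each iteration owns one bit of the shared flag
   array, 32 bits per kmp_uint32 word. Iteration 13, say, lives in word
   13 >> 5 = 0 at bit 13 % 32 = 13, so a waiter spins until
   (th_doacross_flags[0] & (1u << 13)) becomes nonzero, which the matching
   __kmpc_doacross_post() below sets with an atomic OR. */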
void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}
void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
}

void omp_free(void *ptr, omp_allocator_handle_t allocator) {
  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
}
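
/* Usage sketch (illustrative only): user code must free through the same
   allocator handle it allocated with; omp_high_bw_mem_alloc is one of the
   allocators predefined by OpenMP 5.0.

     double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                       omp_high_bw_mem_alloc);
     if (buf != NULL) {
       // ... use buf ...
       omp_free(buf, omp_high_bw_mem_alloc);
     }
*/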
int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}
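
/* Usage sketch (assumption: the kmp_target_offload_kind enumeration from
   kmp.h, with disabled/default/mandatory values): an offload library could
   query whether OMP_TARGET_OFFLOAD=mandatory was requested.

     if (__kmpc_get_target_offload() == tgt_mandatory) {
       // failure to offload must be treated as an error,
       // not as a fallback to host execution
     }
*/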
int __kmpc_pause_resource(kmp_pause_status_t level) {
  if (!__kmp_init_serial) {
    return 1; // Can't pause if runtime is not initialized
  }
  return __kmp_pause_resource(level);
}
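
/* Usage sketch (illustrative only): user code reaches this entry point via
   the OpenMP 5.0 pause API; a nonzero return means the request was not
   honored.

     if (omp_pause_resource_all(omp_pause_soft) != 0) {
       // runtime could not release its resources
     }
*/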