2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
5 //===----------------------------------------------------------------------===//
7 // The LLVM Compiler Infrastructure
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
12 //===----------------------------------------------------------------------===//
15 #include "omp.h" /* extern "C" declarations of user-visible routines */
17 #include "kmp_error.h"
21 #include "kmp_stats.h"
24 #include "ompt-specific.h"
27 #define MAX_MESSAGE 512
29 // flags will be used in the future, e.g., to implement the openmp_strict library
33 * @ingroup STARTUP_SHUTDOWN
34 * @param loc in source location information
35 * @param flags in for future use (currently ignored)
37 * Initialize the runtime library. This call is optional; if it is not made then
38 * it will be implicitly called by attempts to use other library functions.
40 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
41 // By default __kmpc_begin() is no-op.
43 if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
44 __kmp_str_match_true(env)) {
45 __kmp_middle_initialize();
46 KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
47 } else if (__kmp_ignore_mppbeg() == FALSE) {
48 // By default __kmp_ignore_mppbeg() returns TRUE.
49 __kmp_internal_begin();
50 KC_TRACE(10, ("__kmpc_begin: called\n"));
55 * @ingroup STARTUP_SHUTDOWN
56 * @param loc source location information
58 * Shut down the runtime library. This is also optional, and even if called it will
59 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to zero.
62 void __kmpc_end(ident_t *loc) {
63 // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end()
64 // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
65 // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
66 // returns FALSE and __kmpc_end() will unregister this root (which can cause
67 // library shutdown).
68 if (__kmp_ignore_mppend() == FALSE) {
69 KC_TRACE(10, ("__kmpc_end: called\n"));
70 KA_TRACE(30, ("__kmpc_end\n"));
72 __kmp_internal_end_thread(-1);
74 #if KMP_OS_WINDOWS && OMPT_SUPPORT
75 // Normal exit process on Windows does not allow worker threads of the final
76 // parallel region to finish reporting their events, so shutting down the
77 // library here fixes the issue at least for the cases where __kmpc_end() is
79 if (ompt_enabled.enabled)
80 __kmp_internal_end_library(__kmp_gtid_get_specific());
85 @ingroup THREAD_STATES
86 @param loc Source location information.
87 @return The global thread index of the active thread.
89 This function can be called in any context.
91 If the runtime has only been entered at the outermost level from a
92 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
93 that which would be returned by omp_get_thread_num() in the outermost
94 active parallel construct. (Or zero if there is no active parallel
95 construct, since the master thread is necessarily thread zero).
97 If multiple non-OpenMP threads all enter an OpenMP construct then this
98 will be a unique thread identifier among all the threads created by
99 the OpenMP runtime (but the value cannot be defined in terms of
100 OpenMP thread ids returned by omp_get_thread_num()).
102 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
103 kmp_int32 gtid = __kmp_entry_gtid();
105 KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
111 @ingroup THREAD_STATES
112 @param loc Source location information.
113 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
115 This function can be called in any context.
116 It returns the total number of threads under the control of the OpenMP runtime.
117 That is not a number that can be determined by any OpenMP standard calls, since
118 the library may be called from more than one non-OpenMP thread, and this
119 reflects the total over all such calls. Similarly, the runtime maintains
120 underlying threads even when they are not active (since the cost of creating
121 and destroying OS threads is high); this call counts all such threads even if
122 they are not waiting for work.
124 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
126 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
128 return TCR_4(__kmp_all_nth);
132 @ingroup THREAD_STATES
133 @param loc Source location information.
134 @return The thread number of the calling thread in the innermost active parallel construct.
137 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
138 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
139 return __kmp_tid_from_gtid(__kmp_entry_gtid());
143 @ingroup THREAD_STATES
144 @param loc Source location information.
145 @return The number of threads in the innermost active parallel construct.
147 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
148 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
150 return __kmp_entry_thread()->th.th_team->t.t_nproc;
154 * @ingroup DEPRECATED
155 * @param loc location description
157 * This function need not be called. It always returns TRUE.
159 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
170 if (__kmp_par_range == 0) {
173 semi2 = loc->psource;
177 semi2 = strchr(semi2, ';');
181 semi2 = strchr(semi2 + 1, ';');
185 if (__kmp_par_range_filename[0]) {
186 const char *name = semi2 - 1;
187 while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
190 if ((*name == '/') || (*name == ';')) {
193 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
194 return __kmp_par_range < 0;
197 semi3 = strchr(semi2 + 1, ';');
198 if (__kmp_par_range_routine[0]) {
199 if ((semi3 != NULL) && (semi3 > semi2) &&
200 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
201 return __kmp_par_range < 0;
204 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
205 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
206 return __kmp_par_range > 0;
208 return __kmp_par_range < 0;
212 #endif /* KMP_DEBUG */
216 @ingroup THREAD_STATES
217 @param loc Source location information.
218 @return 1 if this thread is executing inside an active parallel region, zero if not.
221 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
222 return __kmp_entry_thread()->th.th_root->r.r_active;
227 @param loc source location information
228 @param global_tid global thread number
229 @param num_threads number of threads requested for this parallel construct
231 Set the number of threads to be used by the next fork spawned by this thread.
232 This call is only required if the parallel construct has a `num_threads` clause.
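// Illustrative sketch (an assumption, not code generated by this file): a
// compiler might lower `#pragma omp parallel num_threads(4)` roughly as below;
// `loc`, `outlined` and `x` are hypothetical names for the generated ident_t,
// outlined microtask and a shared variable.
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_threads(&loc, gtid, 4);
//   __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined, &x);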
234 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
235 kmp_int32 num_threads) {
236 KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
237 global_tid, num_threads));
239 __kmp_push_num_threads(loc, global_tid, num_threads);
242 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
243 KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
245 /* the num_threads are automatically popped */
250 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
251 kmp_int32 proc_bind) {
252 KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
255 __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
258 #endif /* OMP_40_ENABLED */
262 @param loc source location information
263 @param argc total number of arguments in the ellipsis
264 @param microtask pointer to callback routine consisting of outlined parallel construct
266 @param ... pointers to shared variables that aren't global
268 Do the actual fork and call the microtask in the relevant number of threads.
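// Illustrative sketch (assumption; names are hypothetical): the microtask is
// the compiler-outlined body of the parallel region. Its first two parameters
// receive the global and bound thread ids, followed by the shared-variable
// pointers passed through the ellipsis of __kmpc_fork_call.
//
//   void outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *x) {
//     /* body of the parallel region, executed by every thread in the team */
//   }
//   ...
//   __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined, &x);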
270 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
271 int gtid = __kmp_entry_gtid();
273 #if (KMP_STATS_ENABLED)
274 // If we were in a serial region, then stop the serial timer, record
275 // the event, and start parallel region timer
276 stats_state_e previous_state = KMP_GET_THREAD_STATE();
277 if (previous_state == stats_state_e::SERIAL_REGION) {
278 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
280 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
282 int inParallel = __kmpc_in_parallel(loc);
284 KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
286 KMP_COUNT_BLOCK(OMP_PARALLEL);
290 // maybe to save thr_state is enough here
293 va_start(ap, microtask);
296 ompt_frame_t *ompt_frame;
297 if (ompt_enabled.enabled) {
298 kmp_info_t *master_th = __kmp_threads[gtid];
299 kmp_team_t *parent_team = master_th->th.th_team;
300 ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
302 ompt_frame = &(lwt->ompt_task_info.frame);
304 int tid = __kmp_tid_from_gtid(gtid);
306 parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
308 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
309 OMPT_STORE_RETURN_ADDRESS(gtid);
313 #if INCLUDE_SSC_MARKS
316 __kmp_fork_call(loc, gtid, fork_context_intel, argc,
317 VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
318 VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
319 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
320 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
326 #if INCLUDE_SSC_MARKS
329 __kmp_join_call(loc, gtid
339 #if KMP_STATS_ENABLED
340 if (previous_state == stats_state_e::SERIAL_REGION) {
341 KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
343 KMP_POP_PARTITIONED_TIMER();
345 #endif // KMP_STATS_ENABLED
351 @param loc source location information
352 @param global_tid global thread number
353 @param num_teams number of teams requested for the teams construct
354 @param num_threads number of threads per team requested for the teams construct
356 Set the number of teams to be used by the teams construct.
357 This call is only required if the teams construct has a `num_teams` clause
358 or a `thread_limit` clause (or both).
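// Illustrative sketch (assumption; hypothetical names): for
// `#pragma omp teams num_teams(8) thread_limit(4)` a compiler might emit:
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_teams(&loc, gtid, /*num_teams=*/8, /*num_threads=*/4);
//   __kmpc_fork_teams(&loc, /*argc=*/1, (kmpc_micro)outlined_teams, &x);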
360 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
361 kmp_int32 num_teams, kmp_int32 num_threads) {
363 ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
364 global_tid, num_teams, num_threads));
366 __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
371 @param loc source location information
372 @param argc total number of arguments in the ellipsis
373 @param microtask pointer to callback routine consisting of outlined teams construct
375 @param ... pointers to shared variables that aren't global
377 Do the actual fork and call the microtask in the relevant number of threads.
379 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
381 int gtid = __kmp_entry_gtid();
382 kmp_info_t *this_thr = __kmp_threads[gtid];
384 va_start(ap, microtask);
386 KMP_COUNT_BLOCK(OMP_TEAMS);
388 // remember teams entry point and nesting level
389 this_thr->th.th_teams_microtask = microtask;
390 this_thr->th.th_teams_level =
391 this_thr->th.th_team->t.t_level; // AC: can be >0 on host
394 kmp_team_t *parent_team = this_thr->th.th_team;
395 int tid = __kmp_tid_from_gtid(gtid);
396 if (ompt_enabled.enabled) {
397 parent_team->t.t_implicit_task_taskdata[tid]
398 .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
400 OMPT_STORE_RETURN_ADDRESS(gtid);
403 // check if __kmpc_push_num_teams was called; if not, set the default number of teams
405 if (this_thr->th.th_teams_size.nteams == 0) {
406 __kmp_push_num_teams(loc, gtid, 0, 0);
408 KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
409 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
410 KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
412 __kmp_fork_call(loc, gtid, fork_context_intel, argc,
413 VOLATILE_CAST(microtask_t)
414 __kmp_teams_master, // "wrapped" task
415 VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
416 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
422 __kmp_join_call(loc, gtid
429 this_thr->th.th_teams_microtask = NULL;
430 this_thr->th.th_teams_level = 0;
431 *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
434 #endif /* OMP_40_ENABLED */
436 // I don't think this function should ever have been exported.
437 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
438 // openmp code ever called it, but it's been exported from the RTL for so
439 // long that I'm afraid to remove the definition.
440 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
444 @param loc source location information
445 @param global_tid global thread number
447 Enter a serialized parallel construct. This interface is used to handle a
448 conditional parallel region, like this,
450 #pragma omp parallel if (condition)
452 when the condition is false.
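// Illustrative sketch (assumption; hypothetical names): for a conditional
// parallel region the compiler branches at runtime between the forked and the
// serialized path:
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   if (condition) {
//     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
//   } else {
//     __kmpc_serialized_parallel(&loc, gtid);
//     kmp_int32 bound_tid = 0;
//     outlined(&gtid, &bound_tid, &x); // body runs on the encountering thread
//     __kmpc_end_serialized_parallel(&loc, gtid);
//   }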
454 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
455 // The implementation is now in kmp_runtime.cpp so that it can share static
456 // functions with kmp_fork_call since the tasks to be done are similar in
459 OMPT_STORE_RETURN_ADDRESS(global_tid);
461 __kmp_serialized_parallel(loc, global_tid);
466 @param loc source location information
467 @param global_tid global thread number
469 Leave a serialized parallel construct.
471 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
472 kmp_internal_control_t *top;
473 kmp_info_t *this_thr;
474 kmp_team_t *serial_team;
477 ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
479 /* skip all this code for autopar serialized loops since it results in
480 unacceptable overhead */
481 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
485 if (!TCR_4(__kmp_init_parallel))
486 __kmp_parallel_initialize();
488 this_thr = __kmp_threads[global_tid];
489 serial_team = this_thr->th.th_serial_team;
492 kmp_task_team_t *task_team = this_thr->th.th_task_team;
494 // we need to wait for the proxy tasks before finishing the thread
495 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
496 __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
500 KMP_DEBUG_ASSERT(serial_team);
501 KMP_ASSERT(serial_team->t.t_serialized);
502 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
503 KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
504 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
505 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
508 if (ompt_enabled.enabled &&
509 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
510 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
511 if (ompt_enabled.ompt_callback_implicit_task) {
512 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
513 ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
514 OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
517 // reset (clear) the task id only after unlinking the task
518 ompt_data_t *parent_task_data;
519 __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
521 if (ompt_enabled.ompt_callback_parallel_end) {
522 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
523 &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
524 ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
526 __ompt_lw_taskteam_unlink(this_thr);
527 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
531 /* If necessary, pop the internal control stack values and replace the team
533 top = serial_team->t.t_control_stack_top;
534 if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
535 copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
536 serial_team->t.t_control_stack_top = top->next;
540 // if( serial_team -> t.t_serialized > 1 )
541 serial_team->t.t_level--;
543 /* pop dispatch buffers stack */
544 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
546 dispatch_private_info_t *disp_buffer =
547 serial_team->t.t_dispatch->th_disp_buffer;
548 serial_team->t.t_dispatch->th_disp_buffer =
549 serial_team->t.t_dispatch->th_disp_buffer->next;
550 __kmp_free(disp_buffer);
553 this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
556 --serial_team->t.t_serialized;
557 if (serial_team->t.t_serialized == 0) {
559 /* return to the parallel section */
561 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
562 if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
563 __kmp_clear_x87_fpu_status_word();
564 __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
565 __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
567 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
569 this_thr->th.th_team = serial_team->t.t_parent;
570 this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
572 /* restore values cached in the thread */
573 this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */
574 this_thr->th.th_team_master =
575 serial_team->t.t_parent->t.t_threads[0]; /* JPH */
576 this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
578 /* TODO the below shouldn't need to be adjusted for serialized teams */
579 this_thr->th.th_dispatch =
580 &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
582 __kmp_pop_current_task_from_thread(this_thr);
584 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
585 this_thr->th.th_current_task->td_flags.executing = 1;
587 if (__kmp_tasking_mode != tskm_immediate_exec) {
588 // Copy the task team from the new child / old parent team to the thread.
589 this_thr->th.th_task_team =
590 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
592 ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
594 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
597 if (__kmp_tasking_mode != tskm_immediate_exec) {
598 KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
599 "depth of serial team %p to %d\n",
600 global_tid, serial_team, serial_team->t.t_serialized));
604 if (__kmp_env_consistency_check)
605 __kmp_pop_parallel(global_tid, NULL);
607 if (ompt_enabled.enabled)
608 this_thr->th.ompt_thread_info.state =
609 ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
610 : ompt_state_work_parallel);
615 @ingroup SYNCHRONIZATION
616 @param loc source location information.
618 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
619 depending on the memory ordering convention obeyed by the compiler
620 even that may not be necessary).
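// Illustrative sketch (assumption): `#pragma omp flush` lowers to a call to
// this function, for example in a spin loop on a shared flag (see the yield
// discussion in the function body below):
//
//   while (!flag) {
//     __kmpc_flush(&loc); // full memory fence
//   }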
622 void __kmpc_flush(ident_t *loc) {
623 KC_TRACE(10, ("__kmpc_flush: called\n"));
625 /* need explicit __mf() here since we use volatile instead in the library */
626 KMP_MB(); /* Flush all pending memory write invalidates. */
628 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
630 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
631 // We shouldn't need it, though, since the ABI rules require that
632 // * If the compiler generates NGO stores it also generates the fence
633 // * If users hand-code NGO stores they should insert the fence
634 // therefore no incomplete unordered stores should be visible.
637 // This is to address non-temporal store instructions (sfence needed).
638 // The clflush instruction also needs to be addressed (mfence needed).
639 // Probably the non-temporal load movntdqa instruction should also be addressed.
641 // mfence is an SSE2 instruction. Do not execute it if the CPU does not support SSE2.
642 if (!__kmp_cpuinfo.initialized) {
643 __kmp_query_cpuid(&__kmp_cpuinfo);
645 if (!__kmp_cpuinfo.sse2) {
646 // CPU cannot execute SSE2 instructions.
650 #elif KMP_COMPILER_MSVC
653 __sync_synchronize();
654 #endif // KMP_COMPILER_ICC
657 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
658 // Nothing to see here move along
660 // Nothing needed here (we have a real MB above).
662 // The flushing thread needs to yield here; this prevents a
663 // busy-waiting thread from saturating the pipeline. flush is
664 // often used in loops like this:
666 // #pragma omp flush(flag)
668 // and adding the yield here is good for at least a 10x speedup
669 // when running >2 threads per core (on the NAS LU benchmark).
673 #error Unknown or unsupported architecture
676 #if OMPT_SUPPORT && OMPT_OPTIONAL
677 if (ompt_enabled.ompt_callback_flush) {
678 ompt_callbacks.ompt_callback(ompt_callback_flush)(
679 __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
684 /* -------------------------------------------------------------------------- */
686 @ingroup SYNCHRONIZATION
687 @param loc source location information
688 @param global_tid thread id.
692 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
693 KMP_COUNT_BLOCK(OMP_BARRIER);
694 KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
696 if (!TCR_4(__kmp_init_parallel))
697 __kmp_parallel_initialize();
699 if (__kmp_env_consistency_check) {
701 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
704 __kmp_check_barrier(global_tid, ct_barrier, loc);
708 ompt_frame_t *ompt_frame;
709 if (ompt_enabled.enabled) {
710 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
711 if (ompt_frame->enter_frame.ptr == NULL)
712 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
713 OMPT_STORE_RETURN_ADDRESS(global_tid);
716 __kmp_threads[global_tid]->th.th_ident = loc;
717 // TODO: explicit barrier_wait_id:
718 // this function is called when 'barrier' directive is present or
719 // implicit barrier at the end of a worksharing construct.
720 // 1) better to add a per-thread barrier counter to a thread data structure
721 // 2) set to 0 when a new team is created
722 // 4) no sync is required
724 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
725 #if OMPT_SUPPORT && OMPT_OPTIONAL
726 if (ompt_enabled.enabled) {
727 ompt_frame->enter_frame = ompt_data_none;
732 /* The BARRIER for a MASTER section is always explicit */
734 @ingroup WORK_SHARING
735 @param loc source location information.
736 @param global_tid global thread number .
737 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
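// Illustrative sketch (assumption; `gtid` obtained from
// __kmpc_global_thread_num): a `master` construct is typically lowered as a
// guarded call pair:
//
//   if (__kmpc_master(&loc, gtid)) {
//     /* code executed by the master thread only */
//     __kmpc_end_master(&loc, gtid);
//   }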
739 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
742 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
744 if (!TCR_4(__kmp_init_parallel))
745 __kmp_parallel_initialize();
747 if (KMP_MASTER_GTID(global_tid)) {
748 KMP_COUNT_BLOCK(OMP_MASTER);
749 KMP_PUSH_PARTITIONED_TIMER(OMP_master);
753 #if OMPT_SUPPORT && OMPT_OPTIONAL
755 if (ompt_enabled.ompt_callback_master) {
756 kmp_info_t *this_thr = __kmp_threads[global_tid];
757 kmp_team_t *team = this_thr->th.th_team;
759 int tid = __kmp_tid_from_gtid(global_tid);
760 ompt_callbacks.ompt_callback(ompt_callback_master)(
761 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
762 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
763 OMPT_GET_RETURN_ADDRESS(0));
768 if (__kmp_env_consistency_check) {
769 #if KMP_USE_DYNAMIC_LOCK
771 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
773 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
776 __kmp_push_sync(global_tid, ct_master, loc, NULL);
778 __kmp_check_sync(global_tid, ct_master, loc, NULL);
786 @ingroup WORK_SHARING
787 @param loc source location information.
788 @param global_tid global thread number .
790 Mark the end of a <tt>master</tt> region. This should only be called by the
791 thread that executes the <tt>master</tt> region.
793 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
794 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
796 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
797 KMP_POP_PARTITIONED_TIMER();
799 #if OMPT_SUPPORT && OMPT_OPTIONAL
800 kmp_info_t *this_thr = __kmp_threads[global_tid];
801 kmp_team_t *team = this_thr->th.th_team;
802 if (ompt_enabled.ompt_callback_master) {
803 int tid = __kmp_tid_from_gtid(global_tid);
804 ompt_callbacks.ompt_callback(ompt_callback_master)(
805 ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
806 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
807 OMPT_GET_RETURN_ADDRESS(0));
811 if (__kmp_env_consistency_check) {
813 KMP_WARNING(ThreadIdentInvalid);
815 if (KMP_MASTER_GTID(global_tid))
816 __kmp_pop_sync(global_tid, ct_master, loc);
821 @ingroup WORK_SHARING
822 @param loc source location information.
823 @param gtid global thread number.
825 Start execution of an <tt>ordered</tt> construct.
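// Illustrative sketch (assumption): inside the body of an ordered loop the
// compiler brackets the `ordered` region with enter/exit calls:
//
//   __kmpc_ordered(&loc, gtid);
//   /* code that must run in iteration order */
//   __kmpc_end_ordered(&loc, gtid);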
827 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
830 KMP_DEBUG_ASSERT(__kmp_init_serial);
832 KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
834 if (!TCR_4(__kmp_init_parallel))
835 __kmp_parallel_initialize();
838 __kmp_itt_ordered_prep(gtid);
839 // TODO: ordered_wait_id
840 #endif /* USE_ITT_BUILD */
842 th = __kmp_threads[gtid];
844 #if OMPT_SUPPORT && OMPT_OPTIONAL
848 if (ompt_enabled.enabled) {
849 OMPT_STORE_RETURN_ADDRESS(gtid);
850 team = __kmp_team_from_gtid(gtid);
851 lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
852 /* OMPT state update */
853 th->th.ompt_thread_info.wait_id = lck;
854 th->th.ompt_thread_info.state = ompt_state_wait_ordered;
856 /* OMPT event callback */
857 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
858 if (ompt_enabled.ompt_callback_mutex_acquire) {
859 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
860 ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
861 (ompt_wait_id_t)lck, codeptr_ra);
866 if (th->th.th_dispatch->th_deo_fcn != 0)
867 (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
869 __kmp_parallel_deo(&gtid, &cid, loc);
871 #if OMPT_SUPPORT && OMPT_OPTIONAL
872 if (ompt_enabled.enabled) {
873 /* OMPT state update */
874 th->th.ompt_thread_info.state = ompt_state_work_parallel;
875 th->th.ompt_thread_info.wait_id = 0;
877 /* OMPT event callback */
878 if (ompt_enabled.ompt_callback_mutex_acquired) {
879 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
880 ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
886 __kmp_itt_ordered_start(gtid);
887 #endif /* USE_ITT_BUILD */
891 @ingroup WORK_SHARING
892 @param loc source location information.
893 @param gtid global thread number.
895 End execution of an <tt>ordered</tt> construct.
897 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
901 KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
904 __kmp_itt_ordered_end(gtid);
905 // TODO: ordered_wait_id
906 #endif /* USE_ITT_BUILD */
908 th = __kmp_threads[gtid];
910 if (th->th.th_dispatch->th_dxo_fcn != 0)
911 (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
913 __kmp_parallel_dxo(&gtid, &cid, loc);
915 #if OMPT_SUPPORT && OMPT_OPTIONAL
916 OMPT_STORE_RETURN_ADDRESS(gtid);
917 if (ompt_enabled.ompt_callback_mutex_released) {
918 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
920 (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
921 OMPT_LOAD_RETURN_ADDRESS(gtid));
926 #if KMP_USE_DYNAMIC_LOCK
928 static __forceinline void
929 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
930 kmp_int32 gtid, kmp_indirect_locktag_t tag) {
931 // Pointer to the allocated indirect lock is written to crit, while indexing
934 kmp_indirect_lock_t **lck;
935 lck = (kmp_indirect_lock_t **)crit;
936 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
937 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
938 KMP_SET_I_LOCK_LOCATION(ilk, loc);
939 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
941 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
943 __kmp_itt_critical_creating(ilk->lock, loc);
945 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
948 __kmp_itt_critical_destroyed(ilk->lock);
950 // We don't really need to destroy the unclaimed lock here since it will be
951 // cleaned up at program exit.
952 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
954 KMP_DEBUG_ASSERT(*lck != NULL);
957 // Fast-path acquire tas lock
958 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
960 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
961 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
962 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
963 if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
964 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
966 KMP_FSYNC_PREPARE(l); \
967 KMP_INIT_YIELD(spins); \
968 if (TCR_4(__kmp_nth) > \
969 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
972 KMP_YIELD_SPIN(spins); \
974 kmp_backoff_t backoff = __kmp_spin_backoff_params; \
976 KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
977 !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
978 __kmp_spin_backoff(&backoff); \
979 if (TCR_4(__kmp_nth) > \
980 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
983 KMP_YIELD_SPIN(spins); \
987 KMP_FSYNC_ACQUIRED(l); \
990 // Fast-path test tas lock
991 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
993 kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
994 kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
995 kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
996 rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
997 __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
1000 // Fast-path release tas lock
1001 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \
1002 { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
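// The three macros above implement the test-and-set fast path. A minimal
// free-standing sketch of the same idea using std::atomic (an illustration
// only; the runtime's real lock additionally handles backoff, yielding and
// the ITT/FSYNC hooks) might look like this:
//
//   #include <atomic>
//   struct tas_sketch {
//     std::atomic<int> poll{0}; // 0 == free, otherwise gtid + 1 of the owner
//     void acquire(int gtid) {
//       int free_val = 0;
//       while (poll.load(std::memory_order_relaxed) != 0 ||
//              !poll.compare_exchange_strong(free_val, gtid + 1,
//                                            std::memory_order_acquire)) {
//         free_val = 0; // a failed CAS overwrites the expected value
//       }
//     }
//     bool try_acquire(int gtid) {
//       int free_val = 0;
//       return poll.load(std::memory_order_relaxed) == 0 &&
//              poll.compare_exchange_strong(free_val, gtid + 1,
//                                           std::memory_order_acquire);
//     }
//     void release() { poll.store(0, std::memory_order_release); }
//   };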
1006 #include <sys/syscall.h>
1009 #define FUTEX_WAIT 0
1012 #define FUTEX_WAKE 1
1015 // Fast-path acquire futex lock
1016 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \
1018 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1019 kmp_int32 gtid_code = (gtid + 1) << 1; \
1021 KMP_FSYNC_PREPARE(ftx); \
1022 kmp_int32 poll_val; \
1023 while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \
1024 &(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1025 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
1026 kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
1028 if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \
1030 KMP_LOCK_BUSY(1, futex))) { \
1033 poll_val |= KMP_LOCK_BUSY(1, futex); \
1036 if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \
1037 NULL, NULL, 0)) != 0) { \
1042 KMP_FSYNC_ACQUIRED(ftx); \
1045 // Fast-path test futex lock
1046 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \
1048 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1049 if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
1050 KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \
1051 KMP_FSYNC_ACQUIRED(ftx); \
1058 // Fast-path release futex lock
1059 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \
1061 kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1063 KMP_FSYNC_RELEASING(ftx); \
1064 kmp_int32 poll_val = \
1065 KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
1066 if (KMP_LOCK_STRIP(poll_val) & 1) { \
1067 syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \
1068 KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1071 KMP_YIELD(TCR_4(__kmp_nth) > \
1072 (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
1075 #endif // KMP_USE_FUTEX
1077 #else // KMP_USE_DYNAMIC_LOCK
1079 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1082 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1084 // Because of the double-check, the following load doesn't need to be volatile
1085 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1090 // Allocate & initialize the lock.
1091 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1092 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1093 __kmp_init_user_lock_with_checks(lck);
1094 __kmp_set_user_lock_location(lck, loc);
1096 __kmp_itt_critical_creating(lck);
1097 // __kmp_itt_critical_creating() should be called *before* the first usage
1098 // of the underlying lock. It is the only place where we can guarantee it. There
1099 // is a chance the lock will be destroyed with no usage, but it is not a
1100 // problem, because this is not a real event seen by the user but rather sets the
1101 // name for the object (lock). See more details in kmp_itt.h.
1102 #endif /* USE_ITT_BUILD */
1104 // Use a cmpxchg instruction to slam the start of the critical section with
1105 // the lock pointer. If another thread beat us to it, deallocate the lock,
1106 // and use the lock that the other thread allocated.
1107 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1110 // Deallocate the lock and reload the value.
1112 __kmp_itt_critical_destroyed(lck);
1113 // Let ITT know the lock is destroyed and the same memory location may be reused
1114 // for another purpose.
1115 #endif /* USE_ITT_BUILD */
1116 __kmp_destroy_user_lock_with_checks(lck);
1117 __kmp_user_lock_free(&idx, gtid, lck);
1118 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1119 KMP_DEBUG_ASSERT(lck != NULL);
1125 #endif // KMP_USE_DYNAMIC_LOCK
1128 @ingroup WORK_SHARING
1129 @param loc source location information.
1130 @param global_tid global thread number .
1131 @param crit identity of the critical section. This could be a pointer to a lock
1132 associated with the critical section, or some other suitably unique value.
1134 Enter code protected by a `critical` construct.
1135 This function blocks until the executing thread can enter the critical section.
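// Illustrative sketch (assumption; hypothetical names): the compiler would
// typically emit a zero-initialized static kmp_critical_name per named
// critical section and bracket the protected region:
//
//   static kmp_critical_name crit_name = {0};
//   ...
//   __kmpc_critical(&loc, gtid, &crit_name);
//   /* protected code */
//   __kmpc_end_critical(&loc, gtid, &crit_name);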
1137 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1138 kmp_critical_name *crit) {
1139 #if KMP_USE_DYNAMIC_LOCK
1140 #if OMPT_SUPPORT && OMPT_OPTIONAL
1141 OMPT_STORE_RETURN_ADDRESS(global_tid);
1142 #endif // OMPT_SUPPORT
1143 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1145 KMP_COUNT_BLOCK(OMP_CRITICAL);
1146 #if OMPT_SUPPORT && OMPT_OPTIONAL
1147 ompt_state_t prev_state = ompt_state_undefined;
1148 ompt_thread_info_t ti;
1150 kmp_user_lock_p lck;
1152 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1154 // TODO: add THR_OVHD_STATE
1156 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1157 KMP_CHECK_USER_LOCK_INIT();
1159 if ((__kmp_user_lock_kind == lk_tas) &&
1160 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1161 lck = (kmp_user_lock_p)crit;
1164 else if ((__kmp_user_lock_kind == lk_futex) &&
1165 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1166 lck = (kmp_user_lock_p)crit;
1169 else { // ticket, queuing or drdpa
1170 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1173 if (__kmp_env_consistency_check)
1174 __kmp_push_sync(global_tid, ct_critical, loc, lck);
1176 // since the critical directive binds to all threads, not just the current
1177 // team, we have to check this even if we are in a serialized team.
1178 // also, even if we are the uber thread, we still have to acquire the lock,
1179 // as we have to contend with sibling threads.
1182 __kmp_itt_critical_acquiring(lck);
1183 #endif /* USE_ITT_BUILD */
1184 #if OMPT_SUPPORT && OMPT_OPTIONAL
1185 OMPT_STORE_RETURN_ADDRESS(gtid);
1186 void *codeptr_ra = NULL;
1187 if (ompt_enabled.enabled) {
1188 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1189 /* OMPT state update */
1190 prev_state = ti.state;
1191 ti.wait_id = (ompt_wait_id_t)lck;
1192 ti.state = ompt_state_wait_critical;
1194 /* OMPT event callback */
1195 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1196 if (ompt_enabled.ompt_callback_mutex_acquire) {
1197 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1198 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1199 (ompt_wait_id_t)crit, codeptr_ra);
1203 // Value of 'crit' should be good for using as a critical_id of the critical
1204 // section directive.
1205 __kmp_acquire_user_lock_with_checks(lck, global_tid);
1208 __kmp_itt_critical_acquired(lck);
1209 #endif /* USE_ITT_BUILD */
1210 #if OMPT_SUPPORT && OMPT_OPTIONAL
1211 if (ompt_enabled.enabled) {
1212 /* OMPT state update */
1213 ti.state = prev_state;
1216 /* OMPT event callback */
1217 if (ompt_enabled.ompt_callback_mutex_acquired) {
1218 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1219 ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1223 KMP_POP_PARTITIONED_TIMER();
1225 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1226 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1227 #endif // KMP_USE_DYNAMIC_LOCK
1230 #if KMP_USE_DYNAMIC_LOCK
1232 // Converts the given hint to an internal lock implementation
1233 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1235 #define KMP_TSX_LOCK(seq) lockseq_##seq
1237 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1240 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1241 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1243 #define KMP_CPUINFO_RTM 0
1246 // Hints that do not require further logic
1247 if (hint & kmp_lock_hint_hle)
1248 return KMP_TSX_LOCK(hle);
1249 if (hint & kmp_lock_hint_rtm)
1250 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1251 if (hint & kmp_lock_hint_adaptive)
1252 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1254 // Rule out conflicting hints first by returning the default lock
1255 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1256 return __kmp_user_lock_seq;
1257 if ((hint & omp_lock_hint_speculative) &&
1258 (hint & omp_lock_hint_nonspeculative))
1259 return __kmp_user_lock_seq;
1261 // Do not even consider speculation when it appears to be contended
1262 if (hint & omp_lock_hint_contended)
1263 return lockseq_queuing;
1265 // Uncontended lock without speculation
1266 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1269 // HLE lock for speculation
1270 if (hint & omp_lock_hint_speculative)
1271 return KMP_TSX_LOCK(hle);
1273 return __kmp_user_lock_seq;
1276 #if OMPT_SUPPORT && OMPT_OPTIONAL
1277 #if KMP_USE_DYNAMIC_LOCK
1278 static kmp_mutex_impl_t
1279 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1281 switch (KMP_EXTRACT_D_TAG(user_lock)) {
1286 return kmp_mutex_impl_queuing;
1289 return kmp_mutex_impl_spin;
1292 return kmp_mutex_impl_speculative;
1295 return kmp_mutex_impl_none;
1297 ilock = KMP_LOOKUP_I_LOCK(user_lock);
1300 switch (ilock->type) {
1302 case locktag_adaptive:
1304 return kmp_mutex_impl_speculative;
1306 case locktag_nested_tas:
1307 return kmp_mutex_impl_spin;
1309 case locktag_nested_futex:
1311 case locktag_ticket:
1312 case locktag_queuing:
1314 case locktag_nested_ticket:
1315 case locktag_nested_queuing:
1316 case locktag_nested_drdpa:
1317 return kmp_mutex_impl_queuing;
1319 return kmp_mutex_impl_none;
1323 // For locks without dynamic binding
1324 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1325 switch (__kmp_user_lock_kind) {
1327 return kmp_mutex_impl_spin;
1334 return kmp_mutex_impl_queuing;
1339 return kmp_mutex_impl_speculative;
1342 return kmp_mutex_impl_none;
1345 #endif // KMP_USE_DYNAMIC_LOCK
1346 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1349 @ingroup WORK_SHARING
1350 @param loc source location information.
1351 @param global_tid global thread number.
1352 @param crit identity of the critical section. This could be a pointer to a lock
1353 associated with the critical section, or some other suitably unique value.
1354 @param hint the lock hint.
1356 Enter code protected by a `critical` construct with a hint. The hint value is
1357 used to suggest a lock implementation. This function blocks until the executing
1358 thread can enter the critical section unless the hint suggests use of
1359 speculative execution and the hardware supports it.
1361 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1362 kmp_critical_name *crit, uint32_t hint) {
1363 KMP_COUNT_BLOCK(OMP_CRITICAL);
1364 kmp_user_lock_p lck;
1365 #if OMPT_SUPPORT && OMPT_OPTIONAL
1366 ompt_state_t prev_state = ompt_state_undefined;
1367 ompt_thread_info_t ti;
1368 // This is the case, if called from __kmpc_critical:
1369 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1371 codeptr = OMPT_GET_RETURN_ADDRESS(0);
1374 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1376 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1377 // Check if it is initialized.
1378 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1380 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1381 if (KMP_IS_D_LOCK(lckseq)) {
1382 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1383 KMP_GET_D_TAG(lckseq));
1385 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1388 // Branch for accessing the actual lock object and set operation. This
1389 // branching is inevitable since this lock initialization does not follow the
1390 // normal dispatch path (lock table is not used).
1391 if (KMP_EXTRACT_D_TAG(lk) != 0) {
1392 lck = (kmp_user_lock_p)lk;
1393 if (__kmp_env_consistency_check) {
1394 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1395 __kmp_map_hint_to_lock(hint));
1398 __kmp_itt_critical_acquiring(lck);
1400 #if OMPT_SUPPORT && OMPT_OPTIONAL
1401 if (ompt_enabled.enabled) {
1402 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1403 /* OMPT state update */
1404 prev_state = ti.state;
1405 ti.wait_id = (ompt_wait_id_t)lck;
1406 ti.state = ompt_state_wait_critical;
1408 /* OMPT event callback */
1409 if (ompt_enabled.ompt_callback_mutex_acquire) {
1410 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1411 ompt_mutex_critical, (unsigned int)hint,
1412 __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
1416 #if KMP_USE_INLINED_TAS
1417 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1418 KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1420 #elif KMP_USE_INLINED_FUTEX
1421 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1422 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1426 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1429 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1431 if (__kmp_env_consistency_check) {
1432 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1433 __kmp_map_hint_to_lock(hint));
1436 __kmp_itt_critical_acquiring(lck);
1438 #if OMPT_SUPPORT && OMPT_OPTIONAL
1439 if (ompt_enabled.enabled) {
1440 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1441 /* OMPT state update */
1442 prev_state = ti.state;
1443 ti.wait_id = (ompt_wait_id_t)lck;
1444 ti.state = ompt_state_wait_critical;
1446 /* OMPT event callback */
1447 if (ompt_enabled.ompt_callback_mutex_acquire) {
1448 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1449 ompt_mutex_critical, (unsigned int)hint,
1450 __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
1454 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1456 KMP_POP_PARTITIONED_TIMER();
1459 __kmp_itt_critical_acquired(lck);
1460 #endif /* USE_ITT_BUILD */
1461 #if OMPT_SUPPORT && OMPT_OPTIONAL
1462 if (ompt_enabled.enabled) {
1463 /* OMPT state update */
1464 ti.state = prev_state;
1467 /* OMPT event callback */
1468 if (ompt_enabled.ompt_callback_mutex_acquired) {
1469 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1470 ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
1475 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1476 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1477 } // __kmpc_critical_with_hint
1479 #endif // KMP_USE_DYNAMIC_LOCK
1482 @ingroup WORK_SHARING
1483 @param loc source location information.
1484 @param global_tid global thread number .
1485 @param crit identity of the critical section. This could be a pointer to a lock
1486 associated with the critical section, or some other suitably unique value.
1488 Leave a critical section, releasing any lock that was held during its execution.
1490 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1491 kmp_critical_name *crit) {
1492 kmp_user_lock_p lck;
1494 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1496 #if KMP_USE_DYNAMIC_LOCK
1497 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1498 lck = (kmp_user_lock_p)crit;
1499 KMP_ASSERT(lck != NULL);
1500 if (__kmp_env_consistency_check) {
1501 __kmp_pop_sync(global_tid, ct_critical, loc);
1504 __kmp_itt_critical_releasing(lck);
1506 #if KMP_USE_INLINED_TAS
1507 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1508 KMP_RELEASE_TAS_LOCK(lck, global_tid);
1510 #elif KMP_USE_INLINED_FUTEX
1511 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1512 KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1516 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1519 kmp_indirect_lock_t *ilk =
1520 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1521 KMP_ASSERT(ilk != NULL);
1523 if (__kmp_env_consistency_check) {
1524 __kmp_pop_sync(global_tid, ct_critical, loc);
1527 __kmp_itt_critical_releasing(lck);
1529 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1532 #else // KMP_USE_DYNAMIC_LOCK
1534 if ((__kmp_user_lock_kind == lk_tas) &&
1535 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1536 lck = (kmp_user_lock_p)crit;
1539 else if ((__kmp_user_lock_kind == lk_futex) &&
1540 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1541 lck = (kmp_user_lock_p)crit;
1544 else { // ticket, queuing or drdpa
1545 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1548 KMP_ASSERT(lck != NULL);
1550 if (__kmp_env_consistency_check)
1551 __kmp_pop_sync(global_tid, ct_critical, loc);
1554 __kmp_itt_critical_releasing(lck);
1555 #endif /* USE_ITT_BUILD */
1556 // Value of 'crit' should be good for using as a critical_id of the critical
1557 // section directive.
1558 __kmp_release_user_lock_with_checks(lck, global_tid);
1560 #endif // KMP_USE_DYNAMIC_LOCK
1562 #if OMPT_SUPPORT && OMPT_OPTIONAL
1563 /* OMPT release event triggers after lock is released; place here to trigger
1564 * for all #if branches */
1565 OMPT_STORE_RETURN_ADDRESS(global_tid);
1566 if (ompt_enabled.ompt_callback_mutex_released) {
1567 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1568 ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1572 KMP_POP_PARTITIONED_TIMER();
1573 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1577 @ingroup SYNCHRONIZATION
1578 @param loc source location information
1579 @param global_tid thread id.
1580 @return one if the thread should execute the master block, zero otherwise
1582 Start execution of a combined barrier and master. The barrier is executed inside this function.
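// Illustrative sketch (assumption): typical usage when master-only work must
// follow a point where all threads have arrived:
//
//   if (__kmpc_barrier_master(&loc, gtid)) {
//     /* executed by the master thread only, after all threads reach the barrier */
//     __kmpc_end_barrier_master(&loc, gtid); // releases the waiting threads
//   }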
1585 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1588 KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1590 if (!TCR_4(__kmp_init_parallel))
1591 __kmp_parallel_initialize();
1593 if (__kmp_env_consistency_check)
1594 __kmp_check_barrier(global_tid, ct_barrier, loc);
1597 ompt_frame_t *ompt_frame;
1598 if (ompt_enabled.enabled) {
1599 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1600 if (ompt_frame->enter_frame.ptr == NULL)
1601 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1602 OMPT_STORE_RETURN_ADDRESS(global_tid);
1606 __kmp_threads[global_tid]->th.th_ident = loc;
1608 status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1609 #if OMPT_SUPPORT && OMPT_OPTIONAL
1610 if (ompt_enabled.enabled) {
1611 ompt_frame->enter_frame = ompt_data_none;
1615 return (status != 0) ? 0 : 1;
1619 @ingroup SYNCHRONIZATION
1620 @param loc source location information
1621 @param global_tid thread id.
1623 Complete the execution of a combined barrier and master. This function should
1624 only be called at the completion of the <tt>master</tt> code. Other threads will
1625 still be waiting at the barrier and this call releases them.
1627 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1628 KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1630 __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1634 @ingroup SYNCHRONIZATION
1635 @param loc source location information
1636 @param global_tid thread id.
1637 @return one if the thread should execute the master block, zero otherwise
1639 Start execution of a combined barrier and master(nowait) construct.
1640 The barrier is executed inside this function.
1641 There is no equivalent "end" function, since the actions an "end" call would perform are done inside this function.
1643 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1646 KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1648 if (!TCR_4(__kmp_init_parallel))
1649 __kmp_parallel_initialize();
1651 if (__kmp_env_consistency_check) {
1653 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1655 __kmp_check_barrier(global_tid, ct_barrier, loc);
1659 ompt_frame_t *ompt_frame;
1660 if (ompt_enabled.enabled) {
1661 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1662 if (ompt_frame->enter_frame.ptr == NULL)
1663 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1664 OMPT_STORE_RETURN_ADDRESS(global_tid);
1668 __kmp_threads[global_tid]->th.th_ident = loc;
1670 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1671 #if OMPT_SUPPORT && OMPT_OPTIONAL
1672 if (ompt_enabled.enabled) {
1673 ompt_frame->enter_frame = ompt_data_none;
1677 ret = __kmpc_master(loc, global_tid);
1679 if (__kmp_env_consistency_check) {
1680 /* there's no __kmpc_end_master called; so the (stats) */
1681 /* actions of __kmpc_end_master are done here */
1683 if (global_tid < 0) {
1684 KMP_WARNING(ThreadIdentInvalid);
1687 /* only one thread should do the pop since only */
1688 /* one did the push (see __kmpc_master()) */
1690 __kmp_pop_sync(global_tid, ct_master, loc);
1697 /* The BARRIER for a SINGLE process section is always explicit */
1699 @ingroup WORK_SHARING
1700 @param loc source location information
1701 @param global_tid global thread number
1702 @return One if this thread should execute the single construct, zero otherwise.
1704 Test whether to execute a <tt>single</tt> construct.
1705 There are no implicit barriers in the two "single" calls; rather, the compiler
1706 should introduce an explicit barrier if it is required.
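// Illustrative sketch (assumption): a `single` construct without `nowait`
// would be lowered with an explicit barrier after the region:
//
//   if (__kmpc_single(&loc, gtid)) {
//     /* code executed by exactly one thread */
//     __kmpc_end_single(&loc, gtid);
//   }
//   __kmpc_barrier(&loc, gtid); // omitted when `nowait` is present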
1709 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1710 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1713 // We are going to execute the single statement, so we should count it.
1714 KMP_COUNT_BLOCK(OMP_SINGLE);
1715 KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1718 #if OMPT_SUPPORT && OMPT_OPTIONAL
1719 kmp_info_t *this_thr = __kmp_threads[global_tid];
1720 kmp_team_t *team = this_thr->th.th_team;
1721 int tid = __kmp_tid_from_gtid(global_tid);
1723 if (ompt_enabled.enabled) {
1725 if (ompt_enabled.ompt_callback_work) {
1726 ompt_callbacks.ompt_callback(ompt_callback_work)(
1727 ompt_work_single_executor, ompt_scope_begin,
1728 &(team->t.ompt_team_info.parallel_data),
1729 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1730 1, OMPT_GET_RETURN_ADDRESS(0));
1733 if (ompt_enabled.ompt_callback_work) {
1734 ompt_callbacks.ompt_callback(ompt_callback_work)(
1735 ompt_work_single_other, ompt_scope_begin,
1736 &(team->t.ompt_team_info.parallel_data),
1737 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1738 1, OMPT_GET_RETURN_ADDRESS(0));
1739 ompt_callbacks.ompt_callback(ompt_callback_work)(
1740 ompt_work_single_other, ompt_scope_end,
1741 &(team->t.ompt_team_info.parallel_data),
1742 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1743 1, OMPT_GET_RETURN_ADDRESS(0));
1753 @ingroup WORK_SHARING
1754 @param loc source location information
1755 @param global_tid global thread number
1757 Mark the end of a <tt>single</tt> construct. This function should
1758 only be called by the thread that executed the block of code protected
1759 by the `single` construct.
1761 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1762 __kmp_exit_single(global_tid);
1763 KMP_POP_PARTITIONED_TIMER();
1765 #if OMPT_SUPPORT && OMPT_OPTIONAL
1766 kmp_info_t *this_thr = __kmp_threads[global_tid];
1767 kmp_team_t *team = this_thr->th.th_team;
1768 int tid = __kmp_tid_from_gtid(global_tid);
1770 if (ompt_enabled.ompt_callback_work) {
1771 ompt_callbacks.ompt_callback(ompt_callback_work)(
1772 ompt_work_single_executor, ompt_scope_end,
1773 &(team->t.ompt_team_info.parallel_data),
1774 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1775 OMPT_GET_RETURN_ADDRESS(0));
1781 @ingroup WORK_SHARING
1782 @param loc Source location
1783 @param global_tid Global thread id
1785 Mark the end of a statically scheduled loop.
1787 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1788 KMP_POP_PARTITIONED_TIMER();
1789 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1791 #if OMPT_SUPPORT && OMPT_OPTIONAL
1792 if (ompt_enabled.ompt_callback_work) {
1793 ompt_work_t ompt_work_type = ompt_work_loop;
1794 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1795 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1796 // Determine workshare type
1798 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1799 ompt_work_type = ompt_work_loop;
1800 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1801 ompt_work_type = ompt_work_sections;
1802 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1803 ompt_work_type = ompt_work_distribute;
1805 // use default set above.
1806 // a warning about this case is provided in __kmpc_for_static_init
1808 KMP_DEBUG_ASSERT(ompt_work_type);
1810 ompt_callbacks.ompt_callback(ompt_callback_work)(
1811 ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1812 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1815 if (__kmp_env_consistency_check)
1816 __kmp_pop_workshare(global_tid, ct_pdo, loc);
1819 // User routines which take C-style arguments (call by value),
1820 // as distinct from the Fortran equivalent routines
1822 void ompc_set_num_threads(int arg) {
1823 // !!!!! TODO: check the per-task binding
1824 __kmp_set_num_threads(arg, __kmp_entry_gtid());
1827 void ompc_set_dynamic(int flag) {
1830 /* For the thread-private implementation of the internal controls */
1831 thread = __kmp_entry_thread();
1833 __kmp_save_internal_controls(thread);
1835 set__dynamic(thread, flag ? TRUE : FALSE);
1838 void ompc_set_nested(int flag) {
1841 /* For the thread-private internal controls implementation */
1842 thread = __kmp_entry_thread();
1844 __kmp_save_internal_controls(thread);
1846 set__nested(thread, flag ? TRUE : FALSE);
1849 void ompc_set_max_active_levels(int max_active_levels) {
1851 /* we want per-task implementation of this internal control */
1853 /* For the per-thread internal controls implementation */
1854 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1857 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1858 // !!!!! TODO: check the per-task binding
1859 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1862 int ompc_get_ancestor_thread_num(int level) {
1863 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1866 int ompc_get_team_size(int level) {
1867 return __kmp_get_team_size(__kmp_entry_gtid(), level);
1871 /* OpenMP 5.0 Affinity Format API */
1873 void ompc_set_affinity_format(char const *format) {
1874 if (!__kmp_init_serial) {
1875 __kmp_serial_initialize();
1877 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1878 format, KMP_STRLEN(format) + 1);
1881 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1883 if (!__kmp_init_serial) {
1884 __kmp_serial_initialize();
1886 format_size = KMP_STRLEN(__kmp_affinity_format);
1887 if (buffer && size) {
1888 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1894 void ompc_display_affinity(char const *format) {
1896 if (!TCR_4(__kmp_init_middle)) {
1897 __kmp_middle_initialize();
1899 gtid = __kmp_get_gtid();
1900 __kmp_aux_display_affinity(gtid, format);
1903 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1904 char const *format) {
1906 size_t num_required;
1907 kmp_str_buf_t capture_buf;
1908 if (!TCR_4(__kmp_init_middle)) {
1909 __kmp_middle_initialize();
1911 gtid = __kmp_get_gtid();
1912 __kmp_str_buf_init(&capture_buf);
1913 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1914 if (buffer && buf_size) {
1915 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1916 capture_buf.used + 1);
1918 __kmp_str_buf_free(&capture_buf);
1919 return num_required;
1921 #endif /* OMP_50_ENABLED */
1923 void kmpc_set_stacksize(int arg) {
1924 // __kmp_aux_set_stacksize initializes the library if needed
1925 __kmp_aux_set_stacksize(arg);
1928 void kmpc_set_stacksize_s(size_t arg) {
1929 // __kmp_aux_set_stacksize initializes the library if needed
1930 __kmp_aux_set_stacksize(arg);
1933 void kmpc_set_blocktime(int arg) {
1937 gtid = __kmp_entry_gtid();
1938 tid = __kmp_tid_from_gtid(gtid);
1939 thread = __kmp_thread_from_gtid(gtid);
1941 __kmp_aux_set_blocktime(arg, thread, tid);
1944 void kmpc_set_library(int arg) {
1945 // __kmp_user_set_library initializes the library if needed
1946 __kmp_user_set_library((enum library_type)arg);
1949 void kmpc_set_defaults(char const *str) {
1950 // __kmp_aux_set_defaults initializes the library if needed
1951 __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1954 void kmpc_set_disp_num_buffers(int arg) {
1955 // ignore after initialization because some teams have already
1956 // allocated dispatch buffers
1957 if (__kmp_init_serial == 0 && arg > 0)
1958 __kmp_dispatch_num_buffers = arg;
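// Because the value is ignored once serial initialization has happened, a
// caller would have to set it before the library is first used, e.g.
// (illustrative only):
//   kmpc_set_disp_num_buffers(16); // before the first parallel region or any other runtime call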
1961 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1962 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1965 if (!TCR_4(__kmp_init_middle)) {
1966 __kmp_middle_initialize();
1968 return __kmp_aux_set_affinity_mask_proc(proc, mask);
1972 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1973 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1976 if (!TCR_4(__kmp_init_middle)) {
1977 __kmp_middle_initialize();
1979 return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1983 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1984 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1987 if (!TCR_4(__kmp_init_middle)) {
1988 __kmp_middle_initialize();
1990 return __kmp_aux_get_affinity_mask_proc(proc, mask);
1994 /* -------------------------------------------------------------------------- */
1996 @ingroup THREADPRIVATE
1997 @param loc source location information
1998 @param gtid global thread number
1999 @param cpy_size size of the cpy_data buffer
2000 @param cpy_data pointer to data to be copied
2001 @param cpy_func helper function to call for copying data
2002 @param didit flag variable: 1=single thread; 0=not single thread
2004 __kmpc_copyprivate implements the interface for the private data broadcast
2005 needed for the copyprivate clause associated with a single region in an
2006 OpenMP<sup>*</sup> program (both C and Fortran).
2007 All threads participating in the parallel region call this routine.
2008 One of the threads (called the single thread) should have the <tt>didit</tt>
2009 variable set to 1 and all other threads should have that variable set to 0.
2010 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2012 The OpenMP specification forbids the use of nowait on the single region when a
2013 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2014 barrier internally to avoid race conditions, so the code generation for the
2015 single region should avoid generating a barrier after the call to @ref __kmpc_copyprivate.
2018 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2019 The <tt>loc</tt> parameter is a pointer to source location information.
2021 Internal implementation: The single thread will first copy its descriptor
2022 address (cpy_data) to a team-private location, then the other threads will each
2023 call the function pointed to by the parameter cpy_func, which carries out the
2024 copy by copying the data using the cpy_data buffer.
2026 The cpy_func routine used for the copy and the contents of the data area defined
2027 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2028 to be done. For instance, the cpy_data buffer can hold the actual data to be
2029 copied or it may hold a list of pointers to the data. The cpy_func routine must
2030 interpret the cpy_data buffer appropriately.
2032 The interface to cpy_func is as follows:
2034 void cpy_func( void *destination, void *source )
2036 where void *destination is the cpy_data pointer for the thread being copied to
2037 and void *source is the cpy_data pointer for the thread being copied from.
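For illustration, a minimal cpy_func for a copyprivate of a single int, assuming
the compiler passes the address of the variable itself as cpy_data, could be:

    void example_cpy_func(void *destination, void *source) {
      // destination: this thread's copy; source: the single thread's copy
      *(int *)destination = *(int *)source;
    }

(The name example_cpy_func and the int payload are purely illustrative.)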
2039 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2040 void *cpy_data, void (*cpy_func)(void *, void *),
2044 KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2048 data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2050 if (__kmp_env_consistency_check) {
2052 KMP_WARNING(ConstructIdentInvalid);
2056 // ToDo: Optimize the following two barriers into some kind of split barrier
2059 *data_ptr = cpy_data;
2062 ompt_frame_t *ompt_frame;
2063 if (ompt_enabled.enabled) {
2064 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2065 if (ompt_frame->enter_frame.ptr == NULL)
2066 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2067 OMPT_STORE_RETURN_ADDRESS(gtid);
2070 /* This barrier is not a barrier region boundary */
2072 __kmp_threads[gtid]->th.th_ident = loc;
2074 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2077 (*cpy_func)(cpy_data, *data_ptr);
2079 // Consider next barrier a user-visible barrier for barrier region boundaries
2080 // Nesting checks are already handled by the single construct checks
2083 if (ompt_enabled.enabled) {
2084 OMPT_STORE_RETURN_ADDRESS(gtid);
2088 __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2089 // tasks can overwrite the location)
2091 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2092 #if OMPT_SUPPORT && OMPT_OPTIONAL
2093 if (ompt_enabled.enabled) {
2094 ompt_frame->enter_frame = ompt_data_none;
2099 /* -------------------------------------------------------------------------- */
2101 #define INIT_LOCK __kmp_init_user_lock_with_checks
2102 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2103 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2104 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2105 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2106 #define ACQUIRE_NESTED_LOCK_TIMED \
2107 __kmp_acquire_nested_user_lock_with_checks_timed
2108 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2109 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2110 #define TEST_LOCK __kmp_test_user_lock_with_checks
2111 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2112 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2113 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2115 // TODO: Make check abort messages use location info & pass it into
2116 // with_checks routines
2118 #if KMP_USE_DYNAMIC_LOCK
2120 // internal lock initializer
2121 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2122 kmp_dyna_lockseq_t seq) {
2123 if (KMP_IS_D_LOCK(seq)) {
2124 KMP_INIT_D_LOCK(lock, seq);
2126 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2129 KMP_INIT_I_LOCK(lock, seq);
2131 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2132 __kmp_itt_lock_creating(ilk->lock, loc);
2137 // internal nest lock initializer
2138 static __forceinline void
2139 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2140 kmp_dyna_lockseq_t seq) {
2142 // Don't have nested lock implementation for speculative locks
2143 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2144 seq = __kmp_user_lock_seq;
2148 seq = lockseq_nested_tas;
2152 seq = lockseq_nested_futex;
2155 case lockseq_ticket:
2156 seq = lockseq_nested_ticket;
2158 case lockseq_queuing:
2159 seq = lockseq_nested_queuing;
2162 seq = lockseq_nested_drdpa;
2165 seq = lockseq_nested_queuing;
2167 KMP_INIT_I_LOCK(lock, seq);
2169 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2170 __kmp_itt_lock_creating(ilk->lock, loc);
2174 /* initialize the lock with a hint */
2175 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2177 KMP_DEBUG_ASSERT(__kmp_init_serial);
2178 if (__kmp_env_consistency_check && user_lock == NULL) {
2179 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2182 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2184 #if OMPT_SUPPORT && OMPT_OPTIONAL
2185 // This is the case, if called from omp_init_lock_with_hint:
2186 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2188 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2189 if (ompt_enabled.ompt_callback_lock_init) {
2190 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2191 ompt_mutex_lock, (omp_lock_hint_t)hint,
2192 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2198 /* initialize the lock with a hint */
2199 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2200 void **user_lock, uintptr_t hint) {
2201 KMP_DEBUG_ASSERT(__kmp_init_serial);
2202 if (__kmp_env_consistency_check && user_lock == NULL) {
2203 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2206 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2208 #if OMPT_SUPPORT && OMPT_OPTIONAL
2209 // This is the case, if called from omp_init_lock_with_hint:
2210 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2212 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2213 if (ompt_enabled.ompt_callback_lock_init) {
2214 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2215 ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2216 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2222 #endif // KMP_USE_DYNAMIC_LOCK
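// Illustrative user-level sequence that is expected to reach the hinted
// initializers above through the OpenMP 4.5 lock API (the hint value chosen
// here is an assumption for the example):
//
//   omp_lock_t l;
//   omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
//   omp_set_lock(&l);
//   /* ... short critical section ... */
//   omp_unset_lock(&l);
//   omp_destroy_lock(&l);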
2224 /* initialize the lock */
2225 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2226 #if KMP_USE_DYNAMIC_LOCK
2228 KMP_DEBUG_ASSERT(__kmp_init_serial);
2229 if (__kmp_env_consistency_check && user_lock == NULL) {
2230 KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2232 __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2234 #if OMPT_SUPPORT && OMPT_OPTIONAL
2235 // This is the case, if called from omp_init_lock_with_hint:
2236 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2238 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2239 if (ompt_enabled.ompt_callback_lock_init) {
2240 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2241 ompt_mutex_lock, omp_lock_hint_none,
2242 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2247 #else // KMP_USE_DYNAMIC_LOCK
2249 static char const *const func = "omp_init_lock";
2250 kmp_user_lock_p lck;
2251 KMP_DEBUG_ASSERT(__kmp_init_serial);
2253 if (__kmp_env_consistency_check) {
2254 if (user_lock == NULL) {
2255 KMP_FATAL(LockIsUninitialized, func);
2259 KMP_CHECK_USER_LOCK_INIT();
2261 if ((__kmp_user_lock_kind == lk_tas) &&
2262 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2263 lck = (kmp_user_lock_p)user_lock;
2266 else if ((__kmp_user_lock_kind == lk_futex) &&
2267 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2268 lck = (kmp_user_lock_p)user_lock;
2272 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2275 __kmp_set_user_lock_location(lck, loc);
2277 #if OMPT_SUPPORT && OMPT_OPTIONAL
2278 // This is the case, if called from omp_init_lock_with_hint:
2279 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2281 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2282 if (ompt_enabled.ompt_callback_lock_init) {
2283 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2284 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2285 (ompt_wait_id_t)user_lock, codeptr);
2290 __kmp_itt_lock_creating(lck);
2291 #endif /* USE_ITT_BUILD */
2293 #endif // KMP_USE_DYNAMIC_LOCK
2294 } // __kmpc_init_lock
2296 /* initialize the lock */
2297 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2298 #if KMP_USE_DYNAMIC_LOCK
2300 KMP_DEBUG_ASSERT(__kmp_init_serial);
2301 if (__kmp_env_consistency_check && user_lock == NULL) {
2302 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2304 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2306 #if OMPT_SUPPORT && OMPT_OPTIONAL
2307 // This is the case, if called from omp_init_lock_with_hint:
2308 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2310 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2311 if (ompt_enabled.ompt_callback_lock_init) {
2312 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2313 ompt_mutex_nest_lock, omp_lock_hint_none,
2314 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2319 #else // KMP_USE_DYNAMIC_LOCK
2321 static char const *const func = "omp_init_nest_lock";
2322 kmp_user_lock_p lck;
2323 KMP_DEBUG_ASSERT(__kmp_init_serial);
2325 if (__kmp_env_consistency_check) {
2326 if (user_lock == NULL) {
2327 KMP_FATAL(LockIsUninitialized, func);
2331 KMP_CHECK_USER_LOCK_INIT();
2333 if ((__kmp_user_lock_kind == lk_tas) &&
2334 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2335 OMP_NEST_LOCK_T_SIZE)) {
2336 lck = (kmp_user_lock_p)user_lock;
2339 else if ((__kmp_user_lock_kind == lk_futex) &&
2340 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2341 OMP_NEST_LOCK_T_SIZE)) {
2342 lck = (kmp_user_lock_p)user_lock;
2346 lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2349 INIT_NESTED_LOCK(lck);
2350 __kmp_set_user_lock_location(lck, loc);
2352 #if OMPT_SUPPORT && OMPT_OPTIONAL
2353 // This is the case, if called from omp_init_lock_with_hint:
2354 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2356 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2357 if (ompt_enabled.ompt_callback_lock_init) {
2358 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2359 ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2360 (ompt_wait_id_t)user_lock, codeptr);
2365 __kmp_itt_lock_creating(lck);
2366 #endif /* USE_ITT_BUILD */
2368 #endif // KMP_USE_DYNAMIC_LOCK
2369 } // __kmpc_init_nest_lock
2371 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2372 #if KMP_USE_DYNAMIC_LOCK
2375 kmp_user_lock_p lck;
2376 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2377 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2379 lck = (kmp_user_lock_p)user_lock;
2381 __kmp_itt_lock_destroyed(lck);
2383 #if OMPT_SUPPORT && OMPT_OPTIONAL
2384 // This is the case, if called from omp_init_lock_with_hint:
2385 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2387 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2388 if (ompt_enabled.ompt_callback_lock_destroy) {
2389 kmp_user_lock_p lck;
2390 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2391 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2393 lck = (kmp_user_lock_p)user_lock;
2395 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2396 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2399 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2401 kmp_user_lock_p lck;
2403 if ((__kmp_user_lock_kind == lk_tas) &&
2404 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2405 lck = (kmp_user_lock_p)user_lock;
2408 else if ((__kmp_user_lock_kind == lk_futex) &&
2409 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2410 lck = (kmp_user_lock_p)user_lock;
2414 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
2418 // This is the case, if called from omp_init_lock_with_hint:
2419 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2421 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2422 if (ompt_enabled.ompt_callback_lock_destroy) {
2423 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2424 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2429 __kmp_itt_lock_destroyed(lck);
2430 #endif /* USE_ITT_BUILD */
2433 if ((__kmp_user_lock_kind == lk_tas) &&
2434 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2438 else if ((__kmp_user_lock_kind == lk_futex) &&
2439 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2444 __kmp_user_lock_free(user_lock, gtid, lck);
2446 #endif // KMP_USE_DYNAMIC_LOCK
2447 } // __kmpc_destroy_lock
2449 /* destroy the lock */
2450 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2451 #if KMP_USE_DYNAMIC_LOCK
2454 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2455 __kmp_itt_lock_destroyed(ilk->lock);
2457 #if OMPT_SUPPORT && OMPT_OPTIONAL
2458 // This is the case, if called from omp_init_lock_with_hint:
2459 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2461 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2462 if (ompt_enabled.ompt_callback_lock_destroy) {
2463 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2464 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2467 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2469 #else // KMP_USE_DYNAMIC_LOCK
2471 kmp_user_lock_p lck;
2473 if ((__kmp_user_lock_kind == lk_tas) &&
2474 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2475 OMP_NEST_LOCK_T_SIZE)) {
2476 lck = (kmp_user_lock_p)user_lock;
2479 else if ((__kmp_user_lock_kind == lk_futex) &&
2480 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2481 OMP_NEST_LOCK_T_SIZE)) {
2482 lck = (kmp_user_lock_p)user_lock;
2486 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2489 #if OMPT_SUPPORT && OMPT_OPTIONAL
2490 // This is the case, if called from omp_init_lock_with_hint:
2491 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2493 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2494 if (ompt_enabled.ompt_callback_lock_destroy) {
2495 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2496 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2501 __kmp_itt_lock_destroyed(lck);
2502 #endif /* USE_ITT_BUILD */
2504 DESTROY_NESTED_LOCK(lck);
2506 if ((__kmp_user_lock_kind == lk_tas) &&
2507 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2508 OMP_NEST_LOCK_T_SIZE)) {
2512 else if ((__kmp_user_lock_kind == lk_futex) &&
2513 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2514 OMP_NEST_LOCK_T_SIZE)) {
2519 __kmp_user_lock_free(user_lock, gtid, lck);
2521 #endif // KMP_USE_DYNAMIC_LOCK
2522 } // __kmpc_destroy_nest_lock
2524 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2525 KMP_COUNT_BLOCK(OMP_set_lock);
2526 #if KMP_USE_DYNAMIC_LOCK
2527 int tag = KMP_EXTRACT_D_TAG(user_lock);
2529 __kmp_itt_lock_acquiring(
2531 user_lock); // itt function will get to the right lock object.
2533 #if OMPT_SUPPORT && OMPT_OPTIONAL
2534 // This is the case, if called from omp_init_lock_with_hint:
2535 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2537 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2538 if (ompt_enabled.ompt_callback_mutex_acquire) {
2539 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2540 ompt_mutex_lock, omp_lock_hint_none,
2541 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2545 #if KMP_USE_INLINED_TAS
2546 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2547 KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2549 #elif KMP_USE_INLINED_FUTEX
2550 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2551 KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2555 __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2558 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2560 #if OMPT_SUPPORT && OMPT_OPTIONAL
2561 if (ompt_enabled.ompt_callback_mutex_acquired) {
2562 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2563 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2567 #else // KMP_USE_DYNAMIC_LOCK
2569 kmp_user_lock_p lck;
2571 if ((__kmp_user_lock_kind == lk_tas) &&
2572 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2573 lck = (kmp_user_lock_p)user_lock;
2576 else if ((__kmp_user_lock_kind == lk_futex) &&
2577 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2578 lck = (kmp_user_lock_p)user_lock;
2582 lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2586 __kmp_itt_lock_acquiring(lck);
2587 #endif /* USE_ITT_BUILD */
2588 #if OMPT_SUPPORT && OMPT_OPTIONAL
2589 // This is the case, if called from omp_init_lock_with_hint:
2590 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2592 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2593 if (ompt_enabled.ompt_callback_mutex_acquire) {
2594 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2595 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2596 (ompt_wait_id_t)lck, codeptr);
2600 ACQUIRE_LOCK(lck, gtid);
2603 __kmp_itt_lock_acquired(lck);
2604 #endif /* USE_ITT_BUILD */
2606 #if OMPT_SUPPORT && OMPT_OPTIONAL
2607 if (ompt_enabled.ompt_callback_mutex_acquired) {
2608 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2609 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2613 #endif // KMP_USE_DYNAMIC_LOCK
2616 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2617 #if KMP_USE_DYNAMIC_LOCK
2620 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2622 #if OMPT_SUPPORT && OMPT_OPTIONAL
2623 // This is the case, if called from omp_init_lock_with_hint:
2624 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2626 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2627 if (ompt_enabled.enabled) {
2628 if (ompt_enabled.ompt_callback_mutex_acquire) {
2629 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2630 ompt_mutex_nest_lock, omp_lock_hint_none,
2631 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2636 int acquire_status =
2637 KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2638 (void) acquire_status;
2640 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2643 #if OMPT_SUPPORT && OMPT_OPTIONAL
2644 if (ompt_enabled.enabled) {
2645 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2646 if (ompt_enabled.ompt_callback_mutex_acquired) {
2648 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2649 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2652 if (ompt_enabled.ompt_callback_nest_lock) {
2654 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2655 ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
2661 #else // KMP_USE_DYNAMIC_LOCK
2663 kmp_user_lock_p lck;
2665 if ((__kmp_user_lock_kind == lk_tas) &&
2666 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2667 OMP_NEST_LOCK_T_SIZE)) {
2668 lck = (kmp_user_lock_p)user_lock;
2671 else if ((__kmp_user_lock_kind == lk_futex) &&
2672 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2673 OMP_NEST_LOCK_T_SIZE)) {
2674 lck = (kmp_user_lock_p)user_lock;
2678 lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2682 __kmp_itt_lock_acquiring(lck);
2683 #endif /* USE_ITT_BUILD */
2684 #if OMPT_SUPPORT && OMPT_OPTIONAL
2685 // This is the case, if called from omp_init_lock_with_hint:
2686 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2688 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2689 if (ompt_enabled.enabled) {
2690 if (ompt_enabled.ompt_callback_mutex_acquire) {
2691 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2692 ompt_mutex_nest_lock, omp_lock_hint_none,
2693 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
2698 ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2701 __kmp_itt_lock_acquired(lck);
2702 #endif /* USE_ITT_BUILD */
2704 #if OMPT_SUPPORT && OMPT_OPTIONAL
2705 if (ompt_enabled.enabled) {
2706 if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2707 if (ompt_enabled.ompt_callback_mutex_acquired) {
2709 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2710 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2713 if (ompt_enabled.ompt_callback_nest_lock) {
2715 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2716 ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
2722 #endif // KMP_USE_DYNAMIC_LOCK
2725 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2726 #if KMP_USE_DYNAMIC_LOCK
2728 int tag = KMP_EXTRACT_D_TAG(user_lock);
2730 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2732 #if KMP_USE_INLINED_TAS
2733 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2734 KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2736 #elif KMP_USE_INLINED_FUTEX
2737 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2738 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2742 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2745 #if OMPT_SUPPORT && OMPT_OPTIONAL
2746 // This is the case, if called from omp_init_lock_with_hint:
2747 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2749 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2750 if (ompt_enabled.ompt_callback_mutex_released) {
2751 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2752 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2756 #else // KMP_USE_DYNAMIC_LOCK
2758 kmp_user_lock_p lck;
2760 /* Can't use serial interval since not block structured */
2761 /* release the lock */
2763 if ((__kmp_user_lock_kind == lk_tas) &&
2764 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2765 #if KMP_OS_LINUX && \
2766 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2767 // "fast" path implemented to fix customer performance issue
2769 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2770 #endif /* USE_ITT_BUILD */
2771 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2774 #if OMPT_SUPPORT && OMPT_OPTIONAL
2775 // This is the case, if called from omp_init_lock_with_hint:
2776 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2778 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2779 if (ompt_enabled.ompt_callback_mutex_released) {
2780 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2781 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2787 lck = (kmp_user_lock_p)user_lock;
2791 else if ((__kmp_user_lock_kind == lk_futex) &&
2792 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2793 lck = (kmp_user_lock_p)user_lock;
2797 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2801 __kmp_itt_lock_releasing(lck);
2802 #endif /* USE_ITT_BUILD */
2804 RELEASE_LOCK(lck, gtid);
2806 #if OMPT_SUPPORT && OMPT_OPTIONAL
2807 // This is the case, if called from omp_init_lock_with_hint:
2808 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2810 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2811 if (ompt_enabled.ompt_callback_mutex_released) {
2812 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2813 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2817 #endif // KMP_USE_DYNAMIC_LOCK
2820 /* release the lock */
2821 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2822 #if KMP_USE_DYNAMIC_LOCK
2825 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2827 int release_status =
2828 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2829 (void) release_status;
2831 #if OMPT_SUPPORT && OMPT_OPTIONAL
2832 // This is the case, if called from omp_init_lock_with_hint:
2833 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2835 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2836 if (ompt_enabled.enabled) {
2837 if (release_status == KMP_LOCK_RELEASED) {
2838 if (ompt_enabled.ompt_callback_mutex_released) {
2839 // release_lock_last
2840 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2841 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2843 } else if (ompt_enabled.ompt_callback_nest_lock) {
2844 // release_lock_prev
2845 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2846 ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2851 #else // KMP_USE_DYNAMIC_LOCK
2853 kmp_user_lock_p lck;
2855 /* Can't use serial interval since not block structured */
2857 if ((__kmp_user_lock_kind == lk_tas) &&
2858 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2859 OMP_NEST_LOCK_T_SIZE)) {
2860 #if KMP_OS_LINUX && \
2861 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2862 // "fast" path implemented to fix customer performance issue
2863 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2865 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2866 #endif /* USE_ITT_BUILD */
2868 #if OMPT_SUPPORT && OMPT_OPTIONAL
2869 int release_status = KMP_LOCK_STILL_HELD;
2872 if (--(tl->lk.depth_locked) == 0) {
2873 TCW_4(tl->lk.poll, 0);
2874 #if OMPT_SUPPORT && OMPT_OPTIONAL
2875 release_status = KMP_LOCK_RELEASED;
2880 #if OMPT_SUPPORT && OMPT_OPTIONAL
2881 // This is the case, if called from omp_init_lock_with_hint:
2882 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2884 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2885 if (ompt_enabled.enabled) {
2886 if (release_status == KMP_LOCK_RELEASED) {
2887 if (ompt_enabled.ompt_callback_mutex_released) {
2888 // release_lock_last
2889 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2890 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2892 } else if (ompt_enabled.ompt_callback_nest_lock) {
2893 // release_lock_previous
2894 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2895 ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2902 lck = (kmp_user_lock_p)user_lock;
2906 else if ((__kmp_user_lock_kind == lk_futex) &&
2907 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2908 OMP_NEST_LOCK_T_SIZE)) {
2909 lck = (kmp_user_lock_p)user_lock;
2913 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2917 __kmp_itt_lock_releasing(lck);
2918 #endif /* USE_ITT_BUILD */
2921 release_status = RELEASE_NESTED_LOCK(lck, gtid);
2922 #if OMPT_SUPPORT && OMPT_OPTIONAL
2923 // This is the case, if called from omp_init_lock_with_hint:
2924 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2926 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2927 if (ompt_enabled.enabled) {
2928 if (release_status == KMP_LOCK_RELEASED) {
2929 if (ompt_enabled.ompt_callback_mutex_released) {
2930 // release_lock_last
2931 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2932 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2934 } else if (ompt_enabled.ompt_callback_nest_lock) {
2935 // release_lock_previous
2936 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2937 ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2942 #endif // KMP_USE_DYNAMIC_LOCK
2945 /* try to acquire the lock */
2946 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2947 KMP_COUNT_BLOCK(OMP_test_lock);
2949 #if KMP_USE_DYNAMIC_LOCK
2951 int tag = KMP_EXTRACT_D_TAG(user_lock);
2953 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2955 #if OMPT_SUPPORT && OMPT_OPTIONAL
2956 // This is the case, if called from omp_init_lock_with_hint:
2957 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2959 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2960 if (ompt_enabled.ompt_callback_mutex_acquire) {
2961 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2962 ompt_mutex_lock, omp_lock_hint_none,
2963 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2967 #if KMP_USE_INLINED_TAS
2968 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2969 KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2971 #elif KMP_USE_INLINED_FUTEX
2972 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2973 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2977 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2981 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2983 #if OMPT_SUPPORT && OMPT_OPTIONAL
2984 if (ompt_enabled.ompt_callback_mutex_acquired) {
2985 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2986 ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2992 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2997 #else // KMP_USE_DYNAMIC_LOCK
2999 kmp_user_lock_p lck;
3002 if ((__kmp_user_lock_kind == lk_tas) &&
3003 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3004 lck = (kmp_user_lock_p)user_lock;
3007 else if ((__kmp_user_lock_kind == lk_futex) &&
3008 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3009 lck = (kmp_user_lock_p)user_lock;
3013 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3017 __kmp_itt_lock_acquiring(lck);
3018 #endif /* USE_ITT_BUILD */
3019 #if OMPT_SUPPORT && OMPT_OPTIONAL
3020 // This is the case, if called from omp_init_lock_with_hint:
3021 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3023 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3024 if (ompt_enabled.ompt_callback_mutex_acquire) {
3025 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3026 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3027 (ompt_wait_id_t)lck, codeptr);
3031 rc = TEST_LOCK(lck, gtid);
3034 __kmp_itt_lock_acquired(lck);
3036 __kmp_itt_lock_cancelled(lck);
3038 #endif /* USE_ITT_BUILD */
3039 #if OMPT_SUPPORT && OMPT_OPTIONAL
3040 if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3041 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3042 ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
3046 return (rc ? FTN_TRUE : FTN_FALSE);
3048 /* Can't use serial interval since not block structured */
3050 #endif // KMP_USE_DYNAMIC_LOCK
3053 /* try to acquire the lock */
3054 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3055 #if KMP_USE_DYNAMIC_LOCK
3058 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3060 #if OMPT_SUPPORT && OMPT_OPTIONAL
3061 // This is the case, if called from omp_init_lock_with_hint:
3062 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3064 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3065 if (ompt_enabled.ompt_callback_mutex_acquire) {
3066 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3067 ompt_mutex_nest_lock, omp_lock_hint_none,
3068 __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
3072 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3075 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3077 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3080 #if OMPT_SUPPORT && OMPT_OPTIONAL
3081 if (ompt_enabled.enabled && rc) {
3083 if (ompt_enabled.ompt_callback_mutex_acquired) {
3085 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3086 ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3089 if (ompt_enabled.ompt_callback_nest_lock) {
3091 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3092 ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3099 #else // KMP_USE_DYNAMIC_LOCK
3101 kmp_user_lock_p lck;
3104 if ((__kmp_user_lock_kind == lk_tas) &&
3105 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3106 OMP_NEST_LOCK_T_SIZE)) {
3107 lck = (kmp_user_lock_p)user_lock;
3110 else if ((__kmp_user_lock_kind == lk_futex) &&
3111 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3112 OMP_NEST_LOCK_T_SIZE)) {
3113 lck = (kmp_user_lock_p)user_lock;
3117 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3121 __kmp_itt_lock_acquiring(lck);
3122 #endif /* USE_ITT_BUILD */
3124 #if OMPT_SUPPORT && OMPT_OPTIONAL
3125 // This is the case, if called from omp_init_lock_with_hint:
3126 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3128 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3129 if (ompt_enabled.enabled &&
3130 ompt_enabled.ompt_callback_mutex_acquire) {
3131 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3132 ompt_mutex_nest_lock, omp_lock_hint_none,
3133 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
3137 rc = TEST_NESTED_LOCK(lck, gtid);
3140 __kmp_itt_lock_acquired(lck);
3142 __kmp_itt_lock_cancelled(lck);
3144 #endif /* USE_ITT_BUILD */
3145 #if OMPT_SUPPORT && OMPT_OPTIONAL
3146 if (ompt_enabled.enabled && rc) {
3148 if (ompt_enabled.ompt_callback_mutex_acquired) {
3150 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3151 ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3154 if (ompt_enabled.ompt_callback_nest_lock) {
3156 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3157 ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
3164 /* Can't use serial interval since not block structured */
3166 #endif // KMP_USE_DYNAMIC_LOCK
3169 // Interface to fast scalable reduce methods routines
3171 // keep the selected method in a thread local structure for cross-function
3172 // usage: will be used in __kmpc_end_reduce* functions;
3173 // another solution: to re-determine the method one more time in
3174 // __kmpc_end_reduce* functions (new prototype required then)
3175 // AT: which solution is better?
3176 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
3177 ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3179 #define __KMP_GET_REDUCTION_METHOD(gtid) \
3180 (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3182 // description of the packed_reduction_method variable: look at the macros in kmp.h
3185 // used in a critical section reduce block
3186 static __forceinline void
3187 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3188 kmp_critical_name *crit) {
3190 // this lock was visible to a customer and to the threading profile tool as a
3191 // serial overhead span (although it's used for an internal purpose only)
3192 // why was it visible in previous implementation?
3193 // should we keep it visible in new reduce block?
3194 kmp_user_lock_p lck;
3196 #if KMP_USE_DYNAMIC_LOCK
3198 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3199 // Check if it is initialized.
3201 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3202 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3203 KMP_GET_D_TAG(__kmp_user_lock_seq));
3205 __kmp_init_indirect_csptr(crit, loc, global_tid,
3206 KMP_GET_I_TAG(__kmp_user_lock_seq));
3209 // Branch for accessing the actual lock object and set operation. This
3210 // branching is inevitable since this lock initialization does not follow the
3211 // normal dispatch path (lock table is not used).
3212 if (KMP_EXTRACT_D_TAG(lk) != 0) {
3213 lck = (kmp_user_lock_p)lk;
3214 KMP_DEBUG_ASSERT(lck != NULL);
3215 if (__kmp_env_consistency_check) {
3216 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3218 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3220 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3222 KMP_DEBUG_ASSERT(lck != NULL);
3223 if (__kmp_env_consistency_check) {
3224 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3226 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3229 #else // KMP_USE_DYNAMIC_LOCK
3231 // We know that the fast reduction code is only emitted by Intel compilers
3232 // with 32 byte critical sections. If there isn't enough space, then we
3233 // have to use a pointer.
3234 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3235 lck = (kmp_user_lock_p)crit;
3237 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3239 KMP_DEBUG_ASSERT(lck != NULL);
3241 if (__kmp_env_consistency_check)
3242 __kmp_push_sync(global_tid, ct_critical, loc, lck);
3244 __kmp_acquire_user_lock_with_checks(lck, global_tid);
3246 #endif // KMP_USE_DYNAMIC_LOCK
3249 // used in a critical section reduce block
3250 static __forceinline void
3251 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3252 kmp_critical_name *crit) {
3254 kmp_user_lock_p lck;
3256 #if KMP_USE_DYNAMIC_LOCK
3258 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3259 lck = (kmp_user_lock_p)crit;
3260 if (__kmp_env_consistency_check)
3261 __kmp_pop_sync(global_tid, ct_critical, loc);
3262 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3264 kmp_indirect_lock_t *ilk =
3265 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3266 if (__kmp_env_consistency_check)
3267 __kmp_pop_sync(global_tid, ct_critical, loc);
3268 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3271 #else // KMP_USE_DYNAMIC_LOCK
3273 // We know that the fast reduction code is only emitted by Intel compilers
3274 // with 32 byte critical sections. If there isn't enough space, then we have
3275 // to use a pointer.
3276 if (__kmp_base_user_lock_size > 32) {
3277 lck = *((kmp_user_lock_p *)crit);
3278 KMP_ASSERT(lck != NULL);
3280 lck = (kmp_user_lock_p)crit;
3283 if (__kmp_env_consistency_check)
3284 __kmp_pop_sync(global_tid, ct_critical, loc);
3286 __kmp_release_user_lock_with_checks(lck, global_tid);
3288 #endif // KMP_USE_DYNAMIC_LOCK
3289 } // __kmp_end_critical_section_reduce_block
3292 static __forceinline int
3293 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3297 // Check whether we are inside the teams construct.
3298 if (th->th.th_teams_microtask) {
3299 *team_p = team = th->th.th_team;
3300 if (team->t.t_level == th->th.th_teams_level) {
3301 // This is a reduction at the teams construct.
3302 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3303 // Let's swap teams temporarily for the reduction.
3304 th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3305 th->th.th_team = team->t.t_parent;
3306 th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3307 th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3308 *task_state = th->th.th_task_state;
3309 th->th.th_task_state = 0;
3317 static __forceinline void
3318 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3319 // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3320 th->th.th_info.ds.ds_tid = 0;
3321 th->th.th_team = team;
3322 th->th.th_team_nproc = team->t.t_nproc;
3323 th->th.th_task_team = team->t.t_task_team[task_state];
3324 th->th.th_task_state = task_state;
3328 /* 2.a.i. Reduce Block without a terminating barrier */
3330 @ingroup SYNCHRONIZATION
3331 @param loc source location information
3332 @param global_tid global thread number
3333 @param num_vars number of items (variables) to be reduced
3334 @param reduce_size size of data in bytes to be reduced
3335 @param reduce_data pointer to data to be reduced
3336 @param reduce_func callback function providing reduction operation on two
3337 operands and returning result of reduction in lhs_data
3338 @param lck pointer to the unique lock data structure
3339 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3340 threads if atomic reduction needed
3342 The nowait version is used for a reduce clause with the nowait argument.
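A schematic sketch (illustrative only, not the output of any particular
compiler) of how outlined code might drive this entry point for
<tt>reduction(+:s) nowait</tt>, where <tt>local_s</tt> is a thread's private
partial sum and <tt>my_add</tt> combines two operands:

    void my_add(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; }
    ...
    switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(int), &local_s,
                                 my_add, &crit_name)) {
    case 1: // this thread finalizes the reduction
      s += local_s;
      __kmpc_end_reduce_nowait(loc, gtid, &crit_name);
      break;
    case 2: // atomic method chosen: update s atomically (e.g. via the matching
            // __kmpc_atomic_* entry); __kmpc_end_reduce_nowait is not called
      break;
    default: // 0: nothing left to do for this thread
      break;
    }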
3345 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3346 size_t reduce_size, void *reduce_data,
3347 void (*reduce_func)(void *lhs_data, void *rhs_data),
3348 kmp_critical_name *lck) {
3350 KMP_COUNT_BLOCK(REDUCE_nowait);
3352 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3356 int teams_swapped = 0, task_state;
3358 KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3360 // why do we need this initialization here at all?
3361 // Reduction clause cannot be used as a stand-alone directive.
3363 // do not call __kmp_serial_initialize(), it will be called by
3364 // __kmp_parallel_initialize() if needed
3365 // possible detection of false-positive race by the threadchecker ???
3366 if (!TCR_4(__kmp_init_parallel))
3367 __kmp_parallel_initialize();
3369 // check correctness of reduce block nesting
3370 #if KMP_USE_DYNAMIC_LOCK
3371 if (__kmp_env_consistency_check)
3372 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3374 if (__kmp_env_consistency_check)
3375 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3379 th = __kmp_thread_from_gtid(global_tid);
3380 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3381 #endif // OMP_40_ENABLED
3383 // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3384 // the value should be kept in a variable
3385 // the variable should be either a construct-specific or thread-specific
3386 // property, not a team specific property
3387 // (a thread can reach the next reduce block on the next construct, reduce
3388 // method may differ on the next construct)
3389 // an ident_t "loc" parameter could be used as a construct-specific property
3390 // (what if loc == 0?)
3391 // (if both construct-specific and team-specific variables were shared,
3392 // then unnecessary extra syncs would be needed)
3393 // a thread-specific variable is better regarding two issues above (next
3394 // construct and extra syncs)
3395 // a thread-specific "th_local.reduction_method" variable is used currently
3396 // each thread executes 'determine' and 'set' lines (no need to execute by one
3397 // thread, to avoid unnecessary extra syncs)
3399 packed_reduction_method = __kmp_determine_reduction_method(
3400 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3401 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3403 if (packed_reduction_method == critical_reduce_block) {
3405 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3408 } else if (packed_reduction_method == empty_reduce_block) {
3410 // usage: if team size == 1, no synchronization is required (Intel platforms only)
3414 } else if (packed_reduction_method == atomic_reduce_block) {
3418 // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3419 // won't be called by the code gen)
3420 // (this is not ideal: the checking block has been closed by this call,
3422 // but the atomic operation has not been executed yet; it will be executed
3423 // slightly later, literally on the next instruction)
3424 if (__kmp_env_consistency_check)
3425 __kmp_pop_sync(global_tid, ct_reduce, loc);
3427 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3428 tree_reduce_block)) {
3430 // AT: performance issue: a real barrier here
3431 // AT: (if master goes slow, other threads are blocked here waiting for the
3432 // master to come and release them)
3433 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3434 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3435 // be confusing to a customer)
3436 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3437 // AT: might go faster and be more in line with the sense of NOWAIT
3438 // AT: TODO: run the EPCC benchmark and compare times
3440 // this barrier should be invisible to a customer and to the threading profile
3441 // tool (it's neither a terminating barrier nor customer's code, it's
3442 // used for an internal purpose)
3444 // JP: can this barrier potentially lead to task scheduling?
3445 // JP: as long as there is a barrier in the implementation, OMPT should and
3446 // will provide the barrier events,
3447 // so we set up the necessary frame/return addresses.
3448 ompt_frame_t *ompt_frame;
3449 if (ompt_enabled.enabled) {
3450 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3451 if (ompt_frame->enter_frame.ptr == NULL)
3452 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3453 OMPT_STORE_RETURN_ADDRESS(global_tid);
3457 __kmp_threads[global_tid]->th.th_ident = loc;
3460 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3461 global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3462 retval = (retval != 0) ? (0) : (1);
3463 #if OMPT_SUPPORT && OMPT_OPTIONAL
3464 if (ompt_enabled.enabled) {
3465 ompt_frame->enter_frame = ompt_data_none;
3469 // all other workers except master should do this pop here
3470 // (none of the other workers will get to __kmpc_end_reduce_nowait())
3471 if (__kmp_env_consistency_check) {
3473 __kmp_pop_sync(global_tid, ct_reduce, loc);
3479 // should never reach this block
3480 KMP_ASSERT(0); // "unexpected method"
3483 if (teams_swapped) {
3484 __kmp_restore_swapped_teams(th, team, task_state);
3489 ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3490 global_tid, packed_reduction_method, retval));
3496 @ingroup SYNCHRONIZATION
3497 @param loc source location information
3498 @param global_tid global thread id.
3499 @param lck pointer to the unique lock data structure
3501 Finish the execution of a reduce nowait.
3503 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3504 kmp_critical_name *lck) {
3506 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3508 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3510 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3512 if (packed_reduction_method == critical_reduce_block) {
3514 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3516 } else if (packed_reduction_method == empty_reduce_block) {
3518 // usage: if team size == 1, no synchronization is required (on Intel platforms only)
3521 } else if (packed_reduction_method == atomic_reduce_block) {
3523 // neither master nor other workers should get here
3524 // (code gen does not generate this call in case 2: atomic reduce block)
3525 // actually it would be better to remove this else-if entirely;
3526 // after removal this value will be checked by the 'else' branch and will assert
3528 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3529 tree_reduce_block)) {
3531 // only master gets here
3535 // should never reach this block
3536 KMP_ASSERT(0); // "unexpected method"
3539 if (__kmp_env_consistency_check)
3540 __kmp_pop_sync(global_tid, ct_reduce, loc);
3542 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3543 global_tid, packed_reduction_method));
3548 /* 2.a.ii. Reduce Block with a terminating barrier */
3551 @ingroup SYNCHRONIZATION
3552 @param loc source location information
3553 @param global_tid global thread number
3554 @param num_vars number of items (variables) to be reduced
3555 @param reduce_size size of data in bytes to be reduced
3556 @param reduce_data pointer to data to be reduced
3557 @param reduce_func callback function providing reduction operation on two
3558 operands and returning result of reduction in lhs_data
3559 @param lck pointer to the unique lock data structure
3560 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3561 threads if atomic reduction needed
3563 A blocking reduce that includes an implicit barrier.
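Schematically (illustrative only), the blocking form follows the same calling
convention as __kmpc_reduce_nowait above, except that the construct ends with a
terminating barrier, executed inside __kmpc_reduce or __kmpc_end_reduce
depending on the reduction method selected:

    if (__kmpc_reduce(loc, gtid, 1, sizeof(int), &local_s, my_add,
                      &crit_name) == 1) {
      s += local_s;
      __kmpc_end_reduce(loc, gtid, &crit_name);
    }
    // Return values 0 and 2 have the same meaning as for the nowait variant;
    // how they pair with __kmpc_end_reduce is code-generator specific (the
    // method-specific branches of __kmpc_end_reduce below execute the
    // corresponding barriers).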
3565 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3566 size_t reduce_size, void *reduce_data,
3567 void (*reduce_func)(void *lhs_data, void *rhs_data),
3568 kmp_critical_name *lck) {
3569 KMP_COUNT_BLOCK(REDUCE_wait);
3571 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3575 int teams_swapped = 0, task_state;
3578 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3580 // why do we need this initialization here at all?
3581 // Reduction clause cannot be a stand-alone directive.
3583 // do not call __kmp_serial_initialize(), it will be called by
3584 // __kmp_parallel_initialize() if needed
3585 // possible detection of false-positive race by the threadchecker ???
3586 if (!TCR_4(__kmp_init_parallel))
3587 __kmp_parallel_initialize();
3589 // check correctness of reduce block nesting
3590 #if KMP_USE_DYNAMIC_LOCK
3591 if (__kmp_env_consistency_check)
3592 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3594 if (__kmp_env_consistency_check)
3595 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3599 th = __kmp_thread_from_gtid(global_tid);
3600 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3601 #endif // OMP_40_ENABLED
3603 packed_reduction_method = __kmp_determine_reduction_method(
3604 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3605 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3607 if (packed_reduction_method == critical_reduce_block) {
3609 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3612 } else if (packed_reduction_method == empty_reduce_block) {
3614 // usage: if team size == 1, no synchronization is required (Intel platforms only)
3618 } else if (packed_reduction_method == atomic_reduce_block) {
3622 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3623 tree_reduce_block)) {
3625 // case tree_reduce_block:
3626 // this barrier should be visible to a customer and to the threading profile
3627 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3629 ompt_frame_t *ompt_frame;
3630 if (ompt_enabled.enabled) {
3631 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3632 if (ompt_frame->enter_frame.ptr == NULL)
3633 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3634 OMPT_STORE_RETURN_ADDRESS(global_tid);
3638 __kmp_threads[global_tid]->th.th_ident =
3639 loc; // needed for correct notification of frames
3642 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3643 global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3644 retval = (retval != 0) ? (0) : (1);
3645 #if OMPT_SUPPORT && OMPT_OPTIONAL
3646 if (ompt_enabled.enabled) {
3647 ompt_frame->enter_frame = ompt_data_none;
3651 // all other workers except master should do this pop here
3652 // (none of the other workers, except the master, will enter __kmpc_end_reduce())
3653 if (__kmp_env_consistency_check) {
3654 if (retval == 0) { // 0: all other workers; 1: master
3655 __kmp_pop_sync(global_tid, ct_reduce, loc);
3661 // should never reach this block
3662 KMP_ASSERT(0); // "unexpected method"
3665 if (teams_swapped) {
3666 __kmp_restore_swapped_teams(th, team, task_state);
3671 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3672 global_tid, packed_reduction_method, retval));
3678 @ingroup SYNCHRONIZATION
3679 @param loc source location information
3680 @param global_tid global thread id.
3681 @param lck pointer to the unique lock data structure
3683 Finish the execution of a blocking reduce.
3684 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3687 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3688 kmp_critical_name *lck) {
3690 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3694 int teams_swapped = 0, task_state;
3697 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3700 th = __kmp_thread_from_gtid(global_tid);
3701 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3702 #endif // OMP_40_ENABLED
3704 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3706 // this barrier should be visible to a customer and to the threading profile
3707 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3709 if (packed_reduction_method == critical_reduce_block) {
3711 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3713 // TODO: implicit barrier: should be exposed
3715 ompt_frame_t *ompt_frame;
3716 if (ompt_enabled.enabled) {
3717 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3718 if (ompt_frame->enter_frame.ptr == NULL)
3719 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3720 OMPT_STORE_RETURN_ADDRESS(global_tid);
3724 __kmp_threads[global_tid]->th.th_ident = loc;
3726 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3727 #if OMPT_SUPPORT && OMPT_OPTIONAL
3728 if (ompt_enabled.enabled) {
3729 ompt_frame->enter_frame = ompt_data_none;
3733 } else if (packed_reduction_method == empty_reduce_block) {
3735 // usage: if team size==1, no synchronization is required (Intel platforms only)
3737 // TODO: implicit barrier: should be exposed
3739 ompt_frame_t *ompt_frame;
3740 if (ompt_enabled.enabled) {
3741 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3742 if (ompt_frame->enter_frame.ptr == NULL)
3743 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3744 OMPT_STORE_RETURN_ADDRESS(global_tid);
3748 __kmp_threads[global_tid]->th.th_ident = loc;
3750 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3751 #if OMPT_SUPPORT && OMPT_OPTIONAL
3752 if (ompt_enabled.enabled) {
3753 ompt_frame->enter_frame = ompt_data_none;
3757 } else if (packed_reduction_method == atomic_reduce_block) {
3760 ompt_frame_t *ompt_frame;
3761 if (ompt_enabled.enabled) {
3762 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3763 if (ompt_frame->enter_frame.ptr == NULL)
3764 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3765 OMPT_STORE_RETURN_ADDRESS(global_tid);
3768 // TODO: implicit barrier: should be exposed
3770 __kmp_threads[global_tid]->th.th_ident = loc;
3772 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3773 #if OMPT_SUPPORT && OMPT_OPTIONAL
3774 if (ompt_enabled.enabled) {
3775 ompt_frame->enter_frame = ompt_data_none;
3779 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3780 tree_reduce_block)) {
3782 // only master executes here (master releases all other workers)
3783 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3788 // should never reach this block
3789 KMP_ASSERT(0); // "unexpected method"
3792 if (teams_swapped) {
3793 __kmp_restore_swapped_teams(th, team, task_state);
3797 if (__kmp_env_consistency_check)
3798 __kmp_pop_sync(global_tid, ct_reduce, loc);
3800 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3801 global_tid, packed_reduction_method));
3806 #undef __KMP_GET_REDUCTION_METHOD
3807 #undef __KMP_SET_REDUCTION_METHOD
3809 /* end of interface to fast scalable reduce routines */
3811 kmp_uint64 __kmpc_get_taskid() {
3816 gtid = __kmp_get_gtid();
3820 thread = __kmp_thread_from_gtid(gtid);
3821 return thread->th.th_current_task->td_task_id;
3823 } // __kmpc_get_taskid
3825 kmp_uint64 __kmpc_get_parent_taskid() {
3829 kmp_taskdata_t *parent_task;
3831 gtid = __kmp_get_gtid();
3835 thread = __kmp_thread_from_gtid(gtid);
3836 parent_task = thread->th.th_current_task->td_parent;
3837 return (parent_task == NULL ? 0 : parent_task->td_task_id);
3839 } // __kmpc_get_parent_taskid
3843 @ingroup WORK_SHARING
3844 @param loc source location information.
3845 @param gtid global thread number.
3846 @param num_dims number of associated doacross loops.
3847 @param dims info on loops bounds.
3849 Initialize doacross loop information.
3850 Expect the compiler to send us inclusive bounds,
3851 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
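For example, that single loop would be described to the runtime roughly as
follows (illustrative; the field names match the kmp_dim usage in the code
below):

    struct kmp_dim dims[1];
    dims[0].lo = 2; // lower bound, inclusive
    dims[0].up = 8; // upper bound, inclusive (not 9)
    dims[0].st = 2; // loop stride
    __kmpc_doacross_init(loc, gtid, 1, dims);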
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
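
/* Worked example (illustrative, derived from the code above): for a
   two-dimensional doacross nest with
     dim 0: lo=2, up=8, st=2   (range 4; not stored, only used for trip count)
     dim 1: lo=0, up=9, st=1   (range 10)
   the private buffer th_doacross_info is laid out as
     [0]    = 2                             // number of dimensions
     [1]    = &sh_buf->doacross_num_done    // shared completion counter
     [2..4] = 2, 8, 2                       // lo/up/st of dim 0
     [5..8] = 10, 0, 9, 1                   // range/lo/up/st of dim 1
   and trace_count = 4 * 10 = 40 iterations, so the first thread allocates
   40 / 8 + 8 bytes of flags, one bit per iteration.
*/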

void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // negative increment
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}
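
/* Worked example (illustrative): with dim 0 (lo=2, st=2) and dim 1 (lo=0,
   st=1, range ln=10) as in the sketch after __kmpc_doacross_init(), the
   iteration vector vec = {6, 3} linearizes as
     iter_number = (6 - 2) / 2 = 2          // dim 0
     iter_number = 3 + 10 * 2 = 23          // dim 1: iter + ln * iter_number
   so the waiter spins on bit 23 % 32 = 23 of word 23 >> 5 = 0 of
   th_doacross_flags until the matching __kmpc_doacross_post() sets it.
*/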

void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // negative increment
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
#endif // OMP_45_ENABLED
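
/* Illustrative sketch (not part of this file): one plausible lowering of

     #pragma omp for ordered(1)
     for (i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink : i - 2)
       ... body ...
       #pragma omp ordered depend(source)
     }

   onto the four entry points above. The chunk bounds lb/ub come from the
   enclosing worksharing loop and are assumptions of this sketch.

   struct kmp_dim dim = {2, 8, 2};           // inclusive bounds, as noted above
   __kmpc_doacross_init(loc, gtid, 1, &dim);
   for (kmp_int64 i = lb; i <= ub; i += 2) {
     kmp_int64 sink = i - 2;
     __kmpc_doacross_wait(loc, gtid, &sink);  // depend(sink : i - 2); the
                                              // runtime ignores out-of-bounds
     ... body ...
     __kmpc_doacross_post(loc, gtid, &i);     // depend(source)
   }
   __kmpc_doacross_fini(loc, gtid);
*/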

#if OMP_50_ENABLED
int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}
#endif // OMP_50_ENABLED
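
// Illustrative note (assumption, not from this file): the value returned above
// mirrors the OMP_TARGET_OFFLOAD policy as encoded by kmp_target_offload_kind
// in kmp.h (0 = disabled, 1 = default, 2 = mandatory). A hypothetical caller:
//
//   int policy = __kmpc_get_target_offload();
//   if (policy == 2) {
//     // offload is mandatory: report an error instead of host fallback
//   }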