1 // z_Linux_asm.S: - microtasking routines specifically
2 // written for Intel platforms running Linux* OS
5 ////===----------------------------------------------------------------------===//
7 //// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 //// See https://llvm.org/LICENSE.txt for license information.
9 //// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 ////===----------------------------------------------------------------------===//
14 // -----------------------------------------------------------------------
16 // -----------------------------------------------------------------------
// Configuration plus the helper macros used by all IA-32 / Intel(R) 64
// routines in this file: symbol-name mangling (Mach-O wants a leading
// underscore, ELF does not), local-label naming, and thin wrappers over
// the DWARF .cfi_* unwind directives.
18 #include "kmp_config.h"
20 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
23 // the 'delay r16/r32/r64' should be used instead of the 'pause'.
24 // The delay operation has the effect of removing the current thread from
25 // the round-robin HT mechanism, and therefore speeds up the issue rate of
26 // the other threads on the same core.
28 // A value of 0 works fine for <= 2 threads per core, but causes the EPCC
29 // barrier time to increase greatly for 3 or more threads per core.
31 // A value of 100 works pretty well for up to 4 threads per core, but isn't
32 // quite as fast as 0 for 2 threads per core.
34 // We need to check what happens for oversubscription / > 4 threads per core.
35 // It is possible that we need to pass the delay value in as a parameter
36 // that the caller determines based on the total # threads / # cores.
// 0xF3,0x90 is the raw encoding of 'pause' (rep; nop).
43 # define pause_op .byte 0xf3,0x90
47 # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
48 # define KMP_LABEL(x) L_##x // form the name of label
// Darwin flavors of the CFI wrapper macros.
49 .macro KMP_CFI_DEF_OFFSET
53 .macro KMP_CFI_REGISTER
61 /* Not sure what .size does in icc, not sure if we need to do something
67 .globl KMP_PREFIX_UNDERSCORE($0)
68 KMP_PREFIX_UNDERSCORE($0):
70 # else // KMP_OS_DARWIN
71 # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
72 // Format labels so that they don't override function names in gdb's backtraces
73 // MIC assembler doesn't accept .L syntax, the L works fine there (as well as
76 # define KMP_LABEL(x) L_##x // local label
78 # define KMP_LABEL(x) .L_##x // local label hidden from backtraces
// DEBUG_INFO/PROC: emit the symbol visibility, type/size info, and the
// function's entry label so each routine below reads as PROC ... DEBUG_INFO.
83 .macro DEBUG_INFO proc
85 // Not sure why we need .type and .size for the functions
92 .globl KMP_PREFIX_UNDERSCORE(\proc)
93 KMP_PREFIX_UNDERSCORE(\proc):
// ELF flavors of the CFI wrappers: each expands to the matching .cfi_* form.
96 .macro KMP_CFI_DEF_OFFSET sz
97 .cfi_def_cfa_offset \sz
99 .macro KMP_CFI_OFFSET reg, sz
102 .macro KMP_CFI_REGISTER reg
103 .cfi_def_cfa_register \reg
105 .macro KMP_CFI_DEF reg, sz
106 .cfi_def_cfa \reg,\sz
108 # endif // KMP_OS_DARWIN
109 #endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
// Same helper-macro set as above, but for AArch64 targets: symbol mangling
// (underscore on Darwin, none on Linux), local-label naming, and DEBUG_INFO.
111 #if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
114 # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
115 # define KMP_LABEL(x) L_##x // form the name of label
122 /* Not sure what .size does in icc, not sure if we need to do something
129 .globl KMP_PREFIX_UNDERSCORE($0)
130 KMP_PREFIX_UNDERSCORE($0):
132 # else // KMP_OS_DARWIN
133 # define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Linux* OS symbols
134 // Format labels so that they don't override function names in gdb's backtraces
135 # define KMP_LABEL(x) .L_##x // local label hidden from backtraces
141 .macro DEBUG_INFO proc
143 // Not sure why we need .type and .size for the functions
145 .type \proc,@function
151 .globl KMP_PREFIX_UNDERSCORE(\proc)
152 KMP_PREFIX_UNDERSCORE(\proc):
155 # endif // KMP_OS_DARWIN
157 #endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
159 // -----------------------------------------------------------------------
161 // -----------------------------------------------------------------------
// Data for GOMP (libgomp ABI) compatibility: a 32-byte common block named
// ".gomp_critical_user_" plus a pointer-sized global holding its address.
// The '.' in the symbol name cannot be spelled in C, hence the assembly.
// The X86 branch stores a 4-byte address, the X86_64 branch an 8-byte one;
// within each, the Darwin path uses the extra-underscore symbol name.
163 #ifdef KMP_GOMP_COMPAT
165 // Support for unnamed common blocks.
167 // Because the symbol ".gomp_critical_user_" contains a ".", we have to
168 // put this stuff in assembly.
173 .comm .gomp_critical_user_,32
175 .globl ___kmp_unnamed_critical_addr
176 ___kmp_unnamed_critical_addr:
177 .long .gomp_critical_user_
178 # else /* Linux* OS */
180 .comm .gomp_critical_user_,32,8
183 .global __kmp_unnamed_critical_addr
184 __kmp_unnamed_critical_addr:
185 .4byte .gomp_critical_user_
186 .type __kmp_unnamed_critical_addr,@object
187 .size __kmp_unnamed_critical_addr,4
188 # endif /* KMP_OS_DARWIN */
189 # endif /* KMP_ARCH_X86 */
194 .comm .gomp_critical_user_,32
196 .globl ___kmp_unnamed_critical_addr
197 ___kmp_unnamed_critical_addr:
198 .quad .gomp_critical_user_
199 # else /* Linux* OS */
201 .comm .gomp_critical_user_,32,8
204 .global __kmp_unnamed_critical_addr
205 __kmp_unnamed_critical_addr:
206 .8byte .gomp_critical_user_
207 .type __kmp_unnamed_critical_addr,@object
208 .size __kmp_unnamed_critical_addr,8
209 # endif /* KMP_OS_DARWIN */
210 # endif /* KMP_ARCH_X86_64 */
212 #endif /* KMP_GOMP_COMPAT */
// IA-32 (32-bit x86) section. These routines use the cdecl convention:
// all arguments on the stack, return value in %eax (or %edx:%eax for 64-bit).
215 #if KMP_ARCH_X86 && !KMP_ARCH_PPC64
217 // -----------------------------------------------------------------------
218 // microtasking routines specifically written for IA-32 architecture
220 // -----------------------------------------------------------------------
222 .ident "Intel Corporation"
// void __kmp_x86_pause( void ): executes the spin-wait hint (see pause_op).
226 // __kmp_x86_pause( void );
234 DEBUG_INFO __kmp_x86_pause
// Hand-written atomics, used only when compiler intrinsics are unavailable.
236 # if !KMP_ASM_INTRINS
238 //------------------------------------------------------------------------
// kmp_int32 __kmp_test_then_add32(p, d): atomic fetch-and-add; returns the
// value of *p prior to the addition.
240 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
242 PROC __kmp_test_then_add32
250 DEBUG_INFO __kmp_test_then_add32
// Atomic exchange routines (IA-32): store "d" into *p and return the old
// value of *p in %al/%ax/%eax respectively.
252 //------------------------------------------------------------------------
253 // FUNCTION __kmp_xchg_fixed8
256 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
263 PROC __kmp_xchg_fixed8
265 movl 4(%esp), %ecx // "p"
266 movb 8(%esp), %al // "d"
272 DEBUG_INFO __kmp_xchg_fixed8
275 //------------------------------------------------------------------------
276 // FUNCTION __kmp_xchg_fixed16
279 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
285 PROC __kmp_xchg_fixed16
287 movl 4(%esp), %ecx // "p"
288 movw 8(%esp), %ax // "d"
294 DEBUG_INFO __kmp_xchg_fixed16
297 //------------------------------------------------------------------------
298 // FUNCTION __kmp_xchg_fixed32
301 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
308 PROC __kmp_xchg_fixed32
310 movl 4(%esp), %ecx // "p"
311 movl 8(%esp), %eax // "d"
317 DEBUG_INFO __kmp_xchg_fixed32
// Atomic compare-and-store (IA-32): store sv into *p iff *p == cv.
// Each routine returns 1 (success) or 0 (failure) in %eax, derived from
// the comparison result via the sete/and pair below.
321 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
322 PROC __kmp_compare_and_store8
329 sete %al // if %al == (%ecx) set %al = 1 else set %al = 0
330 and $1, %eax // mask %eax to 0/1 (zero-extends the flag into the return value)
333 DEBUG_INFO __kmp_compare_and_store8
336 // __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
337 PROC __kmp_compare_and_store16
344 sete %al // if %ax == (%ecx) set %al = 1 else set %al = 0
345 and $1, %eax // mask %eax to 0/1 (zero-extends the flag into the return value)
348 DEBUG_INFO __kmp_compare_and_store16
351 // __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
352 PROC __kmp_compare_and_store32
359 sete %al // if %eax == (%ecx) set %al = 1 else set %al = 0
360 and $1, %eax // mask %eax to 0/1 (zero-extends the flag into the return value)
363 DEBUG_INFO __kmp_compare_and_store32
// 64-bit variant: the comparand/exchange values occupy register pairs
// (%edx:%eax and %ecx:%ebx) as required by cmpxchg8b.
366 // __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
367 PROC __kmp_compare_and_store64
374 movl 12(%ebp), %eax // "cv" low order word
375 movl 16(%ebp), %edx // "cv" high order word
376 movl 20(%ebp), %ebx // "sv" low order word
377 movl 24(%ebp), %ecx // "sv" high order word
380 sete %al // if %edx:eax == (%edi) set %al = 1 else set %al = 0
381 and $1, %eax // mask %eax to 0/1 (zero-extends the flag into the return value)
388 DEBUG_INFO __kmp_compare_and_store64
// "_ret" variants of compare-and-store (IA-32): same atomic operation, but
// return the OLD value of *p instead of a success flag.
391 // __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
392 PROC __kmp_compare_and_store_ret8
401 DEBUG_INFO __kmp_compare_and_store_ret8
404 // __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
406 PROC __kmp_compare_and_store_ret16
415 DEBUG_INFO __kmp_compare_and_store_ret16
418 // __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
420 PROC __kmp_compare_and_store_ret32
429 DEBUG_INFO __kmp_compare_and_store_ret32
// 64-bit variant: register pairs as required by cmpxchg8b; old value is
// returned in %edx:%eax.
432 // __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
434 PROC __kmp_compare_and_store_ret64
441 movl 12(%ebp), %eax // "cv" low order word
442 movl 16(%ebp), %edx // "cv" high order word
443 movl 20(%ebp), %ebx // "sv" low order word
444 movl 24(%ebp), %ecx // "sv" high order word
453 DEBUG_INFO __kmp_compare_and_store_ret64
456 //------------------------------------------------------------------------
457 // FUNCTION __kmp_xchg_real32
// Atomic exchange of a 32-bit float: stores "data" into *addr and returns
// the previous contents of *addr.
460 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
467 PROC __kmp_xchg_real32
493 DEBUG_INFO __kmp_xchg_real32
495 # endif /* !KMP_ASM_INTRINS */
497 //------------------------------------------------------------------------
// IA-32 trampoline that calls an OpenMP microtask: passes &gtid and &tid as
// the first two arguments, then argc entries of p_argv[], publishes this
// frame via exit_frame_ptr for OMPT, and always returns 1.
499 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
500 // int gtid, int tid,
501 // int argc, void *p_argv[]
504 // void **exit_frame_ptr
508 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
511 // (*pkfn)( & gtid, & tid, argv[0], ... );
515 // -- Begin __kmp_invoke_microtask
517 PROC __kmp_invoke_microtask
521 KMP_CFI_OFFSET ebp,-8
522 movl %esp,%ebp // establish the base pointer for this routine.
524 subl $8,%esp // allocate space for two local variables.
525 // These variables are:
529 pushl %ebx // save %ebx to use during this routine
532 movl 28(%ebp),%ebx // get exit_frame address
533 movl %ebp,(%ebx) // save exit_frame
// Align the stack so %esp is on a 128-byte boundary at the call to pkfn.
536 movl 20(%ebp),%ebx // Stack alignment - # args
537 addl $2,%ebx // #args +2 Always pass at least 2 args (gtid and tid)
538 shll $2,%ebx // Number of bytes used on stack: (#args+2)*4
540 subl %ebx,%eax // %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
541 movl %eax,%ebx // Save to %ebx
542 andl $0xFFFFFF80,%eax // mask off 7 bits
543 subl %eax,%ebx // Amount to subtract from %esp
544 subl %ebx,%esp // Prepare the stack ptr --
545 // now it will be aligned on 128-byte boundary at the call
547 movl 24(%ebp),%eax // copy from p_argv[]
548 movl %eax,-4(%ebp) // into the local variable *argv.
550 movl 20(%ebp),%ebx // argc is 20(%ebp)
// Push p_argv[] entries in reverse order, then &tid and &gtid last so they
// become the first two parameters to pkfn.
555 jg KMP_LABEL(invoke_4)
556 jmp KMP_LABEL(invoke_3)
560 subl $4,%ebx // decrement argc.
561 addl %ebx,%eax // index into argv.
565 jmp KMP_LABEL(invoke_2)
568 leal 16(%ebp),%eax // push & tid
571 leal 12(%ebp),%eax // push & gtid
575 call *%ebx // call (*pkfn)();
577 movl $1,%eax // return 1;
579 movl -12(%ebp),%ebx // restore %ebx
584 DEBUG_INFO __kmp_invoke_microtask
585 // -- End __kmp_invoke_microtask
// kmp_uint64 __kmp_hardware_timestamp(void): reads the CPU timestamp
// counter; on IA-32 the 64-bit result is returned in %edx:%eax.
589 // __kmp_hardware_timestamp(void)
590 PROC __kmp_hardware_timestamp
594 DEBUG_INFO __kmp_hardware_timestamp
595 // -- End __kmp_hardware_timestamp
597 #endif /* KMP_ARCH_X86 */
// Intel(R) 64 (x86_64) section. These routines follow the System V AMD64
// ABI: integer args in %rdi, %rsi, %rdx, %rcx, %r8, %r9; return in %rax.
602 // -----------------------------------------------------------------------
603 // microtasking routines specifically written for IA-32 architecture and
604 // Intel(R) 64 running Linux* OS
605 // -----------------------------------------------------------------------
608 // mark_description "Intel Corporation";
609 .ident "Intel Corporation"
610 // -- .file "z_Linux_asm.S"
614 // To prevent getting our code into .data section .text added to every routine
615 // definition for x86_64.
616 //------------------------------------------------------------------------
// Hand-written atomics, used only when compiler intrinsics are unavailable.
617 # if !KMP_ASM_INTRINS
// Atomic fetch-and-add (x86_64): add "d" to *p and return the value of *p
// prior to the addition (p in %rdi, d in %esi/%rsi per the SysV ABI).
619 //------------------------------------------------------------------------
620 // FUNCTION __kmp_test_then_add32
623 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
631 PROC __kmp_test_then_add32
633 movl %esi, %eax // "d"
638 DEBUG_INFO __kmp_test_then_add32
641 //------------------------------------------------------------------------
642 // FUNCTION __kmp_test_then_add64
645 // __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
652 PROC __kmp_test_then_add64
654 movq %rsi, %rax // "d"
659 DEBUG_INFO __kmp_test_then_add64
// Atomic exchange routines (x86_64): store "d" (from %sil/%si/%esi/%rsi)
// into *p (%rdi) and return the old value of *p in %al/%ax/%eax/%rax.
662 //------------------------------------------------------------------------
663 // FUNCTION __kmp_xchg_fixed8
666 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
674 PROC __kmp_xchg_fixed8
676 movb %sil, %al // "d"
682 DEBUG_INFO __kmp_xchg_fixed8
685 //------------------------------------------------------------------------
686 // FUNCTION __kmp_xchg_fixed16
689 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
696 PROC __kmp_xchg_fixed16
704 DEBUG_INFO __kmp_xchg_fixed16
707 //------------------------------------------------------------------------
708 // FUNCTION __kmp_xchg_fixed32
711 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
719 PROC __kmp_xchg_fixed32
721 movl %esi, %eax // "d"
727 DEBUG_INFO __kmp_xchg_fixed32
730 //------------------------------------------------------------------------
731 // FUNCTION __kmp_xchg_fixed64
734 // __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
741 PROC __kmp_xchg_fixed64
743 movq %rsi, %rax // "d"
749 DEBUG_INFO __kmp_xchg_fixed64
// Atomic compare-and-store (x86_64): store sv into *p iff *p == cv.
// Each routine returns 1 (success) or 0 (failure) in %rax, derived from
// the comparison result via the sete/andq pair below.
752 //------------------------------------------------------------------------
753 // FUNCTION __kmp_compare_and_store8
756 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
765 PROC __kmp_compare_and_store8
767 movb %sil, %al // "cv"
770 sete %al // if %al == (%rdi) set %al = 1 else set %al = 0
771 andq $1, %rax // mask %rax to 0/1 (zero-extends the flag into the return value)
774 DEBUG_INFO __kmp_compare_and_store8
777 //------------------------------------------------------------------------
778 // FUNCTION __kmp_compare_and_store16
781 // __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
790 PROC __kmp_compare_and_store16
792 movw %si, %ax // "cv"
795 sete %al // if %ax == (%rdi) set %al = 1 else set %al = 0
796 andq $1, %rax // mask %rax to 0/1 (zero-extends the flag into the return value)
799 DEBUG_INFO __kmp_compare_and_store16
802 //------------------------------------------------------------------------
803 // FUNCTION __kmp_compare_and_store32
806 // __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
815 PROC __kmp_compare_and_store32
817 movl %esi, %eax // "cv"
820 sete %al // if %eax == (%rdi) set %al = 1 else set %al = 0
821 andq $1, %rax // mask %rax to 0/1 (zero-extends the flag into the return value)
824 DEBUG_INFO __kmp_compare_and_store32
827 //------------------------------------------------------------------------
828 // FUNCTION __kmp_compare_and_store64
831 // __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
839 PROC __kmp_compare_and_store64
841 movq %rsi, %rax // "cv"
844 sete %al // if %rax == (%rdi) set %al = 1 else set %al = 0
845 andq $1, %rax // mask %rax to 0/1 (zero-extends the flag into the return value)
848 DEBUG_INFO __kmp_compare_and_store64
// "_ret" variants of compare-and-store (x86_64): same atomic operation, but
// return the OLD value of *p (in %rax) instead of a success flag.
850 //------------------------------------------------------------------------
851 // FUNCTION __kmp_compare_and_store_ret8
854 // __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
863 PROC __kmp_compare_and_store_ret8
865 movb %sil, %al // "cv"
870 DEBUG_INFO __kmp_compare_and_store_ret8
873 //------------------------------------------------------------------------
874 // FUNCTION __kmp_compare_and_store_ret16
877 // __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
886 PROC __kmp_compare_and_store_ret16
888 movw %si, %ax // "cv"
893 DEBUG_INFO __kmp_compare_and_store_ret16
896 //------------------------------------------------------------------------
897 // FUNCTION __kmp_compare_and_store_ret32
900 // __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
909 PROC __kmp_compare_and_store_ret32
911 movl %esi, %eax // "cv"
916 DEBUG_INFO __kmp_compare_and_store_ret32
919 //------------------------------------------------------------------------
920 // FUNCTION __kmp_compare_and_store_ret64
923 // __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
931 PROC __kmp_compare_and_store_ret64
933 movq %rsi, %rax // "cv"
938 DEBUG_INFO __kmp_compare_and_store_ret64
940 # endif /* !KMP_ASM_INTRINS */
945 # if !KMP_ASM_INTRINS
// Atomic exchange of floating-point values (x86_64): the bit pattern is
// moved between %xmm0 and a GPR so the integer exchange can be used; the
// old value of *addr comes back through %xmm0.
947 //------------------------------------------------------------------------
948 // FUNCTION __kmp_xchg_real32
951 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
955 // data: %xmm0 (lower 4 bytes)
957 // return: %xmm0 (lower 4 bytes)
959 PROC __kmp_xchg_real32
961 movd %xmm0, %eax // load "data" to eax
966 movd %eax, %xmm0 // load old value into return register
970 DEBUG_INFO __kmp_xchg_real32
973 //------------------------------------------------------------------------
974 // FUNCTION __kmp_xchg_real64
977 // __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
981 // data: %xmm0 (lower 8 bytes)
982 // return: %xmm0 (lower 8 bytes)
984 PROC __kmp_xchg_real64
986 movd %xmm0, %rax // load "data" to rax
991 movd %rax, %xmm0 // load old value into return register
994 DEBUG_INFO __kmp_xchg_real64
997 # endif /* !KMP_MIC */
999 # endif /* !KMP_ASM_INTRINS */
1001 //------------------------------------------------------------------------
// x86_64 trampoline that calls an OpenMP microtask: passes &gtid and &tid
// as the first two arguments, the first four p_argv[] entries in registers
// (%rdx, %rcx, %r8, %r9) and the rest on the stack, publishes this frame
// via exit_frame_ptr for OMPT, and always returns 1.
// NOTE(review): both a branch-based (js/jmp/label) and a cmov-based variant
// of the argc clamping and register-argument selection appear below; they
// are presumably alternatives selected by preprocessor conditionals not
// visible here (e.g. KMP_MIC) — confirm against the full source.
1003 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1004 // int gtid, int tid,
1005 // int argc, void *p_argv[]
1008 // void **exit_frame_ptr
1012 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1015 // (*pkfn)( & gtid, & tid, argv[0], ... );
1019 // note: at call to pkfn must have %rsp 128-byte aligned for compiler
1030 // __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
1031 // __tid: tid parm pushed on stack so can pass &tid to pkfn
1034 // %rax: used all over the place
1035 // %rdx: used in stack pointer alignment calculation
1036 // %r11: used to traverse p_argv array
1037 // %rsi: used as temporary for stack parameters
1038 // used as temporary for number of pkfn parms to push
1039 // %rbx: used to hold pkfn address, and zero constant, callee-save
1041 // return: %eax (always 1/TRUE)
1045 // -- Begin __kmp_invoke_microtask
1048 PROC __kmp_invoke_microtask
1050 pushq %rbp // save base pointer
1051 KMP_CFI_DEF_OFFSET 16
1052 KMP_CFI_OFFSET rbp,-16
1053 movq %rsp,%rbp // establish the base pointer for this routine.
1054 KMP_CFI_REGISTER rbp
1057 movq %rbp, (%r9) // save exit_frame
1060 pushq %rbx // %rbx is callee-saved register
1061 pushq %rsi // Put gtid on stack so can pass &gtid to pkfn
1062 pushq %rdx // Put tid on stack so can pass &tid to pkfn
1064 movq %rcx, %rax // Stack alignment calculation begins; argc -> %rax
1065 movq $0, %rbx // constant for cmovs later
1066 subq $4, %rax // subtract four args passed in registers to pkfn
1068 js KMP_LABEL(kmp_0) // jump to movq
1069 jmp KMP_LABEL(kmp_0_exit) // jump ahead
1071 movq %rbx, %rax // zero negative value in %rax <- max(0, argc-4)
1072 KMP_LABEL(kmp_0_exit):
1074 cmovsq %rbx, %rax // zero negative value in %rax <- max(0, argc-4)
1077 movq %rax, %rsi // save max(0, argc-4) -> %rsi for later
1078 shlq $3, %rax // Number of bytes used on stack: max(0, argc-4)*8
1081 subq %rax, %rdx // %rsp-(max(0,argc-4)*8) -> %rdx --
1082 // without align, stack ptr would be this
1083 movq %rdx, %rax // Save to %rax
1085 andq $0xFFFFFFFFFFFFFF80, %rax // mask off lower 7 bits (128 bytes align)
1086 subq %rax, %rdx // Amount to subtract from %rsp
1087 subq %rdx, %rsp // Prepare the stack ptr --
1088 // now %rsp will align to 128-byte boundary at call site
1090 // setup pkfn parameter reg and stack
1091 movq %rcx, %rax // argc -> %rax
1093 je KMP_LABEL(kmp_invoke_pass_parms) // jump ahead if no parms to push
1094 shlq $3, %rcx // argc*8 -> %rcx
1095 movq %r8, %rdx // p_argv -> %rdx
1096 addq %rcx, %rdx // &p_argv[argc] -> %rdx
1098 movq %rsi, %rcx // max (0, argc-4) -> %rcx
1100 KMP_LABEL(kmp_invoke_push_parms):
1101 // push nth - 7th parms to pkfn on stack
1102 subq $8, %rdx // decrement p_argv pointer to previous parm
1103 movq (%rdx), %rsi // p_argv[%rcx-1] -> %rsi
1104 pushq %rsi // push p_argv[%rcx-1] onto stack (reverse order)
1107 // C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
1108 // if the name of the label that is an operand of this jecxz starts with a dot (".");
1109 // Apple's linker does not support 1-byte length relocation;
1110 // Resolution: replace all .labelX entries with L_labelX.
1112 jecxz KMP_LABEL(kmp_invoke_pass_parms) // stop when four p_argv[] parms left
1113 jmp KMP_LABEL(kmp_invoke_push_parms)
1115 KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers.
1116 // order here is important to avoid trashing
1117 // registers used for both input and output parms!
1118 movq %rdi, %rbx // pkfn -> %rbx
1119 leaq __gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
1120 leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn)
1122 movq %r8, %r11 // p_argv -> %r11
1125 cmpq $4, %rax // argc >= 4?
1126 jns KMP_LABEL(kmp_4) // jump to movq
1127 jmp KMP_LABEL(kmp_4_exit) // jump ahead
1129 movq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)
1130 KMP_LABEL(kmp_4_exit):
1132 cmpq $3, %rax // argc >= 3?
1133 jns KMP_LABEL(kmp_3) // jump to movq
1134 jmp KMP_LABEL(kmp_3_exit) // jump ahead
1136 movq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)
1137 KMP_LABEL(kmp_3_exit):
1139 cmpq $2, %rax // argc >= 2?
1140 jns KMP_LABEL(kmp_2) // jump to movq
1141 jmp KMP_LABEL(kmp_2_exit) // jump ahead
1143 movq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)
1144 KMP_LABEL(kmp_2_exit):
1146 cmpq $1, %rax // argc >= 1?
1147 jns KMP_LABEL(kmp_1) // jump to movq
1148 jmp KMP_LABEL(kmp_1_exit) // jump ahead
1150 movq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
1151 KMP_LABEL(kmp_1_exit):
1153 cmpq $4, %rax // argc >= 4?
1154 cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)
1156 cmpq $3, %rax // argc >= 3?
1157 cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)
1159 cmpq $2, %rax // argc >= 2?
1160 cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)
1162 cmpq $1, %rax // argc >= 1?
1163 cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
1166 call *%rbx // call (*pkfn)();
1167 movq $1, %rax // move 1 into return register;
1169 movq -8(%rbp), %rbx // restore %rbx using %rbp since %rsp was modified
1170 movq %rbp, %rsp // restore stack pointer
1171 popq %rbp // restore frame pointer
1175 DEBUG_INFO __kmp_invoke_microtask
1176 // -- End __kmp_invoke_microtask
// kmp_uint64 __kmp_hardware_timestamp(void): reads the CPU timestamp
// counter and returns it in %rax.
1179 // __kmp_hardware_timestamp(void)
1181 PROC __kmp_hardware_timestamp
1187 DEBUG_INFO __kmp_hardware_timestamp
1188 // -- End __kmp_hardware_timestamp
1190 //------------------------------------------------------------------------
1191 // FUNCTION __kmp_bsr32
// int __kmp_bsr32(int): bit-scan-reverse — position of the highest set bit.
1194 // __kmp_bsr32( int );
1201 DEBUG_INFO __kmp_bsr32
1203 // -----------------------------------------------------------------------
1204 #endif /* KMP_ARCH_X86_64 */
1207 #if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
1209 //------------------------------------------------------------------------
// AArch64 trampoline that calls an OpenMP microtask. AAPCS64: pkfn in x0,
// gtid in w1, tid in w2, argc in w3, p_argv in x4, exit_frame_ptr in x5;
// first arguments go in x0-x7, the remainder on the stack. Returns 1 in w0.
1211 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1212 // int gtid, int tid,
1213 // int argc, void *p_argv[]
1216 // void **exit_frame_ptr
1220 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1223 // (*pkfn)( & gtid, & tid, argv[0], ... );
1225 // // FIXME: This is done at call-site and can be removed here.
1227 // *exit_frame_ptr = 0;
1242 // __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
1243 // __tid: tid parm pushed on stack so can pass &tid to pkfn
1246 // x8: used to hold pkfn address
1247 // w9: used as temporary for number of pkfn parms
1248 // x10: used to traverse p_argv array
1249 // x11: used as temporary for stack placement calculation
1250 // x12: used as temporary for stack parameters
1251 // x19: used to preserve exit_frame_ptr, callee-save
1253 // return: w0 (always 1/TRUE)
1259 // -- Begin __kmp_invoke_microtask
1262 PROC __kmp_invoke_microtask
// Prologue: save frame record (x29/x30) and callee-saved x19/x20.
1264 stp x29, x30, [sp, #-16]!
1266 stp x19, x20, [sp, #-16]!
1271 add w9, w9, w3, lsr #1
1272 sub sp, sp, w9, uxtw #4
// Spill gtid/tid so their addresses can be passed to pkfn.
1276 str w1, [x29, #-__gtid]
1277 str w2, [x29, #-__tid]
1285 sub x0, x29, #__gtid
// Each cbz exits the argument-setup sequence when no parameters remain.
1288 cbz w9, KMP_LABEL(kmp_1)
1292 cbz w9, KMP_LABEL(kmp_1)
1296 cbz w9, KMP_LABEL(kmp_1)
1300 cbz w9, KMP_LABEL(kmp_1)
1304 cbz w9, KMP_LABEL(kmp_1)
1308 cbz w9, KMP_LABEL(kmp_1)
1313 cbz w9, KMP_LABEL(kmp_1)
// Epilogue: restore callee-saved registers and the frame record.
1323 ldp x19, x20, [sp], #16
1325 ldp x29, x30, [sp], #16
1328 DEBUG_INFO __kmp_invoke_microtask
1329 // -- End __kmp_invoke_microtask
1331 #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
1335 //------------------------------------------------------------------------
// PPC64 trampoline that calls an OpenMP microtask. Supports both the ELFv1
// ABI (function descriptors, .opd section, 48-byte parameter area) and the
// ELFv2 ABI (.localentry/TOC setup, 32-byte parameter area). Returns 1 in r3.
1337 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1338 // int gtid, int tid,
1339 // int argc, void *p_argv[]
1342 // void **exit_frame_ptr
1346 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1349 // (*pkfn)( & gtid, & tid, argv[0], ... );
1351 // // FIXME: This is done at call-site and can be removed here.
1353 // *exit_frame_ptr = 0;
1367 // return: r3 (always 1/TRUE)
1370 # if KMP_ARCH_PPC64_ELFv2
1373 .globl __kmp_invoke_microtask
1375 # if KMP_ARCH_PPC64_ELFv2
1381 .type __kmp_invoke_microtask,@function
1383 # if KMP_ARCH_PPC64_ELFv2
1384 __kmp_invoke_microtask:
// ELFv2 global entry point: materialize the TOC pointer in r2 from r12.
1387 addis 2, 12, .TOC.-.Lfunc_gep0@ha
1388 addi 2, 2, .TOC.-.Lfunc_gep0@l
1390 .localentry __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
// ELFv1: the function descriptor lives in the .opd section.
1392 .section .opd,"aw",@progbits
1393 __kmp_invoke_microtask:
1402 // -- Begin __kmp_invoke_microtask
1405 // We need to allocate a stack frame large enough to hold all of the parameters
1406 // on the stack for the microtask plus what this function needs. That's 48
1407 // bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1408 // parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1409 // and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1410 // to save r30 to hold a copy of r8.
1417 // This is unusual because normally we'd set r31 equal to r1 after the stack
1418 // frame is established. In this case, however, we need to dynamically compute
1419 // the stack frame size, and so we keep a direct copy of r1 to access our
1420 // register save areas and restore the r1 value before returning.
1422 .cfi_def_cfa_register r31
1426 // Compute the size necessary for the local stack frame.
1427 # if KMP_ARCH_PPC64_ELFv2
1436 // We need to make sure that the stack frame stays aligned (to 16 bytes, except
1437 // under the BG/Q CNK, where it must be to 32 bytes).
1445 // Establish the local stack frame.
1449 .cfi_offset r30, -16
1455 // Store gtid and tid to the stack because they're passed by reference to the microtask.
1495 // There are more than 6 microtask parameters, so we need to store the
1496 // remainder to the stack.
1500 // These are set to 8 bytes before the first desired store address (we're using
1501 // pre-increment loads and stores in the loop below). The parameter save area
1502 // for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1503 // 32 + 8*8 == 96 bytes above r1 for ELFv2.
1505 # if KMP_ARCH_PPC64_ELFv2
1517 # if KMP_ARCH_PPC64_ELFv2
1522 // For ELFv1, we need to load the actual function address from the function descriptor.
1533 # if KMP_ARCH_PPC64_ELFv2
1559 .size __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1562 // -- End __kmp_invoke_microtask
1564 #endif /* KMP_ARCH_PPC64 */
1566 #if KMP_ARCH_RISCV64
1568 //------------------------------------------------------------------------
// RISC-V 64 trampoline that calls an OpenMP microtask: first arguments in
// a0-a7, the remainder on a dynamically-sized stack area. Returns 1 in a0.
1570 // typedef void (*microtask_t)(int *gtid, int *tid, ...);
1572 // int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1576 // void **exit_frame_ptr
1580 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1583 // (*pkfn)(&gtid, &tid, argv[0], ...);
1594 // a5: exit_frame_ptr
1597 // __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1598 // __tid: tid param pushed on stack so can pass &tid to pkfn
1602 // t0: used to calculate the dynamic stack size / used to hold pkfn address
1603 // t1: used as temporary for stack placement calculation
1604 // t2: used as temporary for stack arguments
1605 // t3: used as temporary for number of remaining pkfn parms
1606 // t4: used to traverse p_argv array
1608 // return: a0 (always 1/TRUE)
1614 // -- Begin __kmp_invoke_microtask
1617 .globl __kmp_invoke_microtask
1619 .type __kmp_invoke_microtask,@function
1620 __kmp_invoke_microtask:
1623 // First, save ra and fp
1632 // Compute the dynamic stack size:
1634 // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
1636 // - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
1637 // function by register. Given that we have 8 of such registers (a[0-7])
1638 // and two + 'argc' arguments (consider &gtid and &tid), we need to
1639 // reserve max(0, argc - 6)*8 extra bytes
1641 // The total number of bytes is then max(0, argc - 6)*8 + 8
1643 // Compute max(0, argc - 6) using the following bithack:
1644 // max(0, x) = x - (x & (x >> 31)), where x := argc - 6
1645 // Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
1656 // Align the stack to 16 bytes
1664 // Save frame pointer into exit_frame
1668 // Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
1699 // Prepare any additional argument passed through the stack
1713 // Call pkfn function
1716 // Restore stack and return
1726 .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
1729 // -- End __kmp_invoke_microtask
1731 #endif /* KMP_ARCH_RISCV64 */
// GOMP-compatibility data for the remaining architectures: the 32-byte
// ".gomp_critical_user_" common block and a pointer-sized global holding
// its address (4 bytes on 32-bit ARM/MIPS, 8 bytes on the 64-bit targets).
1733 #if KMP_ARCH_ARM || KMP_ARCH_MIPS
1735 .comm .gomp_critical_user_,32,8
1738 .global __kmp_unnamed_critical_addr
1739 __kmp_unnamed_critical_addr:
1740 .4byte .gomp_critical_user_
1741 .size __kmp_unnamed_critical_addr,4
1742 #endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
1744 #if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
1746 .comm .gomp_critical_user_,32,8
1749 .global __kmp_unnamed_critical_addr
1750 __kmp_unnamed_critical_addr:
1751 .8byte .gomp_critical_user_
1752 .size __kmp_unnamed_critical_addr,8
1753 #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
// Mark the stack non-executable so the linker does not force an executable
// stack for objects containing this file.
1758 .section .note.GNU-stack,"",%progbits
1760 .section .note.GNU-stack,"",@progbits