1 // z_Linux_asm.S: - microtasking routines specifically
2 // written for Intel platforms running Linux* OS
5 ////===----------------------------------------------------------------------===//
7 //// The LLVM Compiler Infrastructure
9 //// This file is dual licensed under the MIT and the University of Illinois Open
10 //// Source Licenses. See LICENSE.txt for details.
12 ////===----------------------------------------------------------------------===//
15 // -----------------------------------------------------------------------
17 // -----------------------------------------------------------------------
19 #include "kmp_config.h"
21 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
24 // the 'delay r16/r32/r64' should be used instead of the 'pause'.
25 // The delay operation has the effect of removing the current thread from
26 // the round-robin HT mechanism, and therefore speeds up the issue rate of
27 // the other threads on the same core.
29 // A value of 0 works fine for <= 2 threads per core, but causes the EPCC
30 // barrier time to increase greatly for 3 or more threads per core.
32 // A value of 100 works pretty well for up to 4 threads per core, but isn't
33 // quite as fast as 0 for 2 threads per core.
35 // We need to check what happens for oversubscription / > 4 threads per core.
36 // It is possible that we need to pass the delay value in as a parameter
37 // that the caller determines based on the total # threads / # cores.
44 # define pause_op .byte 0xf3,0x90
48 # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
49 # define KMP_LABEL(x) L_##x // form the name of label
50 .macro KMP_CFI_DEF_OFFSET
54 .macro KMP_CFI_REGISTER
62 /* Not sure what .size does in icc, not sure if we need to do something
68 .globl KMP_PREFIX_UNDERSCORE($0)
69 KMP_PREFIX_UNDERSCORE($0):
71 # else // KMP_OS_DARWIN
72 # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
73 // Format labels so that they don't override function names in gdb's backtraces
74 // MIC assembler doesn't accept .L syntax, the L works fine there (as well as
77 # define KMP_LABEL(x) L_##x // local label
79 # define KMP_LABEL(x) .L_##x // local label hidden from backtraces
84 .macro DEBUG_INFO proc
86 // Not sure why we need .type and .size for the functions
93 .globl KMP_PREFIX_UNDERSCORE(\proc)
94 KMP_PREFIX_UNDERSCORE(\proc):
97 .macro KMP_CFI_DEF_OFFSET sz
98 .cfi_def_cfa_offset \sz
100 .macro KMP_CFI_OFFSET reg, sz
103 .macro KMP_CFI_REGISTER reg
104 .cfi_def_cfa_register \reg
106 .macro KMP_CFI_DEF reg, sz
107 .cfi_def_cfa \reg,\sz
109 # endif // KMP_OS_DARWIN
110 #endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
112 #if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
115 # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
116 # define KMP_LABEL(x) L_##x // form the name of label
123 /* Not sure what .size does in icc, not sure if we need to do something
130 .globl KMP_PREFIX_UNDERSCORE($0)
131 KMP_PREFIX_UNDERSCORE($0):
133 # else // KMP_OS_DARWIN
134 # define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Linux* OS symbols
135 // Format labels so that they don't override function names in gdb's backtraces
136 # define KMP_LABEL(x) .L_##x // local label hidden from backtraces
142 .macro DEBUG_INFO proc
144 // Not sure why we need .type and .size for the functions
146 .type \proc,@function
152 .globl KMP_PREFIX_UNDERSCORE(\proc)
153 KMP_PREFIX_UNDERSCORE(\proc):
156 # endif // KMP_OS_DARWIN
158 #endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
160 // -----------------------------------------------------------------------
162 // -----------------------------------------------------------------------
164 #ifdef KMP_GOMP_COMPAT
166 // Support for unnamed common blocks.
168 // Because the symbol ".gomp_critical_user_" contains a ".", we have to
169 // put this stuff in assembly.
174 .comm .gomp_critical_user_,32
176 .globl ___kmp_unnamed_critical_addr
177 ___kmp_unnamed_critical_addr:
178 .long .gomp_critical_user_
179 # else /* Linux* OS */
181 .comm .gomp_critical_user_,32,8
184 .global __kmp_unnamed_critical_addr
185 __kmp_unnamed_critical_addr:
186 .4byte .gomp_critical_user_
187 .type __kmp_unnamed_critical_addr,@object
188 .size __kmp_unnamed_critical_addr,4
189 # endif /* KMP_OS_DARWIN */
190 # endif /* KMP_ARCH_X86 */
195 .comm .gomp_critical_user_,32
197 .globl ___kmp_unnamed_critical_addr
198 ___kmp_unnamed_critical_addr:
199 .quad .gomp_critical_user_
200 # else /* Linux* OS */
202 .comm .gomp_critical_user_,32,8
205 .global __kmp_unnamed_critical_addr
206 __kmp_unnamed_critical_addr:
207 .8byte .gomp_critical_user_
208 .type __kmp_unnamed_critical_addr,@object
209 .size __kmp_unnamed_critical_addr,8
210 # endif /* KMP_OS_DARWIN */
211 # endif /* KMP_ARCH_X86_64 */
213 #endif /* KMP_GOMP_COMPAT */
216 #if KMP_ARCH_X86 && !KMP_ARCH_PPC64
218 // -----------------------------------------------------------------------
219 // microtasking routines specifically written for IA-32 architecture
221 // -----------------------------------------------------------------------
223 .ident "Intel Corporation"
227 // __kmp_x86_pause( void );
235 DEBUG_INFO __kmp_x86_pause
238 // __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
251 cpuid // Query the CPUID for the current processor
267 DEBUG_INFO __kmp_x86_cpuid
270 # if !KMP_ASM_INTRINS
272 //------------------------------------------------------------------------
274 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
276 PROC __kmp_test_then_add32
284 DEBUG_INFO __kmp_test_then_add32
286 //------------------------------------------------------------------------
287 // FUNCTION __kmp_xchg_fixed8
290 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
297 PROC __kmp_xchg_fixed8
299 movl 4(%esp), %ecx // "p"
300 movb 8(%esp), %al // "d"
306 DEBUG_INFO __kmp_xchg_fixed8
309 //------------------------------------------------------------------------
310 // FUNCTION __kmp_xchg_fixed16
313 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
319 PROC __kmp_xchg_fixed16
321 movl 4(%esp), %ecx // "p"
322 movw 8(%esp), %ax // "d"
328 DEBUG_INFO __kmp_xchg_fixed16
331 //------------------------------------------------------------------------
332 // FUNCTION __kmp_xchg_fixed32
335 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
342 PROC __kmp_xchg_fixed32
344 movl 4(%esp), %ecx // "p"
345 movl 8(%esp), %eax // "d"
351 DEBUG_INFO __kmp_xchg_fixed32
355 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
356 PROC __kmp_compare_and_store8
363 sete %al // if %al == (%ecx) set %al = 1 else set %al = 0
364 and $1, %eax // zero-extend the sete result: clear every bit of %eax except bit 0
367 DEBUG_INFO __kmp_compare_and_store8
370 // __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
371 PROC __kmp_compare_and_store16
378 sete %al // if %ax == (%ecx) set %al = 1 else set %al = 0
379 and $1, %eax // zero-extend the sete result: clear every bit of %eax except bit 0
382 DEBUG_INFO __kmp_compare_and_store16
385 // __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
386 PROC __kmp_compare_and_store32
393 sete %al // if %eax == (%ecx) set %al = 1 else set %al = 0
394 and $1, %eax // zero-extend the sete result: clear every bit of %eax except bit 0
397 DEBUG_INFO __kmp_compare_and_store32
400 // __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv);
401 PROC __kmp_compare_and_store64
408 movl 12(%ebp), %eax // "cv" low order word
409 movl 16(%ebp), %edx // "cv" high order word
410 movl 20(%ebp), %ebx // "sv" low order word
411 movl 24(%ebp), %ecx // "sv" high order word
414 sete %al // if %edx:%eax == (%edi) set %al = 1 else set %al = 0
415 and $1, %eax // zero-extend the sete result: clear every bit of %eax except bit 0
422 DEBUG_INFO __kmp_compare_and_store64
425 // __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
426 PROC __kmp_compare_and_store_ret8
435 DEBUG_INFO __kmp_compare_and_store_ret8
438 // __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
440 PROC __kmp_compare_and_store_ret16
449 DEBUG_INFO __kmp_compare_and_store_ret16
452 // __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
454 PROC __kmp_compare_and_store_ret32
463 DEBUG_INFO __kmp_compare_and_store_ret32
466 // __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
468 PROC __kmp_compare_and_store_ret64
475 movl 12(%ebp), %eax // "cv" low order word
476 movl 16(%ebp), %edx // "cv" high order word
477 movl 20(%ebp), %ebx // "sv" low order word
478 movl 24(%ebp), %ecx // "sv" high order word
487 DEBUG_INFO __kmp_compare_and_store_ret64
490 //------------------------------------------------------------------------
491 // FUNCTION __kmp_xchg_real32
494 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
501 PROC __kmp_xchg_real32
527 DEBUG_INFO __kmp_xchg_real32
529 # endif /* !KMP_ASM_INTRINS */
532 //------------------------------------------------------------------------
533 // FUNCTION __kmp_load_x87_fpu_control_word
536 // __kmp_load_x87_fpu_control_word( kmp_int16 *p );
540 PROC __kmp_load_x87_fpu_control_word
546 DEBUG_INFO __kmp_load_x87_fpu_control_word
549 //------------------------------------------------------------------------
550 // FUNCTION __kmp_store_x87_fpu_control_word
553 // __kmp_store_x87_fpu_control_word( kmp_int16 *p );
557 PROC __kmp_store_x87_fpu_control_word
563 DEBUG_INFO __kmp_store_x87_fpu_control_word
566 //------------------------------------------------------------------------
567 // FUNCTION __kmp_clear_x87_fpu_status_word
570 // __kmp_clear_x87_fpu_status_word();
571 PROC __kmp_clear_x87_fpu_status_word
576 DEBUG_INFO __kmp_clear_x87_fpu_status_word
579 //------------------------------------------------------------------------
580 // typedef void (*microtask_t)( int *gtid, int *tid, ... );
583 // __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
584 // int argc, void *p_argv[] ) {
585 // (*pkfn)( & gtid, & tid, argv[0], ... );
589 // -- Begin __kmp_invoke_microtask
591 PROC __kmp_invoke_microtask
595 KMP_CFI_OFFSET ebp,-8
596 movl %esp,%ebp // establish the base pointer for this routine.
598 subl $8,%esp // allocate space for two local variables.
599 // These variables are:
603 pushl %ebx // save %ebx to use during this routine
606 movl 28(%ebp),%ebx // get exit_frame address
607 movl %ebp,(%ebx) // save exit_frame
610 movl 20(%ebp),%ebx // Stack alignment - # args (argc is at 20(%ebp))
611 addl $2,%ebx // #args +2 Always pass at least 2 args (gtid and tid)
612 shll $2,%ebx // Number of bytes used on stack: (#args+2)*4
614 subl %ebx,%eax // %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
615 movl %eax,%ebx // Save to %ebx
616 andl $0xFFFFFF80,%eax // clear the low 7 bits: round down to a 128-byte boundary
617 subl %eax,%ebx // Amount to subtract from %esp
618 subl %ebx,%esp // Prepare the stack ptr --
619 // now it will be aligned on 128-byte boundary at the call
621 movl 24(%ebp),%eax // copy from p_argv[]
622 movl %eax,-4(%ebp) // into the local variable *argv.
624 movl 20(%ebp),%ebx // argc is 20(%ebp)
629 jg KMP_LABEL(invoke_4)
630 jmp KMP_LABEL(invoke_3)
634 subl $4,%ebx // decrement argc.
635 addl %ebx,%eax // index into argv.
639 jmp KMP_LABEL(invoke_2)
642 leal 16(%ebp),%eax // push & tid
645 leal 12(%ebp),%eax // push & gtid
649 call *%ebx // call (*pkfn)();
651 movl $1,%eax // return 1;
653 movl -12(%ebp),%ebx // restore %ebx (pushed just below the 8 bytes of locals)
658 DEBUG_INFO __kmp_invoke_microtask
659 // -- End __kmp_invoke_microtask
663 // __kmp_hardware_timestamp(void)
664 PROC __kmp_hardware_timestamp
668 DEBUG_INFO __kmp_hardware_timestamp
669 // -- End __kmp_hardware_timestamp
671 #endif /* KMP_ARCH_X86 */
676 // -----------------------------------------------------------------------
677 // microtasking routines specifically written for IA-32 architecture and
678 // Intel(R) 64 running Linux* OS
679 // -----------------------------------------------------------------------
682 // mark_description "Intel Corporation";
683 .ident "Intel Corporation"
684 // -- .file "z_Linux_asm.S"
688 // To prevent getting our code into .data section .text added to every routine
689 // definition for x86_64.
690 //------------------------------------------------------------------------
691 // FUNCTION __kmp_x86_cpuid
694 // __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
699 // cpuid_buffer: %rdx
705 pushq %rbx // callee-save register
707 movl %esi, %ecx // "mode2"
708 movl %edi, %eax // "mode"
709 movq %rdx, %rsi // cpuid_buffer
710 cpuid // Query the CPUID for the current processor
712 movl %eax, 0(%rsi) // store results into buffer
717 popq %rbx // callee-save register
722 DEBUG_INFO __kmp_x86_cpuid
726 # if !KMP_ASM_INTRINS
728 //------------------------------------------------------------------------
729 // FUNCTION __kmp_test_then_add32
732 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
740 PROC __kmp_test_then_add32
742 movl %esi, %eax // "d"
747 DEBUG_INFO __kmp_test_then_add32
750 //------------------------------------------------------------------------
751 // FUNCTION __kmp_test_then_add64
754 // __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
761 PROC __kmp_test_then_add64
763 movq %rsi, %rax // "d"
768 DEBUG_INFO __kmp_test_then_add64
771 //------------------------------------------------------------------------
772 // FUNCTION __kmp_xchg_fixed8
775 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
783 PROC __kmp_xchg_fixed8
785 movb %sil, %al // "d"
791 DEBUG_INFO __kmp_xchg_fixed8
794 //------------------------------------------------------------------------
795 // FUNCTION __kmp_xchg_fixed16
798 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
805 PROC __kmp_xchg_fixed16
813 DEBUG_INFO __kmp_xchg_fixed16
816 //------------------------------------------------------------------------
817 // FUNCTION __kmp_xchg_fixed32
820 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
828 PROC __kmp_xchg_fixed32
830 movl %esi, %eax // "d"
836 DEBUG_INFO __kmp_xchg_fixed32
839 //------------------------------------------------------------------------
840 // FUNCTION __kmp_xchg_fixed64
843 // __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
850 PROC __kmp_xchg_fixed64
852 movq %rsi, %rax // "d"
858 DEBUG_INFO __kmp_xchg_fixed64
861 //------------------------------------------------------------------------
862 // FUNCTION __kmp_compare_and_store8
865 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
874 PROC __kmp_compare_and_store8
876 movb %sil, %al // "cv"
879 sete %al // if %al == (%rdi) set %al = 1 else set %al = 0
880 andq $1, %rax // zero-extend the sete result: keep only bit 0 of %rax for return value
883 DEBUG_INFO __kmp_compare_and_store8
886 //------------------------------------------------------------------------
887 // FUNCTION __kmp_compare_and_store16
890 // __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
899 PROC __kmp_compare_and_store16
901 movw %si, %ax // "cv"
904 sete %al // if %ax == (%rdi) set %al = 1 else set %al = 0
905 andq $1, %rax // zero-extend the sete result: keep only bit 0 of %rax for return value
908 DEBUG_INFO __kmp_compare_and_store16
911 //------------------------------------------------------------------------
912 // FUNCTION __kmp_compare_and_store32
915 // __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
924 PROC __kmp_compare_and_store32
926 movl %esi, %eax // "cv"
929 sete %al // if %eax == (%rdi) set %al = 1 else set %al = 0
930 andq $1, %rax // zero-extend the sete result: keep only bit 0 of %rax for return value
933 DEBUG_INFO __kmp_compare_and_store32
936 //------------------------------------------------------------------------
937 // FUNCTION __kmp_compare_and_store64
940 // __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
948 PROC __kmp_compare_and_store64
950 movq %rsi, %rax // "cv"
953 sete %al // if %rax == (%rdi) set %al = 1 else set %al = 0
954 andq $1, %rax // zero-extend the sete result: keep only bit 0 of %rax for return value
957 DEBUG_INFO __kmp_compare_and_store64
959 //------------------------------------------------------------------------
960 // FUNCTION __kmp_compare_and_store_ret8
963 // __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
972 PROC __kmp_compare_and_store_ret8
974 movb %sil, %al // "cv"
979 DEBUG_INFO __kmp_compare_and_store_ret8
982 //------------------------------------------------------------------------
983 // FUNCTION __kmp_compare_and_store_ret16
986 // __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
995 PROC __kmp_compare_and_store_ret16
997 movw %si, %ax // "cv"
1002 DEBUG_INFO __kmp_compare_and_store_ret16
1005 //------------------------------------------------------------------------
1006 // FUNCTION __kmp_compare_and_store_ret32
1009 // __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
1018 PROC __kmp_compare_and_store_ret32
1020 movl %esi, %eax // "cv"
1022 cmpxchgl %edx,(%rdi)
1025 DEBUG_INFO __kmp_compare_and_store_ret32
1028 //------------------------------------------------------------------------
1029 // FUNCTION __kmp_compare_and_store_ret64
1032 // __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
1040 PROC __kmp_compare_and_store_ret64
1042 movq %rsi, %rax // "cv"
1044 cmpxchgq %rdx,(%rdi)
1047 DEBUG_INFO __kmp_compare_and_store_ret64
1049 # endif /* !KMP_ASM_INTRINS */
1054 # if !KMP_ASM_INTRINS
1056 //------------------------------------------------------------------------
1057 // FUNCTION __kmp_xchg_real32
1060 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
1064 // data: %xmm0 (lower 4 bytes)
1066 // return: %xmm0 (lower 4 bytes)
1068 PROC __kmp_xchg_real32
1070 movd %xmm0, %eax // load "data" to eax
1075 movd %eax, %xmm0 // load old value into return register
1079 DEBUG_INFO __kmp_xchg_real32
1082 //------------------------------------------------------------------------
1083 // FUNCTION __kmp_xchg_real64
1086 // __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
1090 // data: %xmm0 (lower 8 bytes)
1091 // return: %xmm0 (lower 8 bytes)
1093 PROC __kmp_xchg_real64
1095 movd %xmm0, %rax // load "data" to rax
1100 movd %rax, %xmm0 // load old value into return register
1103 DEBUG_INFO __kmp_xchg_real64
1106 # endif /* !KMP_MIC */
1108 # endif /* !KMP_ASM_INTRINS */
1111 //------------------------------------------------------------------------
1112 // FUNCTION __kmp_load_x87_fpu_control_word
1115 // __kmp_load_x87_fpu_control_word( kmp_int16 *p );
1120 PROC __kmp_load_x87_fpu_control_word
1125 DEBUG_INFO __kmp_load_x87_fpu_control_word
1128 //------------------------------------------------------------------------
1129 // FUNCTION __kmp_store_x87_fpu_control_word
1132 // __kmp_store_x87_fpu_control_word( kmp_int16 *p );
1137 PROC __kmp_store_x87_fpu_control_word
1142 DEBUG_INFO __kmp_store_x87_fpu_control_word
1145 //------------------------------------------------------------------------
1146 // FUNCTION __kmp_clear_x87_fpu_status_word
1149 // __kmp_clear_x87_fpu_status_word();
1151 PROC __kmp_clear_x87_fpu_status_word
1154 // TODO: remove the workaround for problem with fnclex instruction (no CQ known)
1155 fstenv -32(%rsp) // store FP env
1156 andw $~0x80ff, 4-32(%rsp) // clear 0-7,15 bits of FP SW
1157 fldenv -32(%rsp) // load FP env back
1164 DEBUG_INFO __kmp_clear_x87_fpu_status_word
1167 //------------------------------------------------------------------------
1168 // typedef void (*microtask_t)( int *gtid, int *tid, ... );
1171 // __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
1172 // int gtid, int tid,
1173 // int argc, void *p_argv[] ) {
1174 // (*pkfn)( & gtid, & tid, argv[0], ... );
1178 // note: at call to pkfn must have %rsp 128-byte aligned for compiler
1189 // __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
1190 // __tid: tid parm pushed on stack so can pass &tid to pkfn
1193 // %rax: used all over the place
1194 // %rdx: used in stack pointer alignment calculation
1195 // %r11: used to traverse p_argv array
1196 // %rsi: used as temporary for stack parameters
1197 // used as temporary for number of pkfn parms to push
1198 // %rbx: used to hold pkfn address, and zero constant, callee-save
1200 // return: %eax (always 1/TRUE)
1204 // -- Begin __kmp_invoke_microtask
1207 PROC __kmp_invoke_microtask
// NOTE(review): two equivalent forms appear below, both for computing
// max(0, argc-4) and for loading p_argv[0..3] into the parameter registers:
// a branch-based form (js/jns + jmp + movq) and a conditional-move form
// (cmovsq/cmovnsq). Presumably only one of them is assembled for a given
// target -- confirm against the conditional-compilation guards.
1209 pushq %rbp // save base pointer
1210 KMP_CFI_DEF_OFFSET 16
1211 KMP_CFI_OFFSET rbp,-16
1212 movq %rsp,%rbp // establish the base pointer for this routine.
1213 KMP_CFI_REGISTER rbp
1216 movq %rbp, (%r9) // save exit_frame
1219 pushq %rbx // %rbx is callee-saved register
1220 pushq %rsi // Put gtid on stack so can pass &gtid to pkfn
1221 pushq %rdx // Put tid on stack so can pass &tid to pkfn
1223 movq %rcx, %rax // Stack alignment calculation begins; argc -> %rax
1224 movq $0, %rbx // constant for cmovs later
1225 subq $4, %rax // subtract four args passed in registers to pkfn
1227 js KMP_LABEL(kmp_0) // jump to movq
1228 jmp KMP_LABEL(kmp_0_exit) // jump ahead
1230 movq %rbx, %rax // zero negative value in %rax <- max(0, argc-4)
1231 KMP_LABEL(kmp_0_exit):
1233 cmovsq %rbx, %rax // zero negative value in %rax <- max(0, argc-4)
1236 movq %rax, %rsi // save max(0, argc-4) -> %rsi for later
1237 shlq $3, %rax // Number of bytes used on stack: max(0, argc-4)*8
1240 subq %rax, %rdx // %rsp-(max(0,argc-4)*8) -> %rdx --
1241 // without align, stack ptr would be this
1242 movq %rdx, %rax // Save to %rax
1244 andq $0xFFFFFFFFFFFFFF80, %rax // mask off lower 7 bits (128 bytes align)
1245 subq %rax, %rdx // Amount to subtract from %rsp
1246 subq %rdx, %rsp // Prepare the stack ptr --
1247 // now %rsp will align to 128-byte boundary at call site
1249 // setup pkfn parameter reg and stack
1250 movq %rcx, %rax // argc -> %rax
1252 je KMP_LABEL(kmp_invoke_pass_parms) // jump ahead if no parms to push
1253 shlq $3, %rcx // argc*8 -> %rcx
1254 movq %r8, %rdx // p_argv -> %rdx
1255 addq %rcx, %rdx // &p_argv[argc] -> %rdx
1257 movq %rsi, %rcx // max (0, argc-4) -> %rcx
1259 KMP_LABEL(kmp_invoke_push_parms):
1260 // push nth - 7th parms to pkfn on stack
1261 subq $8, %rdx // decrement p_argv pointer to previous parm
1262 movq (%rdx), %rsi // p_argv[%rcx-1] -> %rsi
1263 pushq %rsi // push p_argv[%rcx-1] onto stack (reverse order)
1266 // C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
1267 // if the name of the label that is an operand of this jecxz starts with a dot (".");
1268 // Apple's linker does not support 1-byte length relocation;
1269 // Resolution: replace all .labelX entries with L_labelX.
1271 jecxz KMP_LABEL(kmp_invoke_pass_parms) // stop when four p_argv[] parms left
1272 jmp KMP_LABEL(kmp_invoke_push_parms)
1274 KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers.
1275 // order here is important to avoid trashing
1276 // registers used for both input and output parms!
1277 movq %rdi, %rbx // pkfn -> %rbx
1278 leaq __gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
1279 leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn)
1281 movq %r8, %r11 // p_argv -> %r11
1284 cmpq $4, %rax // argc >= 4?
1285 jns KMP_LABEL(kmp_4) // jump to movq
1286 jmp KMP_LABEL(kmp_4_exit) // jump ahead
1288 movq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)
1289 KMP_LABEL(kmp_4_exit):
1291 cmpq $3, %rax // argc >= 3?
1292 jns KMP_LABEL(kmp_3) // jump to movq
1293 jmp KMP_LABEL(kmp_3_exit) // jump ahead
1295 movq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)
1296 KMP_LABEL(kmp_3_exit):
1298 cmpq $2, %rax // argc >= 2?
1299 jns KMP_LABEL(kmp_2) // jump to movq
1300 jmp KMP_LABEL(kmp_2_exit) // jump ahead
1302 movq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)
1303 KMP_LABEL(kmp_2_exit):
1305 cmpq $1, %rax // argc >= 1?
1306 jns KMP_LABEL(kmp_1) // jump to movq
1307 jmp KMP_LABEL(kmp_1_exit) // jump ahead
1309 movq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
1310 KMP_LABEL(kmp_1_exit):
1312 cmpq $4, %rax // argc >= 4?
1313 cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)
1315 cmpq $3, %rax // argc >= 3?
1316 cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)
1318 cmpq $2, %rax // argc >= 2?
1319 cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)
1321 cmpq $1, %rax // argc >= 1?
1322 cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
1325 call *%rbx // call (*pkfn)();
1326 movq $1, %rax // move 1 into return register;
1328 movq -8(%rbp), %rbx // restore %rbx using %rbp since %rsp was modified
1329 movq %rbp, %rsp // restore stack pointer
1330 popq %rbp // restore frame pointer
1334 DEBUG_INFO __kmp_invoke_microtask
1335 // -- End __kmp_invoke_microtask
1338 // __kmp_hardware_timestamp(void)
1340 PROC __kmp_hardware_timestamp
1346 DEBUG_INFO __kmp_hardware_timestamp
1347 // -- End __kmp_hardware_timestamp
1349 //------------------------------------------------------------------------
1350 // FUNCTION __kmp_bsr32
1353 // __kmp_bsr32( int );
1360 DEBUG_INFO __kmp_bsr32
1363 // -----------------------------------------------------------------------
1364 #endif /* KMP_ARCH_X86_64 */
1367 #if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
1369 //------------------------------------------------------------------------
1371 // typedef void (*microtask_t)( int *gtid, int *tid, ... );
1374 // __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
1375 // int gtid, int tid,
1376 // int argc, void *p_argv[] ) {
1377 // (*pkfn)( & gtid, & tid, argv[0], ... );
1390 // __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
1391 // __tid: tid parm pushed on stack so can pass &tid to pkfn
1394 // x8: used to hold pkfn address
1395 // w9: used as temporary for number of pkfn parms
1396 // x10: used to traverse p_argv array
1397 // x11: used as temporary for stack placement calculation
1398 // x12: used as temporary for stack parameters
1399 // x19: used to preserve exit_frame_ptr, callee-save
1401 // return: w0 (always 1/TRUE)
1407 // -- Begin __kmp_invoke_microtask
1410 PROC __kmp_invoke_microtask
1412 stp x29, x30, [sp, #-16]!
1414 stp x19, x20, [sp, #-16]!
1419 add w9, w9, w3, lsr #1
1420 sub sp, sp, w9, lsl #4
1424 str w1, [x29, #-__gtid]
1425 str w2, [x29, #-__tid]
1433 sub x0, x29, #__gtid
1436 cbz w9, KMP_LABEL(kmp_1)
1440 cbz w9, KMP_LABEL(kmp_1)
1444 cbz w9, KMP_LABEL(kmp_1)
1448 cbz w9, KMP_LABEL(kmp_1)
1452 cbz w9, KMP_LABEL(kmp_1)
1456 cbz w9, KMP_LABEL(kmp_1)
1461 cbz w9, KMP_LABEL(kmp_1)
1471 ldp x19, x20, [sp], #16
1473 ldp x29, x30, [sp], #16
1476 DEBUG_INFO __kmp_invoke_microtask
1477 // -- End __kmp_invoke_microtask
1479 #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
1483 //------------------------------------------------------------------------
1485 // typedef void (*microtask_t)( int *gtid, int *tid, ... );
1488 // __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
1489 // int gtid, int tid,
1490 // int argc, void *p_argv[] ) {
1491 // (*pkfn)( & gtid, & tid, argv[0], ... );
1503 // return: r3 (always 1/TRUE)
1506 # if KMP_ARCH_PPC64_LE
1509 .globl __kmp_invoke_microtask
1511 # if KMP_ARCH_PPC64_LE
1517 .type __kmp_invoke_microtask,@function
1519 # if KMP_ARCH_PPC64_LE
1520 __kmp_invoke_microtask:
1523 addis 2, 12, .TOC.-.Lfunc_gep0@ha
1524 addi 2, 2, .TOC.-.Lfunc_gep0@l
1526 .localentry __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
1528 .section .opd,"aw",@progbits
1529 __kmp_invoke_microtask:
1538 // -- Begin __kmp_invoke_microtask
1541 // We need to allocate a stack frame large enough to hold all of the parameters
1542 // on the stack for the microtask plus what this function needs. That's 48
1543 // bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1544 // parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1545 // and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1546 // to save r30 to hold a copy of r8.
1553 // This is unusual because normally we'd set r31 equal to r1 after the stack
1554 // frame is established. In this case, however, we need to dynamically compute
1555 // the stack frame size, and so we keep a direct copy of r1 to access our
1556 // register save areas and restore the r1 value before returning.
1558 .cfi_def_cfa_register r31
1562 // Compute the size necessary for the local stack frame.
1563 # if KMP_ARCH_PPC64_LE
1572 // We need to make sure that the stack frame stays aligned (to 16 bytes, except
1573 // under the BG/Q CNK, where it must be to 32 bytes).
1581 // Establish the local stack frame.
1585 .cfi_offset r30, -16
1591 // Store gtid and tid to the stack because they're passed by reference to the microtask.
1631 // There are more than 6 microtask parameters, so we need to store the
1632 // remainder to the stack.
1636 // These are set to 8 bytes before the first desired store address (we're using
1637 // pre-increment loads and stores in the loop below). The parameter save area
1638 // for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1639 // 32 + 8*8 == 96 bytes above r1 for ELFv2.
1641 # if KMP_ARCH_PPC64_LE
1653 # if KMP_ARCH_PPC64_LE
1658 // For ELFv1, we need to load the actual function address from the function descriptor.
1669 # if KMP_ARCH_PPC64_LE
1695 .size __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1698 // -- End __kmp_invoke_microtask
1700 #endif /* KMP_ARCH_PPC64 */
1702 #if KMP_ARCH_ARM || KMP_ARCH_MIPS
1704 .comm .gomp_critical_user_,32,8
1707 .global __kmp_unnamed_critical_addr
1708 __kmp_unnamed_critical_addr:
1709 .4byte .gomp_critical_user_
1710 .size __kmp_unnamed_critical_addr,4
1711 #endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
1713 #if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
1715 .comm .gomp_critical_user_,32,8
1718 .global __kmp_unnamed_critical_addr
1719 __kmp_unnamed_critical_addr:
1720 .8byte .gomp_critical_user_
1721 .size __kmp_unnamed_critical_addr,8
1722 #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 */
1726 .section .note.GNU-stack,"",%progbits
1728 .section .note.GNU-stack,"",@progbits