contrib/llvm-project/compiler-rt/lib/builtins/i386/floatundisf.S

   1 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   2 // See https://llvm.org/LICENSE.txt for license information.
   3 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   4
   5 #include "../assembly.h"
   6
   7 // float __floatundisf(du_int a);
   8
   9 // Note that there is a hardware instruction, fildll, that does most of what
  10 // this function needs to do.  However, because of our ia32 ABI, it will take
  11 // a write-small read-large stall, so the software implementation here is
  12 // actually several cycles faster.
  13
  14 // This is a branch-free implementation.  A branchy implementation might be
  15 // faster for the common case if you know something a priori about the input
  16 // distribution.
  17
  18 /* Disabled alternative: branch-free x87 implementation - one cycle slower than the x87-free version below.
  19
  20 #ifdef __i386__
  21
  22 CONST_SECTION
  23 .balign 3
  24
  25                 .quad   0x43f0000000000000
  26 twop64: .quad   0x0000000000000000
  27
  28 #define                 TWOp64                  twop64-0b(%ecx,%eax,8)
  29
  30 .text
  31 .balign 4
  32 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
  33         movl            8(%esp),                %eax
  34         movd            8(%esp),                %xmm1
  35         movd            4(%esp),                %xmm0
  36         punpckldq       %xmm1,                  %xmm0
  37         calll           0f
  38 0:      popl            %ecx
  39         sarl            $31,                    %eax
  40         movq            %xmm0,                  4(%esp)
  41         fildll          4(%esp)
  42         faddl           TWOp64
  43         fstps           4(%esp)
  44         flds            4(%esp)
  45         ret
  46 END_COMPILERRT_FUNCTION(__floatundisf)
  47
  48 #endif // __i386__
  49
  50 */
  51
  52 // branch-free, x87-free implementation - faster at the expense of code size
  53
  54 #ifdef __i386__
  55
  56 CONST_SECTION
  57
     // Constant pool for __floatundisf, addressed PC-relatively through the
     // TWOp52 / STICKY macros defined at the end of this section.
     //
     // The relative layout is load-bearing: STICKY indexes from `sticky` by
     // %eax * 8 with %eax being 0 or -1, so it reads either the quad at
     // `sticky` or the quad 8 bytes *before* it.  Because `twop52` is 16-byte
     // aligned and exactly 16 bytes long, `sticky` starts immediately after
     // it and sticky-8 is the second quad of `twop52`.  Do not insert or
     // reorder data between these two labels.
  58         .balign 16
  59 twop52:
  60         .quad 0x4330000000000000   // bit pattern of the double 0x1.0p52
  61         .quad 0x0000000000000fff   // low-12-bit mask; read as sticky-8 (STICKY with %eax == -1)
  62
  63         .balign 16
  64 sticky:
  65         .quad 0x0000000000000000   // all-zero mask; read as sticky+0 (STICKY with %eax == 0)
  66         .long 0x00000012           // NOTE(review): not read by any instruction visible in this file
  67
  68         .balign 16
  69 twelve:
  70         .long 0x00000000           // NOTE(review): `twelve` is not referenced by the visible code
  71
     // Both macros expect %ecx to hold the run-time address of the nearest
     // preceding local label `0` (obtained with a call/pop pair); the
     // displacement `sym-0b` then turns that base into the address of `sym`.
     // STICKY additionally expects %eax to be 0 or -1 and scales it by 8 to
     // select one of the two 8-byte masks described above.
  72 #define                 TWOp52                  twop52-0b(%ecx)
  73 #define                 STICKY                  sticky-0b(%ecx,%eax,8)
  74
  75 .text
  76 .balign 4
     // float __floatundisf(du_int a)
     //
     // Converts an unsigned 64-bit integer to single precision without
     // branches and without x87 arithmetic (x87 is used only for the final
     // load of the result).
     //
     // In:      4(%esp) = low 32 bits of a, 8(%esp) = high 32 bits of a
     // Out:     result is written to 4(%esp) and loaded onto the x87 stack (flds)
     // Scratch: %eax, %ecx, %edx, %xmm0-%xmm3, flags; the argument slot at
     //          4(%esp) is overwritten with the result
     //
     // Method:  "big input" below means a >= 2^52.  A big input is shifted
     //          right by 12 so that it fits below 2^52, and its low 12 bits
     //          are OR-ed back into the low bits as sticky bits.  The value
     //          is then OR-ed into the mantissa of the double 2^52 and 2^52
     //          is subtracted, which yields the integer as an exact double.
     //          cvtsd2ss is the only rounding step; the sticky bits let it
     //          see whether any shifted-out bit was set.  Finally 12 is
     //          added to the float's exponent field to undo the shift.
  77 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
  78         movl            8(%esp),                %eax    // high 32 bits of input, used for the magnitude test
  79         movd            8(%esp),                %xmm1   // high 32 bits of input
  80         movd            4(%esp),                %xmm0   // low 32 bits of input
  81         punpckldq       %xmm1,                  %xmm0   // low 64 bits of xmm0 = full 64-bit input
  82
  83         calll           0f                              // call/pop pair: fetch the address of label 0 ...
  84 0:      popl            %ecx                            // ... into %ecx, the base used by TWOp52 / STICKY
  85         shrl            %eax                                    // high 31 bits of input as sint32
  86         addl            $0x7ff80000,    %eax            // sign bit becomes set iff input >= 2^52 (sum cannot wrap)
  87         sarl            $31,                    %eax    // (big input) ? -1 : 0
  88         movsd           STICKY,                 %xmm1   // (big input) ? 0xfff : 0
  89         movl            $12,                    %edx
  90         andl            %eax,                   %edx    // (big input) ? 12 : 0
  91         movd            %edx,                   %xmm3   // shift count, reused below as the exponent adjustment
  92         andpd           %xmm0,                  %xmm1   // (big input) ? input & 0xfff : 0
  93         movsd           TWOp52,                 %xmm2   // 0x1.0p52
  94         psrlq           %xmm3,                  %xmm0   // (big input) ? input >> 12 : input
  95         orpd            %xmm2,                  %xmm1   // 0x1.0p52 + ((big input) ? input & 0xfff : 0)
  96         orpd            %xmm1,                  %xmm0   // 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input); OR acts as + because the value is < 2^52
  97         subsd           %xmm2,                  %xmm0   // (double)((big input) ? (input >> 12 | input & 0xfff) : input)
  98         cvtsd2ss        %xmm0,                  %xmm0   // (float)((big input) ? (input >> 12 | input & 0xfff) : input)
  99         pslld           $23,                    %xmm3   // move the shift count (12 or 0) up to the float exponent field
 100         paddd           %xmm3,                  %xmm0   // (float)input
 101         movd            %xmm0,                  4(%esp) // spill the result bits over the argument slot
 102         flds            4(%esp)                         // load the result onto the x87 stack for the caller
 103         ret
 104 END_COMPILERRT_FUNCTION(__floatundisf)
 105
 106 #endif // __i386__
 107
 108 NO_EXEC_STACK_DIRECTIVE
 109