contrib/compiler-rt/lib/builtins/i386/floatundisf.S

   1 // This file is dual licensed under the MIT and the University of Illinois Open
   2 // Source Licenses. See LICENSE.TXT for details.
   3
   4 #include "../assembly.h"
   5
   6 // float __floatundisf(du_int a);
   7
   8 // Note that there is a hardware instruction, fildll, that does most of what
   9 // this function needs to do.  However, because of our ia32 ABI, it will take
  10 // a write-small read-large stall, so the software implementation here is
  11 // actually several cycles faster.
  12
  13 // This is a branch-free implementation.  A branchy implementation might be
  14 // faster for the common case if you know something a priori about the input
  15 // distribution.
  16
  17 /* branch-free x87 implementation - one cycle slower than without x87.
  18
  19 #ifdef __i386__
  20
  21 CONST_SECTION
  22 .balign 8
  23
  24                 .quad   0x43f0000000000000
  25 twop64: .quad   0x0000000000000000
  26
  27 #define                 TWOp64                  twop64-0b(%ecx,%eax,8)
  28
  29 .text
  30 .balign 4
  31 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
  32         movl            8(%esp),                %eax
  33         movd            8(%esp),                %xmm1
  34         movd            4(%esp),                %xmm0
  35         punpckldq       %xmm1,                  %xmm0
  36         calll           0f
  37 0:      popl            %ecx
  38         sarl            $31,                    %eax
  39         movq            %xmm0,                  4(%esp)
  40         fildll          4(%esp)
  41         faddl           TWOp64
  42         fstps           4(%esp)
  43         flds            4(%esp)
  44         ret
  45 END_COMPILERRT_FUNCTION(__floatundisf)
  46
  47 #endif // __i386__
  48
  49 */
  50
  51 /* branch-free, x87-free implementation - faster at the expense of code size */
  52
  53 #ifdef __i386__
  54
  55 CONST_SECTION
     // Read-only constants for __floatundisf.  They are addressed relative to
     // the local label `0:` inside the function (whose address the function
     // places in %ecx), through the TWOp52 and STICKY macros defined below.
     //
     // LAYOUT DEPENDENCE: STICKY indexes with %eax * 8 where %eax is 0 or -1,
     // so it reads either `sticky` or `sticky - 8`.  `sticky - 8` is the second
     // quad of `twop52` only because `twop52` is exactly 16 bytes long and
     // 16-byte aligned, which means the `.balign 16` before `sticky` inserts no
     // padding.  Do not reorder, resize, or insert data between these labels.
  56
  57         .balign 16
  58 twop52:
  59         .quad 0x4330000000000000        // bit pattern of the double 0x1.0p52
  60         .quad 0x0000000000000fff        // low-12-bit mask; read as STICKY when %eax == -1
  61
  62         .balign 16
  63 sticky:
  64         .quad 0x0000000000000000        // all-zero mask; read as STICKY when %eax == 0
     // NOTE(review): the function loads STICKY with a 64-bit movsd, so the
     // .long below is not read by it, and nothing in this file references
     // `twelve` (the shift count 12 is an immediate in the code).  Both look
     // like leftovers -- confirm before removing.
  65         .long 0x00000012
  66
  67         .balign 16
  68 twelve:
  69         .long 0x00000000
  70
     // PC-relative operands: `0b` is the `0:` label in __floatundisf and %ecx
     // holds its run-time address.  STICKY additionally uses %eax (0 or -1) as
     // an index scaled by 8 to choose between the two masks described above.
     // (Comments are kept off the #define lines so that they cannot become
     // part of the macro replacement text.)
  71 #define                 TWOp52                  twop52-0b(%ecx)
  72 #define                 STICKY                  sticky-0b(%ecx,%eax,8)
  73
  74 .text
  75 .balign 4
     // float __floatundisf(du_int a)
     //
     // Converts an unsigned 64-bit integer to single precision without
     // branches and without x87 arithmetic.
     //
     // In:    4(%esp) = low 32 bits of a, 8(%esp) = high 32 bits of a
     // Out:   result pushed on the x87 stack (flds)
     // Clobb: %eax, %ecx, %edx, %xmm0-%xmm3, flags; the argument slot at
     //        4(%esp) is overwritten with the float result.
     //
     // Method: "big input" below means a >= 2^52.
     //   1. If the input is big, shift it right by 12 and OR its low 12 bits
     //      back into the low bits of the shifted value, so that any discarded
     //      non-zero bits still take part in rounding (sticky bits).
     //   2. OR the (now < 2^52) integer into the mantissa of the double 2^52
     //      and subtract 2^52 to obtain the integer as a double.
     //   3. Round to float with cvtsd2ss, then add 12 to the float's exponent
     //      field if the input was big, undoing the shift from step 1.
  76 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
  77         movl            8(%esp),                %eax    // eax = high word of input
  78         movd            8(%esp),                %xmm1   // xmm1 = high word
  79         movd            4(%esp),                %xmm0   // xmm0 = low word
  80         punpckldq       %xmm1,                  %xmm0   // xmm0 = 64-bit input (high:low)
  81
  82         calll           0f                              // push address of label 0 ...
  83 0:      popl            %ecx                            // ... ecx = &0b, base for TWOp52/STICKY
  84         shrl            %eax                                    // high 31 bits of input as sint32
  85         addl            $0x7ff80000,    %eax            // sign bit set iff input >= 2^52
  86         sarl            $31,                    %eax    // (big input) ? -1 : 0
  87         movsd           STICKY,                 %xmm1   // (big input) ? 0xfff : 0
  88         movl            $12,                    %edx
  89         andl            %eax,                   %edx    // (big input) ? 12 : 0
  90         movd            %edx,                   %xmm3   // xmm3 = shift count (12 or 0)
  91         andpd           %xmm0,                  %xmm1   // (big input) ? input & 0xfff : 0
  92         movsd           TWOp52,                 %xmm2   // 0x1.0p52
  93         psrlq           %xmm3,                  %xmm0   // (big input) ? input >> 12 : input
  94         orpd            %xmm2,                  %xmm1   // 0x1.0p52 + ((big input) ? input & 0xfff : 0)
  95         orpd            %xmm1,                  %xmm0   // 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
  96         subsd           %xmm2,                  %xmm0   // (double)((big input) ? (input >> 12 | input & 0xfff) : input)
  97         cvtsd2ss        %xmm0,                  %xmm0   // (float)((big input) ? (input >> 12 | input & 0xfff) : input)
  98         pslld           $23,                    %xmm3   // move 12 (or 0) into the float exponent field
  99         paddd           %xmm3,                  %xmm0   // (float)input
 100         movd            %xmm0,                  4(%esp) // spill result into the argument slot
 101         flds            4(%esp)                         // load result onto the x87 stack
 102         ret
 103 END_COMPILERRT_FUNCTION(__floatundisf)
 104
 105 #endif // __i386__
 106
 107 NO_EXEC_STACK_DIRECTIVE
 108