contrib/compiler-rt/lib/i386/floatundisf.S

   1 // This file is dual licensed under the MIT and the University of Illinois Open
   2 // Source Licenses. See LICENSE.TXT for details.
   3
   4 #include "../assembly.h"
   5
   6 // float __floatundisf(du_int a);
   7
   8 // Note that there is a hardware instruction, fildll, that does most of what
   9 // this function needs to do.  However, because of our ia32 ABI, it will take
  10 // a write-small read-large stall, so the software implementation here is
  11 // actually several cycles faster.
  12
  13 // This is a branch-free implementation.  A branchy implementation might be
  14 // faster for the common case if you know something a priori about the input
  15 // distribution.
  16
  17 /* Disabled alternative: branch-free x87 implementation - one cycle slower than the x87-free version below.
  18
  19 #ifdef __i386__
  20
  21 .const
  22 .align 3
  23
  24                 .quad   0x43f0000000000000
  25 twop64: .quad   0x0000000000000000
  26
  27 #define                 TWOp64                  twop64-0b(%ecx,%eax,8)
  28
  29 .text
  30 .align 4
  31 DEFINE_COMPILERRT_FUNCTION(__floatundisf)
  32         movl            8(%esp),                %eax
  33         movd            8(%esp),                %xmm1
  34         movd            4(%esp),                %xmm0
  35         punpckldq       %xmm1,                  %xmm0
  36         calll           0f
  37 0:      popl            %ecx
  38         sarl            $31,                    %eax
  39         movq            %xmm0,                  4(%esp)
  40         fildll          4(%esp)
  41         faddl           TWOp64
  42         fstps           4(%esp)
  43         flds            4(%esp)
  44         ret
  45
  46 #endif // __i386__
  47
  48 */
  49
  50 /* branch-free, x87-free implementation - faster at the expense of code size */
  51
  52 #ifdef __i386__
  53
  54 #ifndef __ELF__
  55 .const                                  // non-ELF targets: emit the table into the const section
  56 .align 3                                // presumably a power-of-two exponent here (8 bytes), matching the ELF branch - confirm for the assembler in use
  57 #else
  58 .align 8                                // ELF: no section switch, so the table lands in whatever section is current
  59 #endif
  60 twop52: .quad 0x4330000000000000        // bit pattern of the double 0x1.0p52; loaded through TWOp52
  61                 .quad 0x0000000000000fff        // lives at sticky-8: the mask STICKY selects when %eax == -1
  62 sticky: .quad 0x0000000000000000        // lives at sticky+0: the mask STICKY selects when %eax == 0
  63                 .long 0x00000012        // NOTE(review): not referenced by the visible code; 0x12 is 18 decimal, not 12 - confirm intent
  64 twelve: .long 0x00000000        // NOTE(review): label not referenced by the visible code (the function uses an immediate $12 instead)
  65 // Both macros address the table relative to local label 0 in the function, whose address the code keeps in %ecx; STICKY also adds %eax*8.
  66 #define                 TWOp52                  twop52-0b(%ecx)
  67 #define                 STICKY                  sticky-0b(%ecx,%eax,8)
  68
  69 .text
  70 .align 4
  71 DEFINE_COMPILERRT_FUNCTION(__floatundisf)       // float __floatundisf(du_int a): 64-bit input read from 4(%esp)/8(%esp), float result left in st(0)
  72         movl            8(%esp),                %eax    // high 32 bits of input, used below to classify its magnitude
  73         movd            8(%esp),                %xmm1   // high 32 bits of input
  74         movd            4(%esp),                %xmm0   // low 32 bits of input
  75         punpckldq       %xmm1,                  %xmm0   // low qword of xmm0 = the full 64-bit input
  76
  77         calll           0f                              // call/pop pair: fetch our own address for position-independent data access
  78 0:      popl            %ecx                            // ecx = address of label 0, the base used by TWOp52 and STICKY
  79         shrl            %eax                                    // high 31 bits of input as sint32
  80         addl            $0x7ff80000,    %eax            // sign bit becomes set iff input >= 2^52 ("big input")
  81         sarl            $31,                    %eax    // (big input) ? -1 : 0
  82         movsd           STICKY,                 %xmm1   // (big input) ? 0xfff : 0
  83         movl            $12,                    %edx    // shift amount applied to big inputs
  84         andl            %eax,                   %edx    // (big input) ? 12 : 0
  85         movd            %edx,                   %xmm3   // xmm3 = shift count; reused later for the exponent fix-up
  86         andpd           %xmm0,                  %xmm1   // (big input) ? input & 0xfff : 0
  87         movsd           TWOp52,                 %xmm2   // 0x1.0p52
  88         psrlq           %xmm3,                  %xmm0   // (big input) ? input >> 12 : input
  89         orpd            %xmm2,                  %xmm1   // 0x1.0p52 | ((big input) ? input & 0xfff : 0)
  90         orpd            %xmm1,                  %xmm0   // 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
  91         subsd           %xmm2,                  %xmm0   // (double)((big input) ? (input >> 12 | input & 0xfff) : input)
  92         cvtsd2ss        %xmm0,                  %xmm0   // (float)((big input) ? (input >> 12 | input & 0xfff) : input)
  93         pslld           $23,                    %xmm3   // (big input) ? 12 << 23 : 0, i.e. 12 positioned at the float exponent field
  94         paddd           %xmm3,                  %xmm0   // (float)input: adding 12 to the exponent undoes the >> 12
  95         movd            %xmm0,                  4(%esp) // spill the float bits over the incoming argument slot
  96         flds            4(%esp)                         // reload them onto the x87 stack as the return value
  97         ret
  98
  99 #endif // __i386__