1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
25 ===============================================================================
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
51 ===============================================================================
54 #include <sys/cdefs.h>
55 __FBSDID("$FreeBSD$");
57 #ifdef SOFTFLOAT_FOR_GCC
58 #include "softfloat-for-gcc.h"
62 #include "softfloat.h"
65 * Conversions between floats as stored in memory and floats as
68 #ifndef FLOAT64_DEMANGLE
69 #define FLOAT64_DEMANGLE(a) (a)
71 #ifndef FLOAT64_MANGLE
72 #define FLOAT64_MANGLE(a) (a)
76 -------------------------------------------------------------------------------
77 Floating-point rounding mode and exception flags.
78 -------------------------------------------------------------------------------
80 fp_rnd_t float_rounding_mode = float_round_nearest_even;
81 fp_except float_exception_flags = 0;
84 -------------------------------------------------------------------------------
85 Primitive arithmetic functions, including multi-word arithmetic, and
86 division and square root approximations. (Can be specialized to target if
88 -------------------------------------------------------------------------------
90 #include "softfloat-macros"
93 -------------------------------------------------------------------------------
94 Functions and definitions to determine: (1) whether tininess for underflow
95 is detected before or after rounding by default, (2) what (if anything)
96 happens when exceptions are raised, (3) how signaling NaNs are distinguished
97 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
98 are propagated from function inputs to output. These details are target-
100 -------------------------------------------------------------------------------
102 #include "softfloat-specialize"
105 -------------------------------------------------------------------------------
106 Returns the fraction bits of the single-precision floating-point value `a'.
107 -------------------------------------------------------------------------------
109 INLINE bits32 extractFloat32Frac( float32 a )
112 return a & 0x007FFFFF;
117 -------------------------------------------------------------------------------
118 Returns the exponent bits of the single-precision floating-point value `a'.
119 -------------------------------------------------------------------------------
121 INLINE int16 extractFloat32Exp( float32 a )
124 return ( a>>23 ) & 0xFF;
129 -------------------------------------------------------------------------------
130 Returns the sign bit of the single-precision floating-point value `a'.
131 -------------------------------------------------------------------------------
133 INLINE flag extractFloat32Sign( float32 a )
141 -------------------------------------------------------------------------------
142 Normalizes the subnormal single-precision floating-point value represented
143 by the denormalized significand `aSig'. The normalized exponent and
144 significand are stored at the locations pointed to by `zExpPtr' and
145 `zSigPtr', respectively.
146 -------------------------------------------------------------------------------
149 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
153 shiftCount = countLeadingZeros32( aSig ) - 8;
154 *zSigPtr = aSig<<shiftCount;
155 *zExpPtr = 1 - shiftCount;
160 -------------------------------------------------------------------------------
161 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
162 single-precision floating-point value, returning the result. After being
163 shifted into the proper positions, the three fields are simply added
164 together to form the result. This means that any integer portion of `zSig'
165 will be added into the exponent. Since a properly normalized significand
166 will have an integer portion equal to 1, the `zExp' input should be 1 less
167 than the desired result exponent whenever `zSig' is a complete, normalized
169 -------------------------------------------------------------------------------
171 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
174 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
179 -------------------------------------------------------------------------------
180 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
181 and significand `zSig', and returns the proper single-precision floating-
182 point value corresponding to the abstract input. Ordinarily, the abstract
183 value is simply rounded and packed into the single-precision format, with
184 the inexact exception raised if the abstract input cannot be represented
185 exactly. However, if the abstract value is too large, the overflow and
186 inexact exceptions are raised and an infinity or maximal finite value is
187 returned. If the abstract value is too small, the input value is rounded to
188 a subnormal number, and the underflow and inexact exceptions are raised if
189 the abstract input cannot be represented exactly as a subnormal single-
190 precision floating-point number.
191 The input significand `zSig' has its binary point between bits 30
192 and 29, which is 7 bits to the left of the usual location. This shifted
193 significand must be normalized or smaller. If `zSig' is not normalized,
194 `zExp' must be 0; in that case, the result returned is a subnormal number,
195 and it must not require rounding. In the usual case that `zSig' is
196 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
197 The handling of underflow and overflow follows the IEC/IEEE Standard for
198 Binary Floating-Point Arithmetic.
199 -------------------------------------------------------------------------------
201 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
204 flag roundNearestEven;
205 int8 roundIncrement, roundBits;
208 roundingMode = float_rounding_mode;
209 roundNearestEven = roundingMode == float_round_nearest_even;
210 roundIncrement = 0x40;
211 if ( ! roundNearestEven ) {
212 if ( roundingMode == float_round_to_zero ) {
216 roundIncrement = 0x7F;
218 if ( roundingMode == float_round_up ) roundIncrement = 0;
221 if ( roundingMode == float_round_down ) roundIncrement = 0;
225 roundBits = zSig & 0x7F;
226 if ( 0xFD <= (bits16) zExp ) {
228 || ( ( zExp == 0xFD )
229 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
231 float_raise( float_flag_overflow | float_flag_inexact );
232 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
236 ( float_detect_tininess == float_tininess_before_rounding )
238 || ( zSig + roundIncrement < 0x80000000 );
239 shift32RightJamming( zSig, - zExp, &zSig );
241 roundBits = zSig & 0x7F;
242 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
245 if ( roundBits ) float_exception_flags |= float_flag_inexact;
246 zSig = ( zSig + roundIncrement )>>7;
247 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
248 if ( zSig == 0 ) zExp = 0;
249 return packFloat32( zSign, zExp, zSig );
254 -------------------------------------------------------------------------------
255 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
256 and significand `zSig', and returns the proper single-precision floating-
257 point value corresponding to the abstract input. This routine is just like
258 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
259 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
260 floating-point exponent.
261 -------------------------------------------------------------------------------
264 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
268 shiftCount = countLeadingZeros32( zSig ) - 1;
269 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
274 -------------------------------------------------------------------------------
275 Returns the least-significant 32 fraction bits of the double-precision
276 floating-point value `a'.
277 -------------------------------------------------------------------------------
279 INLINE bits32 extractFloat64Frac1( float64 a )
282 return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF );
287 -------------------------------------------------------------------------------
288 Returns the most-significant 20 fraction bits of the double-precision
289 floating-point value `a'.
290 -------------------------------------------------------------------------------
292 INLINE bits32 extractFloat64Frac0( float64 a )
295 return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF;
300 -------------------------------------------------------------------------------
301 Returns the exponent bits of the double-precision floating-point value `a'.
302 -------------------------------------------------------------------------------
304 INLINE int16 extractFloat64Exp( float64 a )
307 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
312 -------------------------------------------------------------------------------
313 Returns the sign bit of the double-precision floating-point value `a'.
314 -------------------------------------------------------------------------------
316 INLINE flag extractFloat64Sign( float64 a )
319 return FLOAT64_DEMANGLE(a)>>63;
324 -------------------------------------------------------------------------------
325 Normalizes the subnormal double-precision floating-point value represented
326 by the denormalized significand formed by the concatenation of `aSig0' and
327 `aSig1'. The normalized exponent is stored at the location pointed to by
328 `zExpPtr'. The most significant 21 bits of the normalized significand are
329 stored at the location pointed to by `zSig0Ptr', and the least significant
330 32 bits of the normalized significand are stored at the location pointed to
332 -------------------------------------------------------------------------------
335 normalizeFloat64Subnormal(
346 shiftCount = countLeadingZeros32( aSig1 ) - 11;
347 if ( shiftCount < 0 ) {
348 *zSig0Ptr = aSig1>>( - shiftCount );
349 *zSig1Ptr = aSig1<<( shiftCount & 31 );
352 *zSig0Ptr = aSig1<<shiftCount;
355 *zExpPtr = - shiftCount - 31;
358 shiftCount = countLeadingZeros32( aSig0 ) - 11;
359 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
360 *zExpPtr = 1 - shiftCount;
366 -------------------------------------------------------------------------------
367 Packs the sign `zSign', the exponent `zExp', and the significand formed by
368 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
369 point value, returning the result. After being shifted into the proper
370 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
371 together to form the most significant 32 bits of the result. This means
372 that any integer portion of `zSig0' will be added into the exponent. Since
373 a properly normalized significand will have an integer portion equal to 1,
374 the `zExp' input should be 1 less than the desired result exponent whenever
375 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
376 -------------------------------------------------------------------------------
379 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
382 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
383 ( ( (bits64) zExp )<<52 ) +
384 ( ( (bits64) zSig0 )<<32 ) + zSig1 );
390 -------------------------------------------------------------------------------
391 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
392 and extended significand formed by the concatenation of `zSig0', `zSig1',
393 and `zSig2', and returns the proper double-precision floating-point value
394 corresponding to the abstract input. Ordinarily, the abstract value is
395 simply rounded and packed into the double-precision format, with the inexact
396 exception raised if the abstract input cannot be represented exactly.
397 However, if the abstract value is too large, the overflow and inexact
398 exceptions are raised and an infinity or maximal finite value is returned.
399 If the abstract value is too small, the input value is rounded to a
400 subnormal number, and the underflow and inexact exceptions are raised if the
401 abstract input cannot be represented exactly as a subnormal double-precision
402 floating-point number.
403 The input significand must be normalized or smaller. If the input
404 significand is not normalized, `zExp' must be 0; in that case, the result
405 returned is a subnormal number, and it must not require rounding. In the
406 usual case that the input significand is normalized, `zExp' must be 1 less
407 than the ``true'' floating-point exponent. The handling of underflow and
408 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
409 -------------------------------------------------------------------------------
413 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
416 flag roundNearestEven, increment, isTiny;
418 roundingMode = float_rounding_mode;
419 roundNearestEven = ( roundingMode == float_round_nearest_even );
420 increment = ( (sbits32) zSig2 < 0 );
421 if ( ! roundNearestEven ) {
422 if ( roundingMode == float_round_to_zero ) {
427 increment = ( roundingMode == float_round_down ) && zSig2;
430 increment = ( roundingMode == float_round_up ) && zSig2;
434 if ( 0x7FD <= (bits16) zExp ) {
435 if ( ( 0x7FD < zExp )
436 || ( ( zExp == 0x7FD )
437 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
441 float_raise( float_flag_overflow | float_flag_inexact );
442 if ( ( roundingMode == float_round_to_zero )
443 || ( zSign && ( roundingMode == float_round_up ) )
444 || ( ! zSign && ( roundingMode == float_round_down ) )
446 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
448 return packFloat64( zSign, 0x7FF, 0, 0 );
452 ( float_detect_tininess == float_tininess_before_rounding )
455 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
456 shift64ExtraRightJamming(
457 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
459 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
460 if ( roundNearestEven ) {
461 increment = ( (sbits32) zSig2 < 0 );
465 increment = ( roundingMode == float_round_down ) && zSig2;
468 increment = ( roundingMode == float_round_up ) && zSig2;
473 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
475 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
476 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
479 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
481 return packFloat64( zSign, zExp, zSig0, zSig1 );
486 -------------------------------------------------------------------------------
487 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
488 and significand formed by the concatenation of `zSig0' and `zSig1', and
489 returns the proper double-precision floating-point value corresponding
490 to the abstract input. This routine is just like `roundAndPackFloat64'
491 except that the input significand has fewer bits and does not have to be
492 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
494 -------------------------------------------------------------------------------
497 normalizeRoundAndPackFloat64(
498 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
508 shiftCount = countLeadingZeros32( zSig0 ) - 11;
509 if ( 0 <= shiftCount ) {
511 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
514 shift64ExtraRightJamming(
515 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
518 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
523 -------------------------------------------------------------------------------
524 Returns the result of converting the 32-bit two's complement integer `a' to
525 the single-precision floating-point format. The conversion is performed
526 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
527 -------------------------------------------------------------------------------
529 float32 int32_to_float32( int32 a )
533 if ( a == 0 ) return 0;
534 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
536 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
541 -------------------------------------------------------------------------------
542 Returns the result of converting the 32-bit two's complement integer `a' to
543 the double-precision floating-point format. The conversion is performed
544 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
545 -------------------------------------------------------------------------------
547 float64 int32_to_float64( int32 a )
554 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
556 absA = zSign ? - a : a;
557 shiftCount = countLeadingZeros32( absA ) - 11;
558 if ( 0 <= shiftCount ) {
559 zSig0 = absA<<shiftCount;
563 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
565 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
569 #ifndef SOFTFLOAT_FOR_GCC
571 -------------------------------------------------------------------------------
572 Returns the result of converting the single-precision floating-point value
573 `a' to the 32-bit two's complement integer format. The conversion is
574 performed according to the IEC/IEEE Standard for Binary Floating-Point
575 Arithmetic---which means in particular that the conversion is rounded
576 according to the current rounding mode. If `a' is a NaN, the largest
577 positive integer is returned. Otherwise, if the conversion overflows, the
578 largest integer with the same sign as `a' is returned.
579 -------------------------------------------------------------------------------
581 int32 float32_to_int32( float32 a )
584 int16 aExp, shiftCount;
585 bits32 aSig, aSigExtra;
589 aSig = extractFloat32Frac( a );
590 aExp = extractFloat32Exp( a );
591 aSign = extractFloat32Sign( a );
592 shiftCount = aExp - 0x96;
593 if ( 0 <= shiftCount ) {
594 if ( 0x9E <= aExp ) {
595 if ( a != 0xCF000000 ) {
596 float_raise( float_flag_invalid );
597 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
601 return (sbits32) 0x80000000;
603 z = ( aSig | 0x00800000 )<<shiftCount;
604 if ( aSign ) z = - z;
608 aSigExtra = aExp | aSig;
613 aSigExtra = aSig<<( shiftCount & 31 );
614 z = aSig>>( - shiftCount );
616 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
617 roundingMode = float_rounding_mode;
618 if ( roundingMode == float_round_nearest_even ) {
619 if ( (sbits32) aSigExtra < 0 ) {
621 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
623 if ( aSign ) z = - z;
626 aSigExtra = ( aSigExtra != 0 );
628 z += ( roundingMode == float_round_down ) & aSigExtra;
632 z += ( roundingMode == float_round_up ) & aSigExtra;
642 -------------------------------------------------------------------------------
643 Returns the result of converting the single-precision floating-point value
644 `a' to the 32-bit two's complement integer format. The conversion is
645 performed according to the IEC/IEEE Standard for Binary Floating-Point
646 Arithmetic, except that the conversion is always rounded toward zero.
647 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
648 the conversion overflows, the largest integer with the same sign as `a' is
650 -------------------------------------------------------------------------------
652 int32 float32_to_int32_round_to_zero( float32 a )
655 int16 aExp, shiftCount;
659 aSig = extractFloat32Frac( a );
660 aExp = extractFloat32Exp( a );
661 aSign = extractFloat32Sign( a );
662 shiftCount = aExp - 0x9E;
663 if ( 0 <= shiftCount ) {
664 if ( a != 0xCF000000 ) {
665 float_raise( float_flag_invalid );
666 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
668 return (sbits32) 0x80000000;
670 else if ( aExp <= 0x7E ) {
671 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
674 aSig = ( aSig | 0x00800000 )<<8;
675 z = aSig>>( - shiftCount );
676 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
677 float_exception_flags |= float_flag_inexact;
679 if ( aSign ) z = - z;
685 -------------------------------------------------------------------------------
686 Returns the result of converting the single-precision floating-point value
687 `a' to the double-precision floating-point format. The conversion is
688 performed according to the IEC/IEEE Standard for Binary Floating-Point
690 -------------------------------------------------------------------------------
692 float64 float32_to_float64( float32 a )
696 bits32 aSig, zSig0, zSig1;
698 aSig = extractFloat32Frac( a );
699 aExp = extractFloat32Exp( a );
700 aSign = extractFloat32Sign( a );
701 if ( aExp == 0xFF ) {
702 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
703 return packFloat64( aSign, 0x7FF, 0, 0 );
706 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
707 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
710 shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
711 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
715 #ifndef SOFTFLOAT_FOR_GCC
717 -------------------------------------------------------------------------------
718 Rounds the single-precision floating-point value `a' to an integer,
719 and returns the result as a single-precision floating-point value. The
720 operation is performed according to the IEC/IEEE Standard for Binary
721 Floating-Point Arithmetic.
722 -------------------------------------------------------------------------------
724 float32 float32_round_to_int( float32 a )
728 bits32 lastBitMask, roundBitsMask;
732 aExp = extractFloat32Exp( a );
733 if ( 0x96 <= aExp ) {
734 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
735 return propagateFloat32NaN( a, a );
739 if ( aExp <= 0x7E ) {
740 if ( (bits32) ( a<<1 ) == 0 ) return a;
741 float_exception_flags |= float_flag_inexact;
742 aSign = extractFloat32Sign( a );
743 switch ( float_rounding_mode ) {
744 case float_round_nearest_even:
745 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
746 return packFloat32( aSign, 0x7F, 0 );
749 case float_round_to_zero:
751 case float_round_down:
752 return aSign ? 0xBF800000 : 0;
754 return aSign ? 0x80000000 : 0x3F800000;
756 return packFloat32( aSign, 0, 0 );
759 lastBitMask <<= 0x96 - aExp;
760 roundBitsMask = lastBitMask - 1;
762 roundingMode = float_rounding_mode;
763 if ( roundingMode == float_round_nearest_even ) {
765 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
767 else if ( roundingMode != float_round_to_zero ) {
768 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
772 z &= ~ roundBitsMask;
773 if ( z != a ) float_exception_flags |= float_flag_inexact;
780 -------------------------------------------------------------------------------
781 Returns the result of adding the absolute values of the single-precision
782 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
783 before being returned. `zSign' is ignored if the result is a NaN.
784 The addition is performed according to the IEC/IEEE Standard for Binary
785 Floating-Point Arithmetic.
786 -------------------------------------------------------------------------------
788 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
790 int16 aExp, bExp, zExp;
791 bits32 aSig, bSig, zSig;
794 aSig = extractFloat32Frac( a );
795 aExp = extractFloat32Exp( a );
796 bSig = extractFloat32Frac( b );
797 bExp = extractFloat32Exp( b );
798 expDiff = aExp - bExp;
802 if ( aExp == 0xFF ) {
803 if ( aSig ) return propagateFloat32NaN( a, b );
812 shift32RightJamming( bSig, expDiff, &bSig );
815 else if ( expDiff < 0 ) {
816 if ( bExp == 0xFF ) {
817 if ( bSig ) return propagateFloat32NaN( a, b );
818 return packFloat32( zSign, 0xFF, 0 );
826 shift32RightJamming( aSig, - expDiff, &aSig );
830 if ( aExp == 0xFF ) {
831 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
834 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
835 zSig = 0x40000000 + aSig + bSig;
840 zSig = ( aSig + bSig )<<1;
842 if ( (sbits32) zSig < 0 ) {
847 return roundAndPackFloat32( zSign, zExp, zSig );
852 -------------------------------------------------------------------------------
853 Returns the result of subtracting the absolute values of the single-
854 precision floating-point values `a' and `b'. If `zSign' is 1, the
855 difference is negated before being returned. `zSign' is ignored if the
856 result is a NaN. The subtraction is performed according to the IEC/IEEE
857 Standard for Binary Floating-Point Arithmetic.
858 -------------------------------------------------------------------------------
860 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
862 int16 aExp, bExp, zExp;
863 bits32 aSig, bSig, zSig;
866 aSig = extractFloat32Frac( a );
867 aExp = extractFloat32Exp( a );
868 bSig = extractFloat32Frac( b );
869 bExp = extractFloat32Exp( b );
870 expDiff = aExp - bExp;
873 if ( 0 < expDiff ) goto aExpBigger;
874 if ( expDiff < 0 ) goto bExpBigger;
875 if ( aExp == 0xFF ) {
876 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
877 float_raise( float_flag_invalid );
878 return float32_default_nan;
884 if ( bSig < aSig ) goto aBigger;
885 if ( aSig < bSig ) goto bBigger;
886 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
888 if ( bExp == 0xFF ) {
889 if ( bSig ) return propagateFloat32NaN( a, b );
890 return packFloat32( zSign ^ 1, 0xFF, 0 );
898 shift32RightJamming( aSig, - expDiff, &aSig );
904 goto normalizeRoundAndPack;
906 if ( aExp == 0xFF ) {
907 if ( aSig ) return propagateFloat32NaN( a, b );
916 shift32RightJamming( bSig, expDiff, &bSig );
921 normalizeRoundAndPack:
923 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
928 -------------------------------------------------------------------------------
929 Returns the result of adding the single-precision floating-point values `a'
930 and `b'. The operation is performed according to the IEC/IEEE Standard for
931 Binary Floating-Point Arithmetic.
932 -------------------------------------------------------------------------------
934 float32 float32_add( float32 a, float32 b )
938 aSign = extractFloat32Sign( a );
939 bSign = extractFloat32Sign( b );
940 if ( aSign == bSign ) {
941 return addFloat32Sigs( a, b, aSign );
944 return subFloat32Sigs( a, b, aSign );
950 -------------------------------------------------------------------------------
951 Returns the result of subtracting the single-precision floating-point values
952 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
953 for Binary Floating-Point Arithmetic.
954 -------------------------------------------------------------------------------
956 float32 float32_sub( float32 a, float32 b )
960 aSign = extractFloat32Sign( a );
961 bSign = extractFloat32Sign( b );
962 if ( aSign == bSign ) {
963 return subFloat32Sigs( a, b, aSign );
966 return addFloat32Sigs( a, b, aSign );
972 -------------------------------------------------------------------------------
973 Returns the result of multiplying the single-precision floating-point values
974 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
975 for Binary Floating-Point Arithmetic.
976 -------------------------------------------------------------------------------
978 float32 float32_mul( float32 a, float32 b )
980 flag aSign, bSign, zSign;
981 int16 aExp, bExp, zExp;
982 bits32 aSig, bSig, zSig0, zSig1;
984 aSig = extractFloat32Frac( a );
985 aExp = extractFloat32Exp( a );
986 aSign = extractFloat32Sign( a );
987 bSig = extractFloat32Frac( b );
988 bExp = extractFloat32Exp( b );
989 bSign = extractFloat32Sign( b );
990 zSign = aSign ^ bSign;
991 if ( aExp == 0xFF ) {
992 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
993 return propagateFloat32NaN( a, b );
995 if ( ( bExp | bSig ) == 0 ) {
996 float_raise( float_flag_invalid );
997 return float32_default_nan;
999 return packFloat32( zSign, 0xFF, 0 );
1001 if ( bExp == 0xFF ) {
1002 if ( bSig ) return propagateFloat32NaN( a, b );
1003 if ( ( aExp | aSig ) == 0 ) {
1004 float_raise( float_flag_invalid );
1005 return float32_default_nan;
1007 return packFloat32( zSign, 0xFF, 0 );
1010 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1011 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1014 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1015 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1017 zExp = aExp + bExp - 0x7F;
1018 aSig = ( aSig | 0x00800000 )<<7;
1019 bSig = ( bSig | 0x00800000 )<<8;
1020 mul32To64( aSig, bSig, &zSig0, &zSig1 );
1021 zSig0 |= ( zSig1 != 0 );
1022 if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1026 return roundAndPackFloat32( zSign, zExp, zSig0 );
1031 -------------------------------------------------------------------------------
1032 Returns the result of dividing the single-precision floating-point value `a'
1033 by the corresponding value `b'. The operation is performed according to the
1034 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1035 -------------------------------------------------------------------------------
1037 float32 float32_div( float32 a, float32 b )
1039 flag aSign, bSign, zSign;
1040 int16 aExp, bExp, zExp;
1041 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1043 aSig = extractFloat32Frac( a );
1044 aExp = extractFloat32Exp( a );
1045 aSign = extractFloat32Sign( a );
1046 bSig = extractFloat32Frac( b );
1047 bExp = extractFloat32Exp( b );
1048 bSign = extractFloat32Sign( b );
1049 zSign = aSign ^ bSign;
1050 if ( aExp == 0xFF ) {
1051 if ( aSig ) return propagateFloat32NaN( a, b );
1052 if ( bExp == 0xFF ) {
1053 if ( bSig ) return propagateFloat32NaN( a, b );
1054 float_raise( float_flag_invalid );
1055 return float32_default_nan;
1057 return packFloat32( zSign, 0xFF, 0 );
1059 if ( bExp == 0xFF ) {
1060 if ( bSig ) return propagateFloat32NaN( a, b );
1061 return packFloat32( zSign, 0, 0 );
1065 if ( ( aExp | aSig ) == 0 ) {
1066 float_raise( float_flag_invalid );
1067 return float32_default_nan;
1069 float_raise( float_flag_divbyzero );
1070 return packFloat32( zSign, 0xFF, 0 );
1072 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1075 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1076 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1078 zExp = aExp - bExp + 0x7D;
1079 aSig = ( aSig | 0x00800000 )<<7;
1080 bSig = ( bSig | 0x00800000 )<<8;
1081 if ( bSig <= ( aSig + aSig ) ) {
1085 zSig = estimateDiv64To32( aSig, 0, bSig );
1086 if ( ( zSig & 0x3F ) <= 2 ) {
1087 mul32To64( bSig, zSig, &term0, &term1 );
1088 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1089 while ( (sbits32) rem0 < 0 ) {
1091 add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1093 zSig |= ( rem1 != 0 );
1095 return roundAndPackFloat32( zSign, zExp, zSig );
1099 #ifndef SOFTFLOAT_FOR_GCC
1101 -------------------------------------------------------------------------------
1102 Returns the remainder of the single-precision floating-point value `a'
1103 with respect to the corresponding value `b'. The operation is performed
1104 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1105 -------------------------------------------------------------------------------
1107 float32 float32_rem( float32 a, float32 b )
1109 flag aSign, bSign, zSign;
1110 int16 aExp, bExp, expDiff;
1111 bits32 aSig, bSig, q, allZero, alternateASig;
1114 aSig = extractFloat32Frac( a );
1115 aExp = extractFloat32Exp( a );
1116 aSign = extractFloat32Sign( a );
1117 bSig = extractFloat32Frac( b );
1118 bExp = extractFloat32Exp( b );
1119 bSign = extractFloat32Sign( b );
1120 if ( aExp == 0xFF ) {
1121 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1122 return propagateFloat32NaN( a, b );
1124 float_raise( float_flag_invalid );
1125 return float32_default_nan;
1127 if ( bExp == 0xFF ) {
1128 if ( bSig ) return propagateFloat32NaN( a, b );
1133 float_raise( float_flag_invalid );
1134 return float32_default_nan;
1136 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1139 if ( aSig == 0 ) return a;
1140 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1142 expDiff = aExp - bExp;
1143 aSig = ( aSig | 0x00800000 )<<8;
1144 bSig = ( bSig | 0x00800000 )<<8;
1145 if ( expDiff < 0 ) {
1146 if ( expDiff < -1 ) return a;
1149 q = ( bSig <= aSig );
1150 if ( q ) aSig -= bSig;
1152 while ( 0 < expDiff ) {
1153 q = estimateDiv64To32( aSig, 0, bSig );
1154 q = ( 2 < q ) ? q - 2 : 0;
1155 aSig = - ( ( bSig>>2 ) * q );
1159 if ( 0 < expDiff ) {
1160 q = estimateDiv64To32( aSig, 0, bSig );
1161 q = ( 2 < q ) ? q - 2 : 0;
1164 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1171 alternateASig = aSig;
1174 } while ( 0 <= (sbits32) aSig );
1175 sigMean = aSig + alternateASig;
1176 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1177 aSig = alternateASig;
1179 zSign = ( (sbits32) aSig < 0 );
1180 if ( zSign ) aSig = - aSig;
1181 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1186 #ifndef SOFTFLOAT_FOR_GCC
1188 -------------------------------------------------------------------------------
1189 Returns the square root of the single-precision floating-point value `a'.
1190 The operation is performed according to the IEC/IEEE Standard for Binary
1191 Floating-Point Arithmetic.
1192 -------------------------------------------------------------------------------
1194 float32 float32_sqrt( float32 a )
1198 bits32 aSig, zSig, rem0, rem1, term0, term1;
1200 aSig = extractFloat32Frac( a );
1201 aExp = extractFloat32Exp( a );
1202 aSign = extractFloat32Sign( a );
1203 if ( aExp == 0xFF ) {
1204 if ( aSig ) return propagateFloat32NaN( a, 0 );
1205 if ( ! aSign ) return a;
1206 float_raise( float_flag_invalid );
1207 return float32_default_nan;
1210 if ( ( aExp | aSig ) == 0 ) return a;
1211 float_raise( float_flag_invalid );
1212 return float32_default_nan;
1215 if ( aSig == 0 ) return 0;
1216 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1218 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1219 aSig = ( aSig | 0x00800000 )<<8;
1220 zSig = estimateSqrt32( aExp, aSig ) + 2;
1221 if ( ( zSig & 0x7F ) <= 5 ) {
1228 mul32To64( zSig, zSig, &term0, &term1 );
1229 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1230 while ( (sbits32) rem0 < 0 ) {
1232 shortShift64Left( 0, zSig, 1, &term0, &term1 );
1234 add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1236 zSig |= ( ( rem0 | rem1 ) != 0 );
1239 shift32RightJamming( zSig, 1, &zSig );
1241 return roundAndPackFloat32( 0, zExp, zSig );
1247 -------------------------------------------------------------------------------
1248 Returns 1 if the single-precision floating-point value `a' is equal to
1249 the corresponding value `b', and 0 otherwise. The comparison is performed
1250 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1251 -------------------------------------------------------------------------------
1253 flag float32_eq( float32 a, float32 b )
1256 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1257 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1259 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1260 float_raise( float_flag_invalid );
1264 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1269 -------------------------------------------------------------------------------
1270 Returns 1 if the single-precision floating-point value `a' is less than
1271 or equal to the corresponding value `b', and 0 otherwise. The comparison
1272 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1274 -------------------------------------------------------------------------------
1276 flag float32_le( float32 a, float32 b )
1280 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1281 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1283 float_raise( float_flag_invalid );
1286 aSign = extractFloat32Sign( a );
1287 bSign = extractFloat32Sign( b );
1288 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1289 return ( a == b ) || ( aSign ^ ( a < b ) );
1294 -------------------------------------------------------------------------------
1295 Returns 1 if the single-precision floating-point value `a' is less than
1296 the corresponding value `b', and 0 otherwise. The comparison is performed
1297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1298 -------------------------------------------------------------------------------
1300 flag float32_lt( float32 a, float32 b )
1304 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1305 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1307 float_raise( float_flag_invalid );
1310 aSign = extractFloat32Sign( a );
1311 bSign = extractFloat32Sign( b );
1312 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1313 return ( a != b ) && ( aSign ^ ( a < b ) );
1317 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1319 -------------------------------------------------------------------------------
1320 Returns 1 if the single-precision floating-point value `a' is equal to
1321 the corresponding value `b', and 0 otherwise. The invalid exception is
1322 raised if either operand is a NaN. Otherwise, the comparison is performed
1323 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1324 -------------------------------------------------------------------------------
1326 flag float32_eq_signaling( float32 a, float32 b )
1329 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1330 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1332 float_raise( float_flag_invalid );
1335 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1340 -------------------------------------------------------------------------------
1341 Returns 1 if the single-precision floating-point value `a' is less than or
1342 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1343 cause an exception. Otherwise, the comparison is performed according to the
1344 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1345 -------------------------------------------------------------------------------
1347 flag float32_le_quiet( float32 a, float32 b )
1352 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1353 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1355 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1356 float_raise( float_flag_invalid );
1360 aSign = extractFloat32Sign( a );
1361 bSign = extractFloat32Sign( b );
1362 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1363 return ( a == b ) || ( aSign ^ ( a < b ) );
1368 -------------------------------------------------------------------------------
1369 Returns 1 if the single-precision floating-point value `a' is less than
1370 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1371 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1372 Standard for Binary Floating-Point Arithmetic.
1373 -------------------------------------------------------------------------------
1375 flag float32_lt_quiet( float32 a, float32 b )
1379 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1380 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1382 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1383 float_raise( float_flag_invalid );
1387 aSign = extractFloat32Sign( a );
1388 bSign = extractFloat32Sign( b );
1389 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1390 return ( a != b ) && ( aSign ^ ( a < b ) );
1393 #endif /* !SOFTFLOAT_FOR_GCC */
1395 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1397 -------------------------------------------------------------------------------
1398 Returns the result of converting the double-precision floating-point value
1399 `a' to the 32-bit two's complement integer format. The conversion is
1400 performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 Arithmetic---which means in particular that the conversion is rounded
1402 according to the current rounding mode. If `a' is a NaN, the largest
1403 positive integer is returned. Otherwise, if the conversion overflows, the
1404 largest integer with the same sign as `a' is returned.
1405 -------------------------------------------------------------------------------
1407 int32 float64_to_int32( float64 a )
1410 int16 aExp, shiftCount;
1411 bits32 aSig0, aSig1, absZ, aSigExtra;
1415 aSig1 = extractFloat64Frac1( a );
1416 aSig0 = extractFloat64Frac0( a );
1417 aExp = extractFloat64Exp( a );
1418 aSign = extractFloat64Sign( a );
1419 shiftCount = aExp - 0x413;
1420 if ( 0 <= shiftCount ) {
1421 if ( 0x41E < aExp ) {
1422 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1426 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1427 if ( 0x80000000 < absZ ) goto invalid;
1430 aSig1 = ( aSig1 != 0 );
1431 if ( aExp < 0x3FE ) {
1432 aSigExtra = aExp | aSig0 | aSig1;
1436 aSig0 |= 0x00100000;
1437 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1438 absZ = aSig0>>( - shiftCount );
1441 roundingMode = float_rounding_mode;
1442 if ( roundingMode == float_round_nearest_even ) {
1443 if ( (sbits32) aSigExtra < 0 ) {
1445 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1447 z = aSign ? - absZ : absZ;
1450 aSigExtra = ( aSigExtra != 0 );
1453 + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1456 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1459 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1461 float_raise( float_flag_invalid );
1462 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1464 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1468 #endif /* !SOFTFLOAT_FOR_GCC */
1471 -------------------------------------------------------------------------------
1472 Returns the result of converting the double-precision floating-point value
1473 `a' to the 32-bit two's complement integer format. The conversion is
1474 performed according to the IEC/IEEE Standard for Binary Floating-Point
1475 Arithmetic, except that the conversion is always rounded toward zero.
1476 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1477 the conversion overflows, the largest integer with the same sign as `a' is
1479 -------------------------------------------------------------------------------
1481 int32 float64_to_int32_round_to_zero( float64 a )
1484 int16 aExp, shiftCount;
1485 bits32 aSig0, aSig1, absZ, aSigExtra;
1488 aSig1 = extractFloat64Frac1( a );
1489 aSig0 = extractFloat64Frac0( a );
1490 aExp = extractFloat64Exp( a );
1491 aSign = extractFloat64Sign( a );
1492 shiftCount = aExp - 0x413;
1493 if ( 0 <= shiftCount ) {
1494 if ( 0x41E < aExp ) {
1495 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1499 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1502 if ( aExp < 0x3FF ) {
1503 if ( aExp | aSig0 | aSig1 ) {
1504 float_exception_flags |= float_flag_inexact;
1508 aSig0 |= 0x00100000;
1509 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1510 absZ = aSig0>>( - shiftCount );
1512 z = aSign ? - absZ : absZ;
1513 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1515 float_raise( float_flag_invalid );
1516 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1518 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1524 -------------------------------------------------------------------------------
1525 Returns the result of converting the double-precision floating-point value
1526 `a' to the single-precision floating-point format. The conversion is
1527 performed according to the IEC/IEEE Standard for Binary Floating-Point
1529 -------------------------------------------------------------------------------
1531 float32 float64_to_float32( float64 a )
1535 bits32 aSig0, aSig1, zSig;
1538 aSig1 = extractFloat64Frac1( a );
1539 aSig0 = extractFloat64Frac0( a );
1540 aExp = extractFloat64Exp( a );
1541 aSign = extractFloat64Sign( a );
1542 if ( aExp == 0x7FF ) {
1543 if ( aSig0 | aSig1 ) {
1544 return commonNaNToFloat32( float64ToCommonNaN( a ) );
1546 return packFloat32( aSign, 0xFF, 0 );
1548 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1549 if ( aExp ) zSig |= 0x40000000;
1550 return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1554 #ifndef SOFTFLOAT_FOR_GCC
1556 -------------------------------------------------------------------------------
1557 Rounds the double-precision floating-point value `a' to an integer,
1558 and returns the result as a double-precision floating-point value. The
1559 operation is performed according to the IEC/IEEE Standard for Binary
1560 Floating-Point Arithmetic.
1561 -------------------------------------------------------------------------------
1563 float64 float64_round_to_int( float64 a )
1567 bits32 lastBitMask, roundBitsMask;
1571 aExp = extractFloat64Exp( a );
1572 if ( 0x413 <= aExp ) {
1573 if ( 0x433 <= aExp ) {
1574 if ( ( aExp == 0x7FF )
1575 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1576 return propagateFloat64NaN( a, a );
1581 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1582 roundBitsMask = lastBitMask - 1;
1584 roundingMode = float_rounding_mode;
1585 if ( roundingMode == float_round_nearest_even ) {
1586 if ( lastBitMask ) {
1587 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1588 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1591 if ( (sbits32) z.low < 0 ) {
1593 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1597 else if ( roundingMode != float_round_to_zero ) {
1598 if ( extractFloat64Sign( z )
1599 ^ ( roundingMode == float_round_up ) ) {
1600 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1603 z.low &= ~ roundBitsMask;
1606 if ( aExp <= 0x3FE ) {
1607 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1608 float_exception_flags |= float_flag_inexact;
1609 aSign = extractFloat64Sign( a );
1610 switch ( float_rounding_mode ) {
1611 case float_round_nearest_even:
1612 if ( ( aExp == 0x3FE )
1613 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1615 return packFloat64( aSign, 0x3FF, 0, 0 );
1618 case float_round_down:
1620 aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1621 : packFloat64( 0, 0, 0, 0 );
1622 case float_round_up:
1624 aSign ? packFloat64( 1, 0, 0, 0 )
1625 : packFloat64( 0, 0x3FF, 0, 0 );
1627 return packFloat64( aSign, 0, 0, 0 );
1630 lastBitMask <<= 0x413 - aExp;
1631 roundBitsMask = lastBitMask - 1;
1634 roundingMode = float_rounding_mode;
1635 if ( roundingMode == float_round_nearest_even ) {
1636 z.high += lastBitMask>>1;
1637 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1638 z.high &= ~ lastBitMask;
1641 else if ( roundingMode != float_round_to_zero ) {
1642 if ( extractFloat64Sign( z )
1643 ^ ( roundingMode == float_round_up ) ) {
1644 z.high |= ( a.low != 0 );
1645 z.high += roundBitsMask;
1648 z.high &= ~ roundBitsMask;
1650 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1651 float_exception_flags |= float_flag_inexact;
1659 -------------------------------------------------------------------------------
1660 Returns the result of adding the absolute values of the double-precision
1661 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1662 before being returned. `zSign' is ignored if the result is a NaN.
1663 The addition is performed according to the IEC/IEEE Standard for Binary
1664 Floating-Point Arithmetic.
1665 -------------------------------------------------------------------------------
1667 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1669 int16 aExp, bExp, zExp;
1670 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1673 aSig1 = extractFloat64Frac1( a );
1674 aSig0 = extractFloat64Frac0( a );
1675 aExp = extractFloat64Exp( a );
1676 bSig1 = extractFloat64Frac1( b );
1677 bSig0 = extractFloat64Frac0( b );
1678 bExp = extractFloat64Exp( b );
1679 expDiff = aExp - bExp;
1680 if ( 0 < expDiff ) {
1681 if ( aExp == 0x7FF ) {
1682 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1689 bSig0 |= 0x00100000;
1691 shift64ExtraRightJamming(
1692 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1695 else if ( expDiff < 0 ) {
1696 if ( bExp == 0x7FF ) {
1697 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1698 return packFloat64( zSign, 0x7FF, 0, 0 );
1704 aSig0 |= 0x00100000;
1706 shift64ExtraRightJamming(
1707 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1711 if ( aExp == 0x7FF ) {
1712 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1713 return propagateFloat64NaN( a, b );
1717 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1718 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1720 zSig0 |= 0x00200000;
1724 aSig0 |= 0x00100000;
1725 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1727 if ( zSig0 < 0x00200000 ) goto roundAndPack;
1730 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1732 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1737 -------------------------------------------------------------------------------
1738 Returns the result of subtracting the absolute values of the double-
1739 precision floating-point values `a' and `b'. If `zSign' is 1, the
1740 difference is negated before being returned. `zSign' is ignored if the
1741 result is a NaN. The subtraction is performed according to the IEC/IEEE
1742 Standard for Binary Floating-Point Arithmetic.
1743 -------------------------------------------------------------------------------
1745 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1747 int16 aExp, bExp, zExp;
1748 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1751 aSig1 = extractFloat64Frac1( a );
1752 aSig0 = extractFloat64Frac0( a );
1753 aExp = extractFloat64Exp( a );
1754 bSig1 = extractFloat64Frac1( b );
1755 bSig0 = extractFloat64Frac0( b );
1756 bExp = extractFloat64Exp( b );
1757 expDiff = aExp - bExp;
1758 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1759 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1760 if ( 0 < expDiff ) goto aExpBigger;
1761 if ( expDiff < 0 ) goto bExpBigger;
1762 if ( aExp == 0x7FF ) {
1763 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1764 return propagateFloat64NaN( a, b );
1766 float_raise( float_flag_invalid );
1767 return float64_default_nan;
1773 if ( bSig0 < aSig0 ) goto aBigger;
1774 if ( aSig0 < bSig0 ) goto bBigger;
1775 if ( bSig1 < aSig1 ) goto aBigger;
1776 if ( aSig1 < bSig1 ) goto bBigger;
1777 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1779 if ( bExp == 0x7FF ) {
1780 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1781 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1787 aSig0 |= 0x40000000;
1789 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1790 bSig0 |= 0x40000000;
1792 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1795 goto normalizeRoundAndPack;
1797 if ( aExp == 0x7FF ) {
1798 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1805 bSig0 |= 0x40000000;
1807 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1808 aSig0 |= 0x40000000;
1810 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1812 normalizeRoundAndPack:
1814 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1819 -------------------------------------------------------------------------------
1820 Returns the result of adding the double-precision floating-point values `a'
1821 and `b'. The operation is performed according to the IEC/IEEE Standard for
1822 Binary Floating-Point Arithmetic.
1823 -------------------------------------------------------------------------------
1825 float64 float64_add( float64 a, float64 b )
1829 aSign = extractFloat64Sign( a );
1830 bSign = extractFloat64Sign( b );
1831 if ( aSign == bSign ) {
1832 return addFloat64Sigs( a, b, aSign );
1835 return subFloat64Sigs( a, b, aSign );
1841 -------------------------------------------------------------------------------
1842 Returns the result of subtracting the double-precision floating-point values
1843 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1844 for Binary Floating-Point Arithmetic.
1845 -------------------------------------------------------------------------------
1847 float64 float64_sub( float64 a, float64 b )
1851 aSign = extractFloat64Sign( a );
1852 bSign = extractFloat64Sign( b );
1853 if ( aSign == bSign ) {
1854 return subFloat64Sigs( a, b, aSign );
1857 return addFloat64Sigs( a, b, aSign );
1863 -------------------------------------------------------------------------------
1864 Returns the result of multiplying the double-precision floating-point values
1865 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1866 for Binary Floating-Point Arithmetic.
1867 -------------------------------------------------------------------------------
1869 float64 float64_mul( float64 a, float64 b )
1871 flag aSign, bSign, zSign;
1872 int16 aExp, bExp, zExp;
1873 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1875 aSig1 = extractFloat64Frac1( a );
1876 aSig0 = extractFloat64Frac0( a );
1877 aExp = extractFloat64Exp( a );
1878 aSign = extractFloat64Sign( a );
1879 bSig1 = extractFloat64Frac1( b );
1880 bSig0 = extractFloat64Frac0( b );
1881 bExp = extractFloat64Exp( b );
1882 bSign = extractFloat64Sign( b );
1883 zSign = aSign ^ bSign;
1884 if ( aExp == 0x7FF ) {
1885 if ( ( aSig0 | aSig1 )
1886 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1887 return propagateFloat64NaN( a, b );
1889 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1890 return packFloat64( zSign, 0x7FF, 0, 0 );
1892 if ( bExp == 0x7FF ) {
1893 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1894 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1896 float_raise( float_flag_invalid );
1897 return float64_default_nan;
1899 return packFloat64( zSign, 0x7FF, 0, 0 );
1902 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1903 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1906 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1907 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1909 zExp = aExp + bExp - 0x400;
1910 aSig0 |= 0x00100000;
1911 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1912 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1913 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1914 zSig2 |= ( zSig3 != 0 );
1915 if ( 0x00200000 <= zSig0 ) {
1916 shift64ExtraRightJamming(
1917 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1920 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1925 -------------------------------------------------------------------------------
1926 Returns the result of dividing the double-precision floating-point value `a'
1927 by the corresponding value `b'. The operation is performed according to the
1928 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1929 -------------------------------------------------------------------------------
1931 float64 float64_div( float64 a, float64 b )
1933 flag aSign, bSign, zSign;
1934 int16 aExp, bExp, zExp;
1935 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1936 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1938 aSig1 = extractFloat64Frac1( a );
1939 aSig0 = extractFloat64Frac0( a );
1940 aExp = extractFloat64Exp( a );
1941 aSign = extractFloat64Sign( a );
1942 bSig1 = extractFloat64Frac1( b );
1943 bSig0 = extractFloat64Frac0( b );
1944 bExp = extractFloat64Exp( b );
1945 bSign = extractFloat64Sign( b );
1946 zSign = aSign ^ bSign;
1947 if ( aExp == 0x7FF ) {
1948 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1949 if ( bExp == 0x7FF ) {
1950 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1953 return packFloat64( zSign, 0x7FF, 0, 0 );
1955 if ( bExp == 0x7FF ) {
1956 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1957 return packFloat64( zSign, 0, 0, 0 );
1960 if ( ( bSig0 | bSig1 ) == 0 ) {
1961 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1963 float_raise( float_flag_invalid );
1964 return float64_default_nan;
1966 float_raise( float_flag_divbyzero );
1967 return packFloat64( zSign, 0x7FF, 0, 0 );
1969 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1972 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1973 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1975 zExp = aExp - bExp + 0x3FD;
1976 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1977 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1978 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1979 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1982 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1983 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1984 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1985 while ( (sbits32) rem0 < 0 ) {
1987 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1989 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1990 if ( ( zSig1 & 0x3FF ) <= 4 ) {
1991 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
1992 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
1993 while ( (sbits32) rem1 < 0 ) {
1995 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
1997 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
1999 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
2000 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
2004 #ifndef SOFTFLOAT_FOR_GCC
2006 -------------------------------------------------------------------------------
2007 Returns the remainder of the double-precision floating-point value `a'
2008 with respect to the corresponding value `b'. The operation is performed
2009 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2010 -------------------------------------------------------------------------------
2012 float64 float64_rem( float64 a, float64 b )
2014 flag aSign, bSign, zSign;
2015 int16 aExp, bExp, expDiff;
2016 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2017 bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2021 aSig1 = extractFloat64Frac1( a );
2022 aSig0 = extractFloat64Frac0( a );
2023 aExp = extractFloat64Exp( a );
2024 aSign = extractFloat64Sign( a );
2025 bSig1 = extractFloat64Frac1( b );
2026 bSig0 = extractFloat64Frac0( b );
2027 bExp = extractFloat64Exp( b );
2028 bSign = extractFloat64Sign( b );
2029 if ( aExp == 0x7FF ) {
2030 if ( ( aSig0 | aSig1 )
2031 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2032 return propagateFloat64NaN( a, b );
2036 if ( bExp == 0x7FF ) {
2037 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2041 if ( ( bSig0 | bSig1 ) == 0 ) {
2043 float_raise( float_flag_invalid );
2044 return float64_default_nan;
2046 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2049 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2050 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2052 expDiff = aExp - bExp;
2053 if ( expDiff < -1 ) return a;
2055 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2056 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2057 q = le64( bSig0, bSig1, aSig0, aSig1 );
2058 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2060 while ( 0 < expDiff ) {
2061 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2062 q = ( 4 < q ) ? q - 4 : 0;
2063 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2064 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2065 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2066 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2069 if ( -32 < expDiff ) {
2070 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2071 q = ( 4 < q ) ? q - 4 : 0;
2073 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2075 if ( expDiff < 0 ) {
2076 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2079 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2081 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2082 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2085 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2086 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2089 alternateASig0 = aSig0;
2090 alternateASig1 = aSig1;
2092 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2093 } while ( 0 <= (sbits32) aSig0 );
2095 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2096 if ( ( sigMean0 < 0 )
2097 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2098 aSig0 = alternateASig0;
2099 aSig1 = alternateASig1;
2101 zSign = ( (sbits32) aSig0 < 0 );
2102 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2104 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2109 #ifndef SOFTFLOAT_FOR_GCC
2111 -------------------------------------------------------------------------------
2112 Returns the square root of the double-precision floating-point value `a'.
2113 The operation is performed according to the IEC/IEEE Standard for Binary
2114 Floating-Point Arithmetic.
2115 -------------------------------------------------------------------------------
2117 float64 float64_sqrt( float64 a )
2121 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2122 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2125 aSig1 = extractFloat64Frac1( a );
2126 aSig0 = extractFloat64Frac0( a );
2127 aExp = extractFloat64Exp( a );
2128 aSign = extractFloat64Sign( a );
2129 if ( aExp == 0x7FF ) {
2130 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2131 if ( ! aSign ) return a;
2135 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2137 float_raise( float_flag_invalid );
2138 return float64_default_nan;
2141 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2142 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2144 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2145 aSig0 |= 0x00100000;
2146 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2147 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2148 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2149 doubleZSig0 = zSig0 + zSig0;
2150 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2151 mul32To64( zSig0, zSig0, &term0, &term1 );
2152 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2153 while ( (sbits32) rem0 < 0 ) {
2156 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2158 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2159 if ( ( zSig1 & 0x1FF ) <= 5 ) {
2160 if ( zSig1 == 0 ) zSig1 = 1;
2161 mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2162 sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2163 mul32To64( zSig1, zSig1, &term2, &term3 );
2164 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2165 while ( (sbits32) rem1 < 0 ) {
2167 shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2169 term2 |= doubleZSig0;
2170 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2172 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2174 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2175 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the double-precision floating-point value `a' is equal to
2183 the corresponding value `b', and 0 otherwise. The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2187 flag float64_eq( float64 a, float64 b )
2190 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2191 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2192 || ( ( extractFloat64Exp( b ) == 0x7FF )
2193 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2195 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2196 float_raise( float_flag_invalid );
2200 return ( a == b ) ||
2201 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2206 -------------------------------------------------------------------------------
2207 Returns 1 if the double-precision floating-point value `a' is less than
2208 or equal to the corresponding value `b', and 0 otherwise. The comparison
2209 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2211 -------------------------------------------------------------------------------
2213 flag float64_le( float64 a, float64 b )
2217 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2218 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2219 || ( ( extractFloat64Exp( b ) == 0x7FF )
2220 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2222 float_raise( float_flag_invalid );
2225 aSign = extractFloat64Sign( a );
2226 bSign = extractFloat64Sign( b );
2227 if ( aSign != bSign )
2229 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2231 return ( a == b ) ||
2232 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2236 -------------------------------------------------------------------------------
2237 Returns 1 if the double-precision floating-point value `a' is less than
2238 the corresponding value `b', and 0 otherwise. The comparison is performed
2239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2240 -------------------------------------------------------------------------------
2242 flag float64_lt( float64 a, float64 b )
2246 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2247 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2248 || ( ( extractFloat64Exp( b ) == 0x7FF )
2249 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2251 float_raise( float_flag_invalid );
2254 aSign = extractFloat64Sign( a );
2255 bSign = extractFloat64Sign( b );
2256 if ( aSign != bSign )
2258 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2260 return ( a != b ) &&
2261 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2265 #ifndef SOFTFLOAT_FOR_GCC
2267 -------------------------------------------------------------------------------
2268 Returns 1 if the double-precision floating-point value `a' is equal to
2269 the corresponding value `b', and 0 otherwise. The invalid exception is
2270 raised if either operand is a NaN. Otherwise, the comparison is performed
2271 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2272 -------------------------------------------------------------------------------
2274 flag float64_eq_signaling( float64 a, float64 b )
2277 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2278 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2279 || ( ( extractFloat64Exp( b ) == 0x7FF )
2280 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2282 float_raise( float_flag_invalid );
2285 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2290 -------------------------------------------------------------------------------
2291 Returns 1 if the double-precision floating-point value `a' is less than or
2292 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2293 cause an exception. Otherwise, the comparison is performed according to the
2294 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 -------------------------------------------------------------------------------
2297 flag float64_le_quiet( float64 a, float64 b )
2301 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2302 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2303 || ( ( extractFloat64Exp( b ) == 0x7FF )
2304 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2306 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2307 float_raise( float_flag_invalid );
2311 aSign = extractFloat64Sign( a );
2312 bSign = extractFloat64Sign( b );
2313 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2314 return ( a == b ) || ( aSign ^ ( a < b ) );
2319 -------------------------------------------------------------------------------
2320 Returns 1 if the double-precision floating-point value `a' is less than
2321 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2322 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2323 Standard for Binary Floating-Point Arithmetic.
2324 -------------------------------------------------------------------------------
2326 flag float64_lt_quiet( float64 a, float64 b )
2330 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2331 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2332 || ( ( extractFloat64Exp( b ) == 0x7FF )
2333 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2335 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2336 float_raise( float_flag_invalid );
2340 aSign = extractFloat64Sign( a );
2341 bSign = extractFloat64Sign( b );
2342 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2343 return ( a != b ) && ( aSign ^ ( a < b ) );