1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
50 #ifdef SOFTFLOAT_FOR_GCC
51 #include "softfloat-for-gcc.h"
55 #include "softfloat.h"
58 * Conversions between floats as stored in memory and floats as
61 #ifndef FLOAT64_DEMANGLE
62 #define FLOAT64_DEMANGLE(a) (a)
64 #ifndef FLOAT64_MANGLE
65 #define FLOAT64_MANGLE(a) (a)
69 -------------------------------------------------------------------------------
70 Floating-point rounding mode, extended double-precision rounding precision,
72 -------------------------------------------------------------------------------
74 int float_rounding_mode = float_round_nearest_even;
75 int float_exception_flags = 0;
77 int8 floatx80_rounding_precision = 80;
81 -------------------------------------------------------------------------------
82 Primitive arithmetic functions, including multi-word arithmetic, and
83 division and square root approximations. (Can be specialized to target if
85 -------------------------------------------------------------------------------
87 #include "softfloat-macros"
90 -------------------------------------------------------------------------------
91 Functions and definitions to determine: (1) whether tininess for underflow
92 is detected before or after rounding by default, (2) what (if anything)
93 happens when exceptions are raised, (3) how signaling NaNs are distinguished
94 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
95 are propagated from function inputs to output. These details are target-
97 -------------------------------------------------------------------------------
99 #include "softfloat-specialize"
101 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
103 -------------------------------------------------------------------------------
104 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
105 and 7, and returns the properly rounded 32-bit integer corresponding to the
106 input. If `zSign' is 1, the input is negated before being converted to an
107 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
108 is simply rounded to an integer, with the inexact exception raised if the
109 input cannot be represented exactly as an integer. However, if the fixed-
110 point input is too large, the invalid exception is raised and the largest
111 positive or negative integer is returned.
112 -------------------------------------------------------------------------------
114 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 flag roundNearestEven;
118 int8 roundIncrement, roundBits;
121 roundingMode = float_rounding_mode;
122 roundNearestEven = ( roundingMode == float_round_nearest_even );
123 roundIncrement = 0x40;
124 if ( ! roundNearestEven ) {
125 if ( roundingMode == float_round_to_zero ) {
129 roundIncrement = 0x7F;
131 if ( roundingMode == float_round_up ) roundIncrement = 0;
134 if ( roundingMode == float_round_down ) roundIncrement = 0;
138 roundBits = absZ & 0x7F;
139 absZ = ( absZ + roundIncrement )>>7;
140 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 if ( zSign ) z = - z;
143 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
144 float_raise( float_flag_invalid );
145 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
147 if ( roundBits ) float_exception_flags |= float_flag_inexact;
153 -------------------------------------------------------------------------------
154 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 `absZ1', with binary point between bits 63 and 64 (between the input words),
156 and returns the properly rounded 64-bit integer corresponding to the input.
157 If `zSign' is 1, the input is negated before being converted to an integer.
158 Ordinarily, the fixed-point input is simply rounded to an integer, with
159 the inexact exception raised if the input cannot be represented exactly as
160 an integer. However, if the fixed-point input is too large, the invalid
161 exception is raised and the largest positive or negative integer is
163 -------------------------------------------------------------------------------
165 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
168 flag roundNearestEven, increment;
171 roundingMode = float_rounding_mode;
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
173 increment = ( (sbits64) absZ1 < 0 );
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
183 increment = ( roundingMode == float_round_up ) && absZ1;
189 if ( absZ0 == 0 ) goto overflow;
190 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
196 float_raise( float_flag_invalid );
198 zSign ? (sbits64) LIT64( 0x8000000000000000 )
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
201 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
208 -------------------------------------------------------------------------------
209 Returns the fraction bits of the single-precision floating-point value `a'.
210 -------------------------------------------------------------------------------
212 INLINE bits32 extractFloat32Frac( float32 a )
215 return a & 0x007FFFFF;
220 -------------------------------------------------------------------------------
221 Returns the exponent bits of the single-precision floating-point value `a'.
222 -------------------------------------------------------------------------------
224 INLINE int16 extractFloat32Exp( float32 a )
227 return ( a>>23 ) & 0xFF;
232 -------------------------------------------------------------------------------
233 Returns the sign bit of the single-precision floating-point value `a'.
234 -------------------------------------------------------------------------------
236 INLINE flag extractFloat32Sign( float32 a )
244 -------------------------------------------------------------------------------
245 Normalizes the subnormal single-precision floating-point value represented
246 by the denormalized significand `aSig'. The normalized exponent and
247 significand are stored at the locations pointed to by `zExpPtr' and
248 `zSigPtr', respectively.
249 -------------------------------------------------------------------------------
252 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
256 shiftCount = countLeadingZeros32( aSig ) - 8;
257 *zSigPtr = aSig<<shiftCount;
258 *zExpPtr = 1 - shiftCount;
263 -------------------------------------------------------------------------------
264 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
265 single-precision floating-point value, returning the result. After being
266 shifted into the proper positions, the three fields are simply added
267 together to form the result. This means that any integer portion of `zSig'
268 will be added into the exponent. Since a properly normalized significand
269 will have an integer portion equal to 1, the `zExp' input should be 1 less
270 than the desired result exponent whenever `zSig' is a complete, normalized
272 -------------------------------------------------------------------------------
274 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
277 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
282 -------------------------------------------------------------------------------
283 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
284 and significand `zSig', and returns the proper single-precision floating-
285 point value corresponding to the abstract input. Ordinarily, the abstract
286 value is simply rounded and packed into the single-precision format, with
287 the inexact exception raised if the abstract input cannot be represented
288 exactly. However, if the abstract value is too large, the overflow and
289 inexact exceptions are raised and an infinity or maximal finite value is
290 returned. If the abstract value is too small, the input value is rounded to
291 a subnormal number, and the underflow and inexact exceptions are raised if
292 the abstract input cannot be represented exactly as a subnormal single-
293 precision floating-point number.
294 The input significand `zSig' has its binary point between bits 30
295 and 29, which is 7 bits to the left of the usual location. This shifted
296 significand must be normalized or smaller. If `zSig' is not normalized,
297 `zExp' must be 0; in that case, the result returned is a subnormal number,
298 and it must not require rounding. In the usual case that `zSig' is
299 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
300 The handling of underflow and overflow follows the IEC/IEEE Standard for
301 Binary Floating-Point Arithmetic.
302 -------------------------------------------------------------------------------
304 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
307 flag roundNearestEven;
308 int8 roundIncrement, roundBits;
311 roundingMode = float_rounding_mode;
312 roundNearestEven = ( roundingMode == float_round_nearest_even );
313 roundIncrement = 0x40;
314 if ( ! roundNearestEven ) {
315 if ( roundingMode == float_round_to_zero ) {
319 roundIncrement = 0x7F;
321 if ( roundingMode == float_round_up ) roundIncrement = 0;
324 if ( roundingMode == float_round_down ) roundIncrement = 0;
328 roundBits = zSig & 0x7F;
329 if ( 0xFD <= (bits16) zExp ) {
331 || ( ( zExp == 0xFD )
332 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
334 float_raise( float_flag_overflow | float_flag_inexact );
335 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
339 ( float_detect_tininess == float_tininess_before_rounding )
341 || ( zSig + roundIncrement < 0x80000000 );
342 shift32RightJamming( zSig, - zExp, &zSig );
344 roundBits = zSig & 0x7F;
345 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
348 if ( roundBits ) float_exception_flags |= float_flag_inexact;
349 zSig = ( zSig + roundIncrement )>>7;
350 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
351 if ( zSig == 0 ) zExp = 0;
352 return packFloat32( zSign, zExp, zSig );
357 -------------------------------------------------------------------------------
358 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
359 and significand `zSig', and returns the proper single-precision floating-
360 point value corresponding to the abstract input. This routine is just like
361 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
362 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
363 floating-point exponent.
364 -------------------------------------------------------------------------------
367 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
371 shiftCount = countLeadingZeros32( zSig ) - 1;
372 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
377 -------------------------------------------------------------------------------
378 Returns the fraction bits of the double-precision floating-point value `a'.
379 -------------------------------------------------------------------------------
381 INLINE bits64 extractFloat64Frac( float64 a )
384 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
389 -------------------------------------------------------------------------------
390 Returns the exponent bits of the double-precision floating-point value `a'.
391 -------------------------------------------------------------------------------
393 INLINE int16 extractFloat64Exp( float64 a )
396 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
401 -------------------------------------------------------------------------------
402 Returns the sign bit of the double-precision floating-point value `a'.
403 -------------------------------------------------------------------------------
405 INLINE flag extractFloat64Sign( float64 a )
408 return FLOAT64_DEMANGLE(a)>>63;
413 -------------------------------------------------------------------------------
414 Normalizes the subnormal double-precision floating-point value represented
415 by the denormalized significand `aSig'. The normalized exponent and
416 significand are stored at the locations pointed to by `zExpPtr' and
417 `zSigPtr', respectively.
418 -------------------------------------------------------------------------------
421 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
425 shiftCount = countLeadingZeros64( aSig ) - 11;
426 *zSigPtr = aSig<<shiftCount;
427 *zExpPtr = 1 - shiftCount;
432 -------------------------------------------------------------------------------
433 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
434 double-precision floating-point value, returning the result. After being
435 shifted into the proper positions, the three fields are simply added
436 together to form the result. This means that any integer portion of `zSig'
437 will be added into the exponent. Since a properly normalized significand
438 will have an integer portion equal to 1, the `zExp' input should be 1 less
439 than the desired result exponent whenever `zSig' is a complete, normalized
441 -------------------------------------------------------------------------------
443 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
446 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
447 ( ( (bits64) zExp )<<52 ) + zSig );
452 -------------------------------------------------------------------------------
453 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
454 and significand `zSig', and returns the proper double-precision floating-
455 point value corresponding to the abstract input. Ordinarily, the abstract
456 value is simply rounded and packed into the double-precision format, with
457 the inexact exception raised if the abstract input cannot be represented
458 exactly. However, if the abstract value is too large, the overflow and
459 inexact exceptions are raised and an infinity or maximal finite value is
460 returned. If the abstract value is too small, the input value is rounded to
461 a subnormal number, and the underflow and inexact exceptions are raised if
462 the abstract input cannot be represented exactly as a subnormal double-
463 precision floating-point number.
464 The input significand `zSig' has its binary point between bits 62
465 and 61, which is 10 bits to the left of the usual location. This shifted
466 significand must be normalized or smaller. If `zSig' is not normalized,
467 `zExp' must be 0; in that case, the result returned is a subnormal number,
468 and it must not require rounding. In the usual case that `zSig' is
469 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
470 The handling of underflow and overflow follows the IEC/IEEE Standard for
471 Binary Floating-Point Arithmetic.
472 -------------------------------------------------------------------------------
474 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
477 flag roundNearestEven;
478 int16 roundIncrement, roundBits;
481 roundingMode = float_rounding_mode;
482 roundNearestEven = ( roundingMode == float_round_nearest_even );
483 roundIncrement = 0x200;
484 if ( ! roundNearestEven ) {
485 if ( roundingMode == float_round_to_zero ) {
489 roundIncrement = 0x3FF;
491 if ( roundingMode == float_round_up ) roundIncrement = 0;
494 if ( roundingMode == float_round_down ) roundIncrement = 0;
498 roundBits = zSig & 0x3FF;
499 if ( 0x7FD <= (bits16) zExp ) {
500 if ( ( 0x7FD < zExp )
501 || ( ( zExp == 0x7FD )
502 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
504 float_raise( float_flag_overflow | float_flag_inexact );
505 return FLOAT64_MANGLE(
506 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
507 ( roundIncrement == 0 ));
511 ( float_detect_tininess == float_tininess_before_rounding )
513 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
514 shift64RightJamming( zSig, - zExp, &zSig );
516 roundBits = zSig & 0x3FF;
517 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
520 if ( roundBits ) float_exception_flags |= float_flag_inexact;
521 zSig = ( zSig + roundIncrement )>>10;
522 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
523 if ( zSig == 0 ) zExp = 0;
524 return packFloat64( zSign, zExp, zSig );
529 -------------------------------------------------------------------------------
530 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
531 and significand `zSig', and returns the proper double-precision floating-
532 point value corresponding to the abstract input. This routine is just like
533 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
534 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
535 floating-point exponent.
536 -------------------------------------------------------------------------------
539 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
543 shiftCount = countLeadingZeros64( zSig ) - 1;
544 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
551 -------------------------------------------------------------------------------
552 Returns the fraction bits of the extended double-precision floating-point
554 -------------------------------------------------------------------------------
556 INLINE bits64 extractFloatx80Frac( floatx80 a )
564 -------------------------------------------------------------------------------
565 Returns the exponent bits of the extended double-precision floating-point
567 -------------------------------------------------------------------------------
569 INLINE int32 extractFloatx80Exp( floatx80 a )
572 return a.high & 0x7FFF;
577 -------------------------------------------------------------------------------
578 Returns the sign bit of the extended double-precision floating-point value
580 -------------------------------------------------------------------------------
582 INLINE flag extractFloatx80Sign( floatx80 a )
590 -------------------------------------------------------------------------------
591 Normalizes the subnormal extended double-precision floating-point value
592 represented by the denormalized significand `aSig'. The normalized exponent
593 and significand are stored at the locations pointed to by `zExpPtr' and
594 `zSigPtr', respectively.
595 -------------------------------------------------------------------------------
598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
602 shiftCount = countLeadingZeros64( aSig );
603 *zSigPtr = aSig<<shiftCount;
604 *zExpPtr = 1 - shiftCount;
609 -------------------------------------------------------------------------------
610 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
611 extended double-precision floating-point value, returning the result.
612 -------------------------------------------------------------------------------
614 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
619 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
625 -------------------------------------------------------------------------------
626 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
627 and extended significand formed by the concatenation of `zSig0' and `zSig1',
628 and returns the proper extended double-precision floating-point value
629 corresponding to the abstract input. Ordinarily, the abstract value is
630 rounded and packed into the extended double-precision format, with the
631 inexact exception raised if the abstract input cannot be represented
632 exactly. However, if the abstract value is too large, the overflow and
633 inexact exceptions are raised and an infinity or maximal finite value is
634 returned. If the abstract value is too small, the input value is rounded to
635 a subnormal number, and the underflow and inexact exceptions are raised if
636 the abstract input cannot be represented exactly as a subnormal extended
637 double-precision floating-point number.
638 If `roundingPrecision' is 32 or 64, the result is rounded to the same
639 number of bits as single or double precision, respectively. Otherwise, the
640 result is rounded to the full precision of the extended double-precision
642 The input significand must be normalized or smaller. If the input
643 significand is not normalized, `zExp' must be 0; in that case, the result
644 returned is a subnormal number, and it must not require rounding. The
645 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
646 Floating-Point Arithmetic.
647 -------------------------------------------------------------------------------
650 roundAndPackFloatx80(
651 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
655 flag roundNearestEven, increment, isTiny;
656 int64 roundIncrement, roundMask, roundBits;
658 roundingMode = float_rounding_mode;
659 roundNearestEven = ( roundingMode == float_round_nearest_even );
660 if ( roundingPrecision == 80 ) goto precision80;
661 if ( roundingPrecision == 64 ) {
662 roundIncrement = LIT64( 0x0000000000000400 );
663 roundMask = LIT64( 0x00000000000007FF );
665 else if ( roundingPrecision == 32 ) {
666 roundIncrement = LIT64( 0x0000008000000000 );
667 roundMask = LIT64( 0x000000FFFFFFFFFF );
672 zSig0 |= ( zSig1 != 0 );
673 if ( ! roundNearestEven ) {
674 if ( roundingMode == float_round_to_zero ) {
678 roundIncrement = roundMask;
680 if ( roundingMode == float_round_up ) roundIncrement = 0;
683 if ( roundingMode == float_round_down ) roundIncrement = 0;
687 roundBits = zSig0 & roundMask;
688 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
689 if ( ( 0x7FFE < zExp )
690 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
696 ( float_detect_tininess == float_tininess_before_rounding )
698 || ( zSig0 <= zSig0 + roundIncrement );
699 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
701 roundBits = zSig0 & roundMask;
702 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
703 if ( roundBits ) float_exception_flags |= float_flag_inexact;
704 zSig0 += roundIncrement;
705 if ( (sbits64) zSig0 < 0 ) zExp = 1;
706 roundIncrement = roundMask + 1;
707 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
708 roundMask |= roundIncrement;
710 zSig0 &= ~ roundMask;
711 return packFloatx80( zSign, zExp, zSig0 );
714 if ( roundBits ) float_exception_flags |= float_flag_inexact;
715 zSig0 += roundIncrement;
716 if ( zSig0 < roundIncrement ) {
718 zSig0 = LIT64( 0x8000000000000000 );
720 roundIncrement = roundMask + 1;
721 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
722 roundMask |= roundIncrement;
724 zSig0 &= ~ roundMask;
725 if ( zSig0 == 0 ) zExp = 0;
726 return packFloatx80( zSign, zExp, zSig0 );
728 increment = ( (sbits64) zSig1 < 0 );
729 if ( ! roundNearestEven ) {
730 if ( roundingMode == float_round_to_zero ) {
735 increment = ( roundingMode == float_round_down ) && zSig1;
738 increment = ( roundingMode == float_round_up ) && zSig1;
742 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
743 if ( ( 0x7FFE < zExp )
744 || ( ( zExp == 0x7FFE )
745 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
751 float_raise( float_flag_overflow | float_flag_inexact );
752 if ( ( roundingMode == float_round_to_zero )
753 || ( zSign && ( roundingMode == float_round_up ) )
754 || ( ! zSign && ( roundingMode == float_round_down ) )
756 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
758 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
762 ( float_detect_tininess == float_tininess_before_rounding )
765 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
766 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
768 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
769 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
770 if ( roundNearestEven ) {
771 increment = ( (sbits64) zSig1 < 0 );
775 increment = ( roundingMode == float_round_down ) && zSig1;
778 increment = ( roundingMode == float_round_up ) && zSig1;
784 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
785 if ( (sbits64) zSig0 < 0 ) zExp = 1;
787 return packFloatx80( zSign, zExp, zSig0 );
790 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
795 zSig0 = LIT64( 0x8000000000000000 );
798 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
802 if ( zSig0 == 0 ) zExp = 0;
804 return packFloatx80( zSign, zExp, zSig0 );
809 -------------------------------------------------------------------------------
810 Takes an abstract floating-point value having sign `zSign', exponent
811 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
812 and returns the proper extended double-precision floating-point value
813 corresponding to the abstract input. This routine is just like
814 `roundAndPackFloatx80' except that the input significand does not have to be
816 -------------------------------------------------------------------------------
819 normalizeRoundAndPackFloatx80(
820 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
830 shiftCount = countLeadingZeros64( zSig0 );
831 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
843 -------------------------------------------------------------------------------
844 Returns the least-significant 64 fraction bits of the quadruple-precision
845 floating-point value `a'.
846 -------------------------------------------------------------------------------
848 INLINE bits64 extractFloat128Frac1( float128 a )
856 -------------------------------------------------------------------------------
857 Returns the most-significant 48 fraction bits of the quadruple-precision
858 floating-point value `a'.
859 -------------------------------------------------------------------------------
861 INLINE bits64 extractFloat128Frac0( float128 a )
864 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
869 -------------------------------------------------------------------------------
870 Returns the exponent bits of the quadruple-precision floating-point value
872 -------------------------------------------------------------------------------
874 INLINE int32 extractFloat128Exp( float128 a )
877 return ( a.high>>48 ) & 0x7FFF;
882 -------------------------------------------------------------------------------
883 Returns the sign bit of the quadruple-precision floating-point value `a'.
884 -------------------------------------------------------------------------------
886 INLINE flag extractFloat128Sign( float128 a )
894 -------------------------------------------------------------------------------
895 Normalizes the subnormal quadruple-precision floating-point value
896 represented by the denormalized significand formed by the concatenation of
897 `aSig0' and `aSig1'. The normalized exponent is stored at the location
898 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
899 significand are stored at the location pointed to by `zSig0Ptr', and the
900 least significant 64 bits of the normalized significand are stored at the
901 location pointed to by `zSig1Ptr'.
902 -------------------------------------------------------------------------------
905 normalizeFloat128Subnormal(
916 shiftCount = countLeadingZeros64( aSig1 ) - 15;
917 if ( shiftCount < 0 ) {
918 *zSig0Ptr = aSig1>>( - shiftCount );
919 *zSig1Ptr = aSig1<<( shiftCount & 63 );
922 *zSig0Ptr = aSig1<<shiftCount;
925 *zExpPtr = - shiftCount - 63;
928 shiftCount = countLeadingZeros64( aSig0 ) - 15;
929 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
930 *zExpPtr = 1 - shiftCount;
936 -------------------------------------------------------------------------------
937 Packs the sign `zSign', the exponent `zExp', and the significand formed
938 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
939 floating-point value, returning the result. After being shifted into the
940 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
941 added together to form the most significant 32 bits of the result. This
942 means that any integer portion of `zSig0' will be added into the exponent.
943 Since a properly normalized significand will have an integer portion equal
944 to 1, the `zExp' input should be 1 less than the desired result exponent
945 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947 -------------------------------------------------------------------------------
950 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
955 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
961 -------------------------------------------------------------------------------
962 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963 and extended significand formed by the concatenation of `zSig0', `zSig1',
964 and `zSig2', and returns the proper quadruple-precision floating-point value
965 corresponding to the abstract input. Ordinarily, the abstract value is
966 simply rounded and packed into the quadruple-precision format, with the
967 inexact exception raised if the abstract input cannot be represented
968 exactly. However, if the abstract value is too large, the overflow and
969 inexact exceptions are raised and an infinity or maximal finite value is
970 returned. If the abstract value is too small, the input value is rounded to
971 a subnormal number, and the underflow and inexact exceptions are raised if
972 the abstract input cannot be represented exactly as a subnormal quadruple-
973 precision floating-point number.
974 The input significand must be normalized or smaller. If the input
975 significand is not normalized, `zExp' must be 0; in that case, the result
976 returned is a subnormal number, and it must not require rounding. In the
977 usual case that the input significand is normalized, `zExp' must be 1 less
978 than the ``true'' floating-point exponent. The handling of underflow and
979 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980 -------------------------------------------------------------------------------
983 roundAndPackFloat128(
984 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
987 flag roundNearestEven, increment, isTiny;
989 roundingMode = float_rounding_mode;
990 roundNearestEven = ( roundingMode == float_round_nearest_even );
991 increment = ( (sbits64) zSig2 < 0 );
992 if ( ! roundNearestEven ) {
993 if ( roundingMode == float_round_to_zero ) {
998 increment = ( roundingMode == float_round_down ) && zSig2;
1001 increment = ( roundingMode == float_round_up ) && zSig2;
1005 if ( 0x7FFD <= (bits32) zExp ) {
1006 if ( ( 0x7FFD < zExp )
1007 || ( ( zExp == 0x7FFD )
1009 LIT64( 0x0001FFFFFFFFFFFF ),
1010 LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 float_raise( float_flag_overflow | float_flag_inexact );
1018 if ( ( roundingMode == float_round_to_zero )
1019 || ( zSign && ( roundingMode == float_round_up ) )
1020 || ( ! zSign && ( roundingMode == float_round_down ) )
1026 LIT64( 0x0000FFFFFFFFFFFF ),
1027 LIT64( 0xFFFFFFFFFFFFFFFF )
1030 return packFloat128( zSign, 0x7FFF, 0, 0 );
1034 ( float_detect_tininess == float_tininess_before_rounding )
1040 LIT64( 0x0001FFFFFFFFFFFF ),
1041 LIT64( 0xFFFFFFFFFFFFFFFF )
1043 shift128ExtraRightJamming(
1044 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1047 if ( roundNearestEven ) {
1048 increment = ( (sbits64) zSig2 < 0 );
1052 increment = ( roundingMode == float_round_down ) && zSig2;
1055 increment = ( roundingMode == float_round_up ) && zSig2;
1060 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1062 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1063 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068 return packFloat128( zSign, zExp, zSig0, zSig1 );
1073 -------------------------------------------------------------------------------
1074 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 returns the proper quadruple-precision floating-point value corresponding
1077 to the abstract input. This routine is just like `roundAndPackFloat128'
1078 except that the input significand has fewer bits and does not have to be
1079 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1081 -------------------------------------------------------------------------------
1084 normalizeRoundAndPackFloat128(
1085 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1095 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096 if ( 0 <= shiftCount ) {
1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101 shift128ExtraRightJamming(
1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1112 -------------------------------------------------------------------------------
1113 Returns the result of converting the 32-bit two's complement integer `a'
1114 to the single-precision floating-point format. The conversion is performed
1115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1116 -------------------------------------------------------------------------------
1118 float32 int32_to_float32( int32 a )
1122 if ( a == 0 ) return 0;
1123 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1125 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1129 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */
1130 float32 uint32_to_float32( uint32 a )
1132 if ( a == 0 ) return 0;
1133 if ( a & (bits32) 0x80000000 )
1134 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1135 return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1141 -------------------------------------------------------------------------------
1142 Returns the result of converting the 32-bit two's complement integer `a'
1143 to the double-precision floating-point format. The conversion is performed
1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 -------------------------------------------------------------------------------
1147 float64 int32_to_float64( int32 a )
1154 if ( a == 0 ) return 0;
1156 absA = zSign ? - a : a;
1157 shiftCount = countLeadingZeros32( absA ) + 21;
1159 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1163 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */
1164 float64 uint32_to_float64( uint32 a )
1169 if ( a == 0 ) return 0;
1170 shiftCount = countLeadingZeros32( a ) + 21;
1171 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1179 -------------------------------------------------------------------------------
1180 Returns the result of converting the 32-bit two's complement integer `a'
1181 to the extended double-precision floating-point format. The conversion
1182 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1184 -------------------------------------------------------------------------------
1186 floatx80 int32_to_floatx80( int32 a )
1193 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1195 absA = zSign ? - a : a;
1196 shiftCount = countLeadingZeros32( absA ) + 32;
1198 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1202 floatx80 uint32_to_floatx80( uint32 a )
1207 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1208 shiftCount = countLeadingZeros32( a ) + 32;
1209 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1218 -------------------------------------------------------------------------------
1219 Returns the result of converting the 32-bit two's complement integer `a' to
1220 the quadruple-precision floating-point format. The conversion is performed
1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1222 -------------------------------------------------------------------------------
1224 float128 int32_to_float128( int32 a )
1231 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1233 absA = zSign ? - a : a;
1234 shiftCount = countLeadingZeros32( absA ) + 17;
1236 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1240 float128 uint32_to_float128( uint32 a )
1245 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1246 shiftCount = countLeadingZeros32( a ) + 17;
1247 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1253 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1255 -------------------------------------------------------------------------------
1256 Returns the result of converting the 64-bit two's complement integer `a'
1257 to the single-precision floating-point format. The conversion is performed
1258 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 -------------------------------------------------------------------------------
1261 float32 int64_to_float32( int64 a )
1267 if ( a == 0 ) return 0;
1269 absA = zSign ? - a : a;
1270 shiftCount = countLeadingZeros64( absA ) - 40;
1271 if ( 0 <= shiftCount ) {
1272 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1276 if ( shiftCount < 0 ) {
1277 shift64RightJamming( absA, - shiftCount, &absA );
1280 absA <<= shiftCount;
1282 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1288 -------------------------------------------------------------------------------
1289 Returns the result of converting the 64-bit two's complement integer `a'
1290 to the double-precision floating-point format. The conversion is performed
1291 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1292 -------------------------------------------------------------------------------
1294 float64 int64_to_float64( int64 a )
1298 if ( a == 0 ) return 0;
1299 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1300 return packFloat64( 1, 0x43E, 0 );
1303 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1310 -------------------------------------------------------------------------------
1311 Returns the result of converting the 64-bit two's complement integer `a'
1312 to the extended double-precision floating-point format. The conversion
1313 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315 -------------------------------------------------------------------------------
1317 floatx80 int64_to_floatx80( int64 a )
1323 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1325 absA = zSign ? - a : a;
1326 shiftCount = countLeadingZeros64( absA );
1327 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1333 #endif /* !SOFTFLOAT_FOR_GCC */
1338 -------------------------------------------------------------------------------
1339 Returns the result of converting the 64-bit two's complement integer `a' to
1340 the quadruple-precision floating-point format. The conversion is performed
1341 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1342 -------------------------------------------------------------------------------
1344 float128 int64_to_float128( int64 a )
1350 bits64 zSig0, zSig1;
1352 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1354 absA = zSign ? - a : a;
1355 shiftCount = countLeadingZeros64( absA ) + 49;
1356 zExp = 0x406E - shiftCount;
1357 if ( 64 <= shiftCount ) {
1366 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1367 return packFloat128( zSign, zExp, zSig0, zSig1 );
1373 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1375 -------------------------------------------------------------------------------
1376 Returns the result of converting the single-precision floating-point value
1377 `a' to the 32-bit two's complement integer format. The conversion is
1378 performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 Arithmetic---which means in particular that the conversion is rounded
1380 according to the current rounding mode. If `a' is a NaN, the largest
1381 positive integer is returned. Otherwise, if the conversion overflows, the
1382 largest integer with the same sign as `a' is returned.
1383 -------------------------------------------------------------------------------
1385 int32 float32_to_int32( float32 a )
1388 int16 aExp, shiftCount;
1392 aSig = extractFloat32Frac( a );
1393 aExp = extractFloat32Exp( a );
1394 aSign = extractFloat32Sign( a );
1395 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1396 if ( aExp ) aSig |= 0x00800000;
1397 shiftCount = 0xAF - aExp;
1400 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1401 return roundAndPackInt32( aSign, aSig64 );
1404 #endif /* !SOFTFLOAT_FOR_GCC */
1407 -------------------------------------------------------------------------------
1408 Returns the result of converting the single-precision floating-point value
1409 `a' to the 32-bit two's complement integer format. The conversion is
1410 performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 Arithmetic, except that the conversion is always rounded toward zero.
1412 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1413 the conversion overflows, the largest integer with the same sign as `a' is
1415 -------------------------------------------------------------------------------
1417 int32 float32_to_int32_round_to_zero( float32 a )
1420 int16 aExp, shiftCount;
1424 aSig = extractFloat32Frac( a );
1425 aExp = extractFloat32Exp( a );
1426 aSign = extractFloat32Sign( a );
1427 shiftCount = aExp - 0x9E;
1428 if ( 0 <= shiftCount ) {
1429 if ( a != 0xCF000000 ) {
1430 float_raise( float_flag_invalid );
1431 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1433 return (sbits32) 0x80000000;
1435 else if ( aExp <= 0x7E ) {
1436 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1439 aSig = ( aSig | 0x00800000 )<<8;
1440 z = aSig>>( - shiftCount );
1441 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1442 float_exception_flags |= float_flag_inexact;
1444 if ( aSign ) z = - z;
1449 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1451 -------------------------------------------------------------------------------
1452 Returns the result of converting the single-precision floating-point value
1453 `a' to the 64-bit two's complement integer format. The conversion is
1454 performed according to the IEC/IEEE Standard for Binary Floating-Point
1455 Arithmetic---which means in particular that the conversion is rounded
1456 according to the current rounding mode. If `a' is a NaN, the largest
1457 positive integer is returned. Otherwise, if the conversion overflows, the
1458 largest integer with the same sign as `a' is returned.
1459 -------------------------------------------------------------------------------
1461 int64 float32_to_int64( float32 a )
1464 int16 aExp, shiftCount;
1466 bits64 aSig64, aSigExtra;
1468 aSig = extractFloat32Frac( a );
1469 aExp = extractFloat32Exp( a );
1470 aSign = extractFloat32Sign( a );
1471 shiftCount = 0xBE - aExp;
1472 if ( shiftCount < 0 ) {
1473 float_raise( float_flag_invalid );
1474 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1475 return LIT64( 0x7FFFFFFFFFFFFFFF );
1477 return (sbits64) LIT64( 0x8000000000000000 );
1479 if ( aExp ) aSig |= 0x00800000;
1482 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1483 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1488 -------------------------------------------------------------------------------
1489 Returns the result of converting the single-precision floating-point value
1490 `a' to the 64-bit two's complement integer format. The conversion is
1491 performed according to the IEC/IEEE Standard for Binary Floating-Point
1492 Arithmetic, except that the conversion is always rounded toward zero. If
1493 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1494 conversion overflows, the largest integer with the same sign as `a' is
1496 -------------------------------------------------------------------------------
1498 int64 float32_to_int64_round_to_zero( float32 a )
1501 int16 aExp, shiftCount;
1506 aSig = extractFloat32Frac( a );
1507 aExp = extractFloat32Exp( a );
1508 aSign = extractFloat32Sign( a );
1509 shiftCount = aExp - 0xBE;
1510 if ( 0 <= shiftCount ) {
1511 if ( a != 0xDF000000 ) {
1512 float_raise( float_flag_invalid );
1513 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1514 return LIT64( 0x7FFFFFFFFFFFFFFF );
1517 return (sbits64) LIT64( 0x8000000000000000 );
1519 else if ( aExp <= 0x7E ) {
1520 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1523 aSig64 = aSig | 0x00800000;
1525 z = aSig64>>( - shiftCount );
1526 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1527 float_exception_flags |= float_flag_inexact;
1529 if ( aSign ) z = - z;
1533 #endif /* !SOFTFLOAT_FOR_GCC */
1536 -------------------------------------------------------------------------------
1537 Returns the result of converting the single-precision floating-point value
1538 `a' to the double-precision floating-point format. The conversion is
1539 performed according to the IEC/IEEE Standard for Binary Floating-Point
1541 -------------------------------------------------------------------------------
1543 float64 float32_to_float64( float32 a )
1549 aSig = extractFloat32Frac( a );
1550 aExp = extractFloat32Exp( a );
1551 aSign = extractFloat32Sign( a );
1552 if ( aExp == 0xFF ) {
1553 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1554 return packFloat64( aSign, 0x7FF, 0 );
1557 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1558 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1561 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1568 -------------------------------------------------------------------------------
1569 Returns the result of converting the single-precision floating-point value
1570 `a' to the extended double-precision floating-point format. The conversion
1571 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1573 -------------------------------------------------------------------------------
1575 floatx80 float32_to_floatx80( float32 a )
1581 aSig = extractFloat32Frac( a );
1582 aExp = extractFloat32Exp( a );
1583 aSign = extractFloat32Sign( a );
1584 if ( aExp == 0xFF ) {
1585 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1586 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1589 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1590 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1593 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1602 -------------------------------------------------------------------------------
1603 Returns the result of converting the single-precision floating-point value
1604 `a' to the double-precision floating-point format. The conversion is
1605 performed according to the IEC/IEEE Standard for Binary Floating-Point
1607 -------------------------------------------------------------------------------
1609 float128 float32_to_float128( float32 a )
1615 aSig = extractFloat32Frac( a );
1616 aExp = extractFloat32Exp( a );
1617 aSign = extractFloat32Sign( a );
1618 if ( aExp == 0xFF ) {
1619 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1620 return packFloat128( aSign, 0x7FFF, 0, 0 );
1623 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1624 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1627 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1633 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1635 -------------------------------------------------------------------------------
1636 Rounds the single-precision floating-point value `a' to an integer, and
1637 returns the result as a single-precision floating-point value. The
1638 operation is performed according to the IEC/IEEE Standard for Binary
1639 Floating-Point Arithmetic.
1640 -------------------------------------------------------------------------------
1642 float32 float32_round_to_int( float32 a )
1646 bits32 lastBitMask, roundBitsMask;
1650 aExp = extractFloat32Exp( a );
1651 if ( 0x96 <= aExp ) {
1652 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1653 return propagateFloat32NaN( a, a );
1657 if ( aExp <= 0x7E ) {
1658 if ( (bits32) ( a<<1 ) == 0 ) return a;
1659 float_exception_flags |= float_flag_inexact;
1660 aSign = extractFloat32Sign( a );
1661 switch ( float_rounding_mode ) {
1662 case float_round_nearest_even:
1663 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1664 return packFloat32( aSign, 0x7F, 0 );
1667 case float_round_to_zero:
1669 case float_round_down:
1670 return aSign ? 0xBF800000 : 0;
1671 case float_round_up:
1672 return aSign ? 0x80000000 : 0x3F800000;
1674 return packFloat32( aSign, 0, 0 );
1677 lastBitMask <<= 0x96 - aExp;
1678 roundBitsMask = lastBitMask - 1;
1680 roundingMode = float_rounding_mode;
1681 if ( roundingMode == float_round_nearest_even ) {
1682 z += lastBitMask>>1;
1683 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1685 else if ( roundingMode != float_round_to_zero ) {
1686 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1690 z &= ~ roundBitsMask;
1691 if ( z != a ) float_exception_flags |= float_flag_inexact;
1695 #endif /* !SOFTFLOAT_FOR_GCC */
1698 -------------------------------------------------------------------------------
1699 Returns the result of adding the absolute values of the single-precision
1700 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1701 before being returned. `zSign' is ignored if the result is a NaN.
1702 The addition is performed according to the IEC/IEEE Standard for Binary
1703 Floating-Point Arithmetic.
1704 -------------------------------------------------------------------------------
1706 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1708 int16 aExp, bExp, zExp;
1709 bits32 aSig, bSig, zSig;
1712 aSig = extractFloat32Frac( a );
1713 aExp = extractFloat32Exp( a );
1714 bSig = extractFloat32Frac( b );
1715 bExp = extractFloat32Exp( b );
1716 expDiff = aExp - bExp;
1719 if ( 0 < expDiff ) {
1720 if ( aExp == 0xFF ) {
1721 if ( aSig ) return propagateFloat32NaN( a, b );
1730 shift32RightJamming( bSig, expDiff, &bSig );
1733 else if ( expDiff < 0 ) {
1734 if ( bExp == 0xFF ) {
1735 if ( bSig ) return propagateFloat32NaN( a, b );
1736 return packFloat32( zSign, 0xFF, 0 );
1744 shift32RightJamming( aSig, - expDiff, &aSig );
1748 if ( aExp == 0xFF ) {
1749 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1752 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1753 zSig = 0x40000000 + aSig + bSig;
1758 zSig = ( aSig + bSig )<<1;
1760 if ( (sbits32) zSig < 0 ) {
1765 return roundAndPackFloat32( zSign, zExp, zSig );
1770 -------------------------------------------------------------------------------
1771 Returns the result of subtracting the absolute values of the single-
1772 precision floating-point values `a' and `b'. If `zSign' is 1, the
1773 difference is negated before being returned. `zSign' is ignored if the
1774 result is a NaN. The subtraction is performed according to the IEC/IEEE
1775 Standard for Binary Floating-Point Arithmetic.
1776 -------------------------------------------------------------------------------
1778 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1780 int16 aExp, bExp, zExp;
1781 bits32 aSig, bSig, zSig;
1784 aSig = extractFloat32Frac( a );
1785 aExp = extractFloat32Exp( a );
1786 bSig = extractFloat32Frac( b );
1787 bExp = extractFloat32Exp( b );
1788 expDiff = aExp - bExp;
1791 if ( 0 < expDiff ) goto aExpBigger;
1792 if ( expDiff < 0 ) goto bExpBigger;
1793 if ( aExp == 0xFF ) {
1794 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1795 float_raise( float_flag_invalid );
1796 return float32_default_nan;
1802 if ( bSig < aSig ) goto aBigger;
1803 if ( aSig < bSig ) goto bBigger;
1804 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1806 if ( bExp == 0xFF ) {
1807 if ( bSig ) return propagateFloat32NaN( a, b );
1808 return packFloat32( zSign ^ 1, 0xFF, 0 );
1816 shift32RightJamming( aSig, - expDiff, &aSig );
1822 goto normalizeRoundAndPack;
1824 if ( aExp == 0xFF ) {
1825 if ( aSig ) return propagateFloat32NaN( a, b );
1834 shift32RightJamming( bSig, expDiff, &bSig );
1839 normalizeRoundAndPack:
1841 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1846 -------------------------------------------------------------------------------
1847 Returns the result of adding the single-precision floating-point values `a'
1848 and `b'. The operation is performed according to the IEC/IEEE Standard for
1849 Binary Floating-Point Arithmetic.
1850 -------------------------------------------------------------------------------
1852 float32 float32_add( float32 a, float32 b )
1856 aSign = extractFloat32Sign( a );
1857 bSign = extractFloat32Sign( b );
1858 if ( aSign == bSign ) {
1859 return addFloat32Sigs( a, b, aSign );
1862 return subFloat32Sigs( a, b, aSign );
1868 -------------------------------------------------------------------------------
1869 Returns the result of subtracting the single-precision floating-point values
1870 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1871 for Binary Floating-Point Arithmetic.
1872 -------------------------------------------------------------------------------
1874 float32 float32_sub( float32 a, float32 b )
1878 aSign = extractFloat32Sign( a );
1879 bSign = extractFloat32Sign( b );
1880 if ( aSign == bSign ) {
1881 return subFloat32Sigs( a, b, aSign );
1884 return addFloat32Sigs( a, b, aSign );
1890 -------------------------------------------------------------------------------
1891 Returns the result of multiplying the single-precision floating-point values
1892 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1893 for Binary Floating-Point Arithmetic.
1894 -------------------------------------------------------------------------------
1896 float32 float32_mul( float32 a, float32 b )
1898 flag aSign, bSign, zSign;
1899 int16 aExp, bExp, zExp;
1904 aSig = extractFloat32Frac( a );
1905 aExp = extractFloat32Exp( a );
1906 aSign = extractFloat32Sign( a );
1907 bSig = extractFloat32Frac( b );
1908 bExp = extractFloat32Exp( b );
1909 bSign = extractFloat32Sign( b );
1910 zSign = aSign ^ bSign;
1911 if ( aExp == 0xFF ) {
1912 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1913 return propagateFloat32NaN( a, b );
1915 if ( ( bExp | bSig ) == 0 ) {
1916 float_raise( float_flag_invalid );
1917 return float32_default_nan;
1919 return packFloat32( zSign, 0xFF, 0 );
1921 if ( bExp == 0xFF ) {
1922 if ( bSig ) return propagateFloat32NaN( a, b );
1923 if ( ( aExp | aSig ) == 0 ) {
1924 float_raise( float_flag_invalid );
1925 return float32_default_nan;
1927 return packFloat32( zSign, 0xFF, 0 );
1930 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1931 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1934 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1935 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1937 zExp = aExp + bExp - 0x7F;
1938 aSig = ( aSig | 0x00800000 )<<7;
1939 bSig = ( bSig | 0x00800000 )<<8;
1940 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1942 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1946 return roundAndPackFloat32( zSign, zExp, zSig );
1951 -------------------------------------------------------------------------------
1952 Returns the result of dividing the single-precision floating-point value `a'
1953 by the corresponding value `b'. The operation is performed according to the
1954 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1955 -------------------------------------------------------------------------------
1957 float32 float32_div( float32 a, float32 b )
1959 flag aSign, bSign, zSign;
1960 int16 aExp, bExp, zExp;
1961 bits32 aSig, bSig, zSig;
1963 aSig = extractFloat32Frac( a );
1964 aExp = extractFloat32Exp( a );
1965 aSign = extractFloat32Sign( a );
1966 bSig = extractFloat32Frac( b );
1967 bExp = extractFloat32Exp( b );
1968 bSign = extractFloat32Sign( b );
1969 zSign = aSign ^ bSign;
1970 if ( aExp == 0xFF ) {
1971 if ( aSig ) return propagateFloat32NaN( a, b );
1972 if ( bExp == 0xFF ) {
1973 if ( bSig ) return propagateFloat32NaN( a, b );
1974 float_raise( float_flag_invalid );
1975 return float32_default_nan;
1977 return packFloat32( zSign, 0xFF, 0 );
1979 if ( bExp == 0xFF ) {
1980 if ( bSig ) return propagateFloat32NaN( a, b );
1981 return packFloat32( zSign, 0, 0 );
1985 if ( ( aExp | aSig ) == 0 ) {
1986 float_raise( float_flag_invalid );
1987 return float32_default_nan;
1989 float_raise( float_flag_divbyzero );
1990 return packFloat32( zSign, 0xFF, 0 );
1992 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1995 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1996 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1998 zExp = aExp - bExp + 0x7D;
1999 aSig = ( aSig | 0x00800000 )<<7;
2000 bSig = ( bSig | 0x00800000 )<<8;
2001 if ( bSig <= ( aSig + aSig ) ) {
2005 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2006 if ( ( zSig & 0x3F ) == 0 ) {
2007 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2009 return roundAndPackFloat32( zSign, zExp, zSig );
2013 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2015 -------------------------------------------------------------------------------
2016 Returns the remainder of the single-precision floating-point value `a'
2017 with respect to the corresponding value `b'. The operation is performed
2018 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019 -------------------------------------------------------------------------------
2021 float32 float32_rem( float32 a, float32 b )
2023 flag aSign, bSign, zSign;
2024 int16 aExp, bExp, expDiff;
2027 bits64 aSig64, bSig64, q64;
2028 bits32 alternateASig;
2031 aSig = extractFloat32Frac( a );
2032 aExp = extractFloat32Exp( a );
2033 aSign = extractFloat32Sign( a );
2034 bSig = extractFloat32Frac( b );
2035 bExp = extractFloat32Exp( b );
2036 bSign = extractFloat32Sign( b );
2037 if ( aExp == 0xFF ) {
2038 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2039 return propagateFloat32NaN( a, b );
2041 float_raise( float_flag_invalid );
2042 return float32_default_nan;
2044 if ( bExp == 0xFF ) {
2045 if ( bSig ) return propagateFloat32NaN( a, b );
2050 float_raise( float_flag_invalid );
2051 return float32_default_nan;
2053 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2056 if ( aSig == 0 ) return a;
2057 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2059 expDiff = aExp - bExp;
2062 if ( expDiff < 32 ) {
2065 if ( expDiff < 0 ) {
2066 if ( expDiff < -1 ) return a;
2069 q = ( bSig <= aSig );
2070 if ( q ) aSig -= bSig;
2071 if ( 0 < expDiff ) {
2072 q = ( ( (bits64) aSig )<<32 ) / bSig;
2075 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2083 if ( bSig <= aSig ) aSig -= bSig;
2084 aSig64 = ( (bits64) aSig )<<40;
2085 bSig64 = ( (bits64) bSig )<<40;
2087 while ( 0 < expDiff ) {
2088 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2089 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2090 aSig64 = - ( ( bSig * q64 )<<38 );
2094 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2095 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2096 q = q64>>( 64 - expDiff );
2098 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2101 alternateASig = aSig;
2104 } while ( 0 <= (sbits32) aSig );
2105 sigMean = aSig + alternateASig;
2106 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2107 aSig = alternateASig;
2109 zSign = ( (sbits32) aSig < 0 );
2110 if ( zSign ) aSig = - aSig;
2111 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2114 #endif /* !SOFTFLOAT_FOR_GCC */
2116 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2118 -------------------------------------------------------------------------------
2119 Returns the square root of the single-precision floating-point value `a'.
2120 The operation is performed according to the IEC/IEEE Standard for Binary
2121 Floating-Point Arithmetic.
2122 -------------------------------------------------------------------------------
2124 float32 float32_sqrt( float32 a )
2131 aSig = extractFloat32Frac( a );
2132 aExp = extractFloat32Exp( a );
2133 aSign = extractFloat32Sign( a );
2134 if ( aExp == 0xFF ) {
2135 if ( aSig ) return propagateFloat32NaN( a, 0 );
2136 if ( ! aSign ) return a;
2137 float_raise( float_flag_invalid );
2138 return float32_default_nan;
2141 if ( ( aExp | aSig ) == 0 ) return a;
2142 float_raise( float_flag_invalid );
2143 return float32_default_nan;
2146 if ( aSig == 0 ) return 0;
2147 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2149 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2150 aSig = ( aSig | 0x00800000 )<<8;
2151 zSig = estimateSqrt32( aExp, aSig ) + 2;
2152 if ( ( zSig & 0x7F ) <= 5 ) {
2158 term = ( (bits64) zSig ) * zSig;
2159 rem = ( ( (bits64) aSig )<<32 ) - term;
2160 while ( (sbits64) rem < 0 ) {
2162 rem += ( ( (bits64) zSig )<<1 ) | 1;
2164 zSig |= ( rem != 0 );
2166 shift32RightJamming( zSig, 1, &zSig );
2168 return roundAndPackFloat32( 0, zExp, zSig );
2171 #endif /* !SOFTFLOAT_FOR_GCC */
2174 -------------------------------------------------------------------------------
2175 Returns 1 if the single-precision floating-point value `a' is equal to
2176 the corresponding value `b', and 0 otherwise. The comparison is performed
2177 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2178 -------------------------------------------------------------------------------
2180 flag float32_eq( float32 a, float32 b )
2183 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2184 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2186 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2187 float_raise( float_flag_invalid );
2191 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2196 -------------------------------------------------------------------------------
2197 Returns 1 if the single-precision floating-point value `a' is less than
2198 or equal to the corresponding value `b', and 0 otherwise. The comparison
2199 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2201 -------------------------------------------------------------------------------
2203 flag float32_le( float32 a, float32 b )
2207 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2208 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2210 float_raise( float_flag_invalid );
2213 aSign = extractFloat32Sign( a );
2214 bSign = extractFloat32Sign( b );
2215 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2216 return ( a == b ) || ( aSign ^ ( a < b ) );
2221 -------------------------------------------------------------------------------
2222 Returns 1 if the single-precision floating-point value `a' is less than
2223 the corresponding value `b', and 0 otherwise. The comparison is performed
2224 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2225 -------------------------------------------------------------------------------
2227 flag float32_lt( float32 a, float32 b )
2231 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2232 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2234 float_raise( float_flag_invalid );
2237 aSign = extractFloat32Sign( a );
2238 bSign = extractFloat32Sign( b );
2239 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2240 return ( a != b ) && ( aSign ^ ( a < b ) );
2244 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2246 -------------------------------------------------------------------------------
2247 Returns 1 if the single-precision floating-point value `a' is equal to
2248 the corresponding value `b', and 0 otherwise. The invalid exception is
2249 raised if either operand is a NaN. Otherwise, the comparison is performed
2250 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2251 -------------------------------------------------------------------------------
2253 flag float32_eq_signaling( float32 a, float32 b )
2256 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2257 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2259 float_raise( float_flag_invalid );
2262 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2267 -------------------------------------------------------------------------------
2268 Returns 1 if the single-precision floating-point value `a' is less than or
2269 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2270 cause an exception. Otherwise, the comparison is performed according to the
2271 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2272 -------------------------------------------------------------------------------
2274 flag float32_le_quiet( float32 a, float32 b )
2278 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2279 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2281 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2282 float_raise( float_flag_invalid );
2286 aSign = extractFloat32Sign( a );
2287 bSign = extractFloat32Sign( b );
2288 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2289 return ( a == b ) || ( aSign ^ ( a < b ) );
2294 -------------------------------------------------------------------------------
2295 Returns 1 if the single-precision floating-point value `a' is less than
2296 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2297 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2298 Standard for Binary Floating-Point Arithmetic.
2299 -------------------------------------------------------------------------------
2301 flag float32_lt_quiet( float32 a, float32 b )
2305 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2306 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2308 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2309 float_raise( float_flag_invalid );
2313 aSign = extractFloat32Sign( a );
2314 bSign = extractFloat32Sign( b );
2315 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2316 return ( a != b ) && ( aSign ^ ( a < b ) );
2319 #endif /* !SOFTFLOAT_FOR_GCC */
2321 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2323 -------------------------------------------------------------------------------
2324 Returns the result of converting the double-precision floating-point value
2325 `a' to the 32-bit two's complement integer format. The conversion is
2326 performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 Arithmetic---which means in particular that the conversion is rounded
2328 according to the current rounding mode. If `a' is a NaN, the largest
2329 positive integer is returned. Otherwise, if the conversion overflows, the
2330 largest integer with the same sign as `a' is returned.
2331 -------------------------------------------------------------------------------
2333 int32 float64_to_int32( float64 a )
2336 int16 aExp, shiftCount;
2339 aSig = extractFloat64Frac( a );
2340 aExp = extractFloat64Exp( a );
2341 aSign = extractFloat64Sign( a );
2342 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2343 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2344 shiftCount = 0x42C - aExp;
2345 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2346 return roundAndPackInt32( aSign, aSig );
2349 #endif /* !SOFTFLOAT_FOR_GCC */
2352 -------------------------------------------------------------------------------
2353 Returns the result of converting the double-precision floating-point value
2354 `a' to the 32-bit two's complement integer format. The conversion is
2355 performed according to the IEC/IEEE Standard for Binary Floating-Point
2356 Arithmetic, except that the conversion is always rounded toward zero.
2357 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2358 the conversion overflows, the largest integer with the same sign as `a' is
2360 -------------------------------------------------------------------------------
2362 int32 float64_to_int32_round_to_zero( float64 a )
2365 int16 aExp, shiftCount;
2366 bits64 aSig, savedASig;
2369 aSig = extractFloat64Frac( a );
2370 aExp = extractFloat64Exp( a );
2371 aSign = extractFloat64Sign( a );
2372 if ( 0x41E < aExp ) {
2373 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2376 else if ( aExp < 0x3FF ) {
2377 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2380 aSig |= LIT64( 0x0010000000000000 );
2381 shiftCount = 0x433 - aExp;
2383 aSig >>= shiftCount;
2385 if ( aSign ) z = - z;
2386 if ( ( z < 0 ) ^ aSign ) {
2388 float_raise( float_flag_invalid );
2389 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2391 if ( ( aSig<<shiftCount ) != savedASig ) {
2392 float_exception_flags |= float_flag_inexact;
2398 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2400 -------------------------------------------------------------------------------
2401 Returns the result of converting the double-precision floating-point value
2402 `a' to the 64-bit two's complement integer format. The conversion is
2403 performed according to the IEC/IEEE Standard for Binary Floating-Point
2404 Arithmetic---which means in particular that the conversion is rounded
2405 according to the current rounding mode. If `a' is a NaN, the largest
2406 positive integer is returned. Otherwise, if the conversion overflows, the
2407 largest integer with the same sign as `a' is returned.
2408 -------------------------------------------------------------------------------
2410 int64 float64_to_int64( float64 a )
2413 int16 aExp, shiftCount;
2414 bits64 aSig, aSigExtra;
2416 aSig = extractFloat64Frac( a );
2417 aExp = extractFloat64Exp( a );
2418 aSign = extractFloat64Sign( a );
2419 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2420 shiftCount = 0x433 - aExp;
2421 if ( shiftCount <= 0 ) {
2422 if ( 0x43E < aExp ) {
2423 float_raise( float_flag_invalid );
2425 || ( ( aExp == 0x7FF )
2426 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2428 return LIT64( 0x7FFFFFFFFFFFFFFF );
2430 return (sbits64) LIT64( 0x8000000000000000 );
2433 aSig <<= - shiftCount;
2436 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2438 return roundAndPackInt64( aSign, aSig, aSigExtra );
2443 -------------------------------------------------------------------------------
2444 Returns the result of converting the double-precision floating-point value
2445 `a' to the 64-bit two's complement integer format. The conversion is
2446 performed according to the IEC/IEEE Standard for Binary Floating-Point
2447 Arithmetic, except that the conversion is always rounded toward zero.
2448 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2449 the conversion overflows, the largest integer with the same sign as `a' is
2451 -------------------------------------------------------------------------------
2453 int64 float64_to_int64_round_to_zero( float64 a )
2456 int16 aExp, shiftCount;
2460 aSig = extractFloat64Frac( a );
2461 aExp = extractFloat64Exp( a );
2462 aSign = extractFloat64Sign( a );
2463 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2464 shiftCount = aExp - 0x433;
2465 if ( 0 <= shiftCount ) {
2466 if ( 0x43E <= aExp ) {
2467 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2468 float_raise( float_flag_invalid );
2470 || ( ( aExp == 0x7FF )
2471 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2473 return LIT64( 0x7FFFFFFFFFFFFFFF );
2476 return (sbits64) LIT64( 0x8000000000000000 );
2478 z = aSig<<shiftCount;
2481 if ( aExp < 0x3FE ) {
2482 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2485 z = aSig>>( - shiftCount );
2486 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2487 float_exception_flags |= float_flag_inexact;
2490 if ( aSign ) z = - z;
2494 #endif /* !SOFTFLOAT_FOR_GCC */
2497 -------------------------------------------------------------------------------
2498 Returns the result of converting the double-precision floating-point value
2499 `a' to the single-precision floating-point format. The conversion is
2500 performed according to the IEC/IEEE Standard for Binary Floating-Point
2502 -------------------------------------------------------------------------------
2504 float32 float64_to_float32( float64 a )
2511 aSig = extractFloat64Frac( a );
2512 aExp = extractFloat64Exp( a );
2513 aSign = extractFloat64Sign( a );
2514 if ( aExp == 0x7FF ) {
2515 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2516 return packFloat32( aSign, 0xFF, 0 );
2518 shift64RightJamming( aSig, 22, &aSig );
2520 if ( aExp || zSig ) {
2524 return roundAndPackFloat32( aSign, aExp, zSig );
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the double-precision floating-point value
2533 `a' to the extended double-precision floating-point format. The conversion
2534 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2536 -------------------------------------------------------------------------------
2538 floatx80 float64_to_floatx80( float64 a )
2544 aSig = extractFloat64Frac( a );
2545 aExp = extractFloat64Exp( a );
2546 aSign = extractFloat64Sign( a );
2547 if ( aExp == 0x7FF ) {
2548 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2549 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2552 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2553 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2557 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2566 -------------------------------------------------------------------------------
2567 Returns the result of converting the double-precision floating-point value
2568 `a' to the quadruple-precision floating-point format. The conversion is
2569 performed according to the IEC/IEEE Standard for Binary Floating-Point
2571 -------------------------------------------------------------------------------
2573 float128 float64_to_float128( float64 a )
2577 bits64 aSig, zSig0, zSig1;
2579 aSig = extractFloat64Frac( a );
2580 aExp = extractFloat64Exp( a );
2581 aSign = extractFloat64Sign( a );
2582 if ( aExp == 0x7FF ) {
2583 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2584 return packFloat128( aSign, 0x7FFF, 0, 0 );
2587 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2588 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2591 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2592 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2598 #ifndef SOFTFLOAT_FOR_GCC
2600 -------------------------------------------------------------------------------
2601 Rounds the double-precision floating-point value `a' to an integer, and
2602 returns the result as a double-precision floating-point value. The
2603 operation is performed according to the IEC/IEEE Standard for Binary
2604 Floating-Point Arithmetic.
2605 -------------------------------------------------------------------------------
2607 float64 float64_round_to_int( float64 a )
2611 bits64 lastBitMask, roundBitsMask;
2615 aExp = extractFloat64Exp( a );
2616 if ( 0x433 <= aExp ) {
2617 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2618 return propagateFloat64NaN( a, a );
2622 if ( aExp < 0x3FF ) {
2623 if ( (bits64) ( a<<1 ) == 0 ) return a;
2624 float_exception_flags |= float_flag_inexact;
2625 aSign = extractFloat64Sign( a );
2626 switch ( float_rounding_mode ) {
2627 case float_round_nearest_even:
2628 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2629 return packFloat64( aSign, 0x3FF, 0 );
2632 case float_round_to_zero:
2634 case float_round_down:
2635 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2636 case float_round_up:
2638 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2640 return packFloat64( aSign, 0, 0 );
2643 lastBitMask <<= 0x433 - aExp;
2644 roundBitsMask = lastBitMask - 1;
2646 roundingMode = float_rounding_mode;
2647 if ( roundingMode == float_round_nearest_even ) {
2648 z += lastBitMask>>1;
2649 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2651 else if ( roundingMode != float_round_to_zero ) {
2652 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2656 z &= ~ roundBitsMask;
2657 if ( z != a ) float_exception_flags |= float_flag_inexact;
2664 -------------------------------------------------------------------------------
2665 Returns the result of adding the absolute values of the double-precision
2666 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2667 before being returned. `zSign' is ignored if the result is a NaN.
2668 The addition is performed according to the IEC/IEEE Standard for Binary
2669 Floating-Point Arithmetic.
2670 -------------------------------------------------------------------------------
2672 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2674 int16 aExp, bExp, zExp;
2675 bits64 aSig, bSig, zSig;
2678 aSig = extractFloat64Frac( a );
2679 aExp = extractFloat64Exp( a );
2680 bSig = extractFloat64Frac( b );
2681 bExp = extractFloat64Exp( b );
2682 expDiff = aExp - bExp;
2685 if ( 0 < expDiff ) {
2686 if ( aExp == 0x7FF ) {
2687 if ( aSig ) return propagateFloat64NaN( a, b );
2694 bSig |= LIT64( 0x2000000000000000 );
2696 shift64RightJamming( bSig, expDiff, &bSig );
2699 else if ( expDiff < 0 ) {
2700 if ( bExp == 0x7FF ) {
2701 if ( bSig ) return propagateFloat64NaN( a, b );
2702 return packFloat64( zSign, 0x7FF, 0 );
2708 aSig |= LIT64( 0x2000000000000000 );
2710 shift64RightJamming( aSig, - expDiff, &aSig );
2714 if ( aExp == 0x7FF ) {
2715 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2718 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2719 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2723 aSig |= LIT64( 0x2000000000000000 );
2724 zSig = ( aSig + bSig )<<1;
2726 if ( (sbits64) zSig < 0 ) {
2731 return roundAndPackFloat64( zSign, zExp, zSig );
2736 -------------------------------------------------------------------------------
2737 Returns the result of subtracting the absolute values of the double-
2738 precision floating-point values `a' and `b'. If `zSign' is 1, the
2739 difference is negated before being returned. `zSign' is ignored if the
2740 result is a NaN. The subtraction is performed according to the IEC/IEEE
2741 Standard for Binary Floating-Point Arithmetic.
2742 -------------------------------------------------------------------------------
2744 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2746 int16 aExp, bExp, zExp;
2747 bits64 aSig, bSig, zSig;
2750 aSig = extractFloat64Frac( a );
2751 aExp = extractFloat64Exp( a );
2752 bSig = extractFloat64Frac( b );
2753 bExp = extractFloat64Exp( b );
2754 expDiff = aExp - bExp;
2757 if ( 0 < expDiff ) goto aExpBigger;
2758 if ( expDiff < 0 ) goto bExpBigger;
2759 if ( aExp == 0x7FF ) {
2760 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2761 float_raise( float_flag_invalid );
2762 return float64_default_nan;
2768 if ( bSig < aSig ) goto aBigger;
2769 if ( aSig < bSig ) goto bBigger;
2770 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2772 if ( bExp == 0x7FF ) {
2773 if ( bSig ) return propagateFloat64NaN( a, b );
2774 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2780 aSig |= LIT64( 0x4000000000000000 );
2782 shift64RightJamming( aSig, - expDiff, &aSig );
2783 bSig |= LIT64( 0x4000000000000000 );
2788 goto normalizeRoundAndPack;
2790 if ( aExp == 0x7FF ) {
2791 if ( aSig ) return propagateFloat64NaN( a, b );
2798 bSig |= LIT64( 0x4000000000000000 );
2800 shift64RightJamming( bSig, expDiff, &bSig );
2801 aSig |= LIT64( 0x4000000000000000 );
2805 normalizeRoundAndPack:
2807 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2812 -------------------------------------------------------------------------------
2813 Returns the result of adding the double-precision floating-point values `a'
2814 and `b'. The operation is performed according to the IEC/IEEE Standard for
2815 Binary Floating-Point Arithmetic.
2816 -------------------------------------------------------------------------------
2818 float64 float64_add( float64 a, float64 b )
2822 aSign = extractFloat64Sign( a );
2823 bSign = extractFloat64Sign( b );
2824 if ( aSign == bSign ) {
2825 return addFloat64Sigs( a, b, aSign );
2828 return subFloat64Sigs( a, b, aSign );
2834 -------------------------------------------------------------------------------
2835 Returns the result of subtracting the double-precision floating-point values
2836 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2837 for Binary Floating-Point Arithmetic.
2838 -------------------------------------------------------------------------------
2840 float64 float64_sub( float64 a, float64 b )
2844 aSign = extractFloat64Sign( a );
2845 bSign = extractFloat64Sign( b );
2846 if ( aSign == bSign ) {
2847 return subFloat64Sigs( a, b, aSign );
2850 return addFloat64Sigs( a, b, aSign );
2856 -------------------------------------------------------------------------------
2857 Returns the result of multiplying the double-precision floating-point values
2858 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2859 for Binary Floating-Point Arithmetic.
2860 -------------------------------------------------------------------------------
2862 float64 float64_mul( float64 a, float64 b )
2864 flag aSign, bSign, zSign;
2865 int16 aExp, bExp, zExp;
2866 bits64 aSig, bSig, zSig0, zSig1;
2868 aSig = extractFloat64Frac( a );
2869 aExp = extractFloat64Exp( a );
2870 aSign = extractFloat64Sign( a );
2871 bSig = extractFloat64Frac( b );
2872 bExp = extractFloat64Exp( b );
2873 bSign = extractFloat64Sign( b );
2874 zSign = aSign ^ bSign;
2875 if ( aExp == 0x7FF ) {
2876 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2877 return propagateFloat64NaN( a, b );
2879 if ( ( bExp | bSig ) == 0 ) {
2880 float_raise( float_flag_invalid );
2881 return float64_default_nan;
2883 return packFloat64( zSign, 0x7FF, 0 );
2885 if ( bExp == 0x7FF ) {
2886 if ( bSig ) return propagateFloat64NaN( a, b );
2887 if ( ( aExp | aSig ) == 0 ) {
2888 float_raise( float_flag_invalid );
2889 return float64_default_nan;
2891 return packFloat64( zSign, 0x7FF, 0 );
2894 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2895 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2898 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2899 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2901 zExp = aExp + bExp - 0x3FF;
2902 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2903 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2904 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2905 zSig0 |= ( zSig1 != 0 );
2906 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2910 return roundAndPackFloat64( zSign, zExp, zSig0 );
2915 -------------------------------------------------------------------------------
2916 Returns the result of dividing the double-precision floating-point value `a'
2917 by the corresponding value `b'. The operation is performed according to
2918 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2919 -------------------------------------------------------------------------------
2921 float64 float64_div( float64 a, float64 b )
2923 flag aSign, bSign, zSign;
2924 int16 aExp, bExp, zExp;
2925 bits64 aSig, bSig, zSig;
2927 bits64 term0, term1;
2929 aSig = extractFloat64Frac( a );
2930 aExp = extractFloat64Exp( a );
2931 aSign = extractFloat64Sign( a );
2932 bSig = extractFloat64Frac( b );
2933 bExp = extractFloat64Exp( b );
2934 bSign = extractFloat64Sign( b );
2935 zSign = aSign ^ bSign;
2936 if ( aExp == 0x7FF ) {
2937 if ( aSig ) return propagateFloat64NaN( a, b );
2938 if ( bExp == 0x7FF ) {
2939 if ( bSig ) return propagateFloat64NaN( a, b );
2940 float_raise( float_flag_invalid );
2941 return float64_default_nan;
2943 return packFloat64( zSign, 0x7FF, 0 );
2945 if ( bExp == 0x7FF ) {
2946 if ( bSig ) return propagateFloat64NaN( a, b );
2947 return packFloat64( zSign, 0, 0 );
2951 if ( ( aExp | aSig ) == 0 ) {
2952 float_raise( float_flag_invalid );
2953 return float64_default_nan;
2955 float_raise( float_flag_divbyzero );
2956 return packFloat64( zSign, 0x7FF, 0 );
2958 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2961 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2962 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2964 zExp = aExp - bExp + 0x3FD;
2965 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2966 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2967 if ( bSig <= ( aSig + aSig ) ) {
2971 zSig = estimateDiv128To64( aSig, 0, bSig );
2972 if ( ( zSig & 0x1FF ) <= 2 ) {
2973 mul64To128( bSig, zSig, &term0, &term1 );
2974 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2975 while ( (sbits64) rem0 < 0 ) {
2977 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2979 zSig |= ( rem1 != 0 );
2981 return roundAndPackFloat64( zSign, zExp, zSig );
2985 #ifndef SOFTFLOAT_FOR_GCC
2987 -------------------------------------------------------------------------------
2988 Returns the remainder of the double-precision floating-point value `a'
2989 with respect to the corresponding value `b'. The operation is performed
2990 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2991 -------------------------------------------------------------------------------
2993 float64 float64_rem( float64 a, float64 b )
2995 flag aSign, bSign, zSign;
2996 int16 aExp, bExp, expDiff;
2998 bits64 q, alternateASig;
3001 aSig = extractFloat64Frac( a );
3002 aExp = extractFloat64Exp( a );
3003 aSign = extractFloat64Sign( a );
3004 bSig = extractFloat64Frac( b );
3005 bExp = extractFloat64Exp( b );
3006 bSign = extractFloat64Sign( b );
3007 if ( aExp == 0x7FF ) {
3008 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3009 return propagateFloat64NaN( a, b );
3011 float_raise( float_flag_invalid );
3012 return float64_default_nan;
3014 if ( bExp == 0x7FF ) {
3015 if ( bSig ) return propagateFloat64NaN( a, b );
3020 float_raise( float_flag_invalid );
3021 return float64_default_nan;
3023 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3026 if ( aSig == 0 ) return a;
3027 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3029 expDiff = aExp - bExp;
3030 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3031 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3032 if ( expDiff < 0 ) {
3033 if ( expDiff < -1 ) return a;
3036 q = ( bSig <= aSig );
3037 if ( q ) aSig -= bSig;
3039 while ( 0 < expDiff ) {
3040 q = estimateDiv128To64( aSig, 0, bSig );
3041 q = ( 2 < q ) ? q - 2 : 0;
3042 aSig = - ( ( bSig>>2 ) * q );
3046 if ( 0 < expDiff ) {
3047 q = estimateDiv128To64( aSig, 0, bSig );
3048 q = ( 2 < q ) ? q - 2 : 0;
3051 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3058 alternateASig = aSig;
3061 } while ( 0 <= (sbits64) aSig );
3062 sigMean = aSig + alternateASig;
3063 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3064 aSig = alternateASig;
3066 zSign = ( (sbits64) aSig < 0 );
3067 if ( zSign ) aSig = - aSig;
3068 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3073 -------------------------------------------------------------------------------
3074 Returns the square root of the double-precision floating-point value `a'.
3075 The operation is performed according to the IEC/IEEE Standard for Binary
3076 Floating-Point Arithmetic.
3077 -------------------------------------------------------------------------------
3079 float64 float64_sqrt( float64 a )
3083 bits64 aSig, zSig, doubleZSig;
3084 bits64 rem0, rem1, term0, term1;
3086 aSig = extractFloat64Frac( a );
3087 aExp = extractFloat64Exp( a );
3088 aSign = extractFloat64Sign( a );
3089 if ( aExp == 0x7FF ) {
3090 if ( aSig ) return propagateFloat64NaN( a, a );
3091 if ( ! aSign ) return a;
3092 float_raise( float_flag_invalid );
3093 return float64_default_nan;
3096 if ( ( aExp | aSig ) == 0 ) return a;
3097 float_raise( float_flag_invalid );
3098 return float64_default_nan;
3101 if ( aSig == 0 ) return 0;
3102 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3104 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3105 aSig |= LIT64( 0x0010000000000000 );
3106 zSig = estimateSqrt32( aExp, aSig>>21 );
3107 aSig <<= 9 - ( aExp & 1 );
3108 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3109 if ( ( zSig & 0x1FF ) <= 5 ) {
3110 doubleZSig = zSig<<1;
3111 mul64To128( zSig, zSig, &term0, &term1 );
3112 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3113 while ( (sbits64) rem0 < 0 ) {
3116 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3118 zSig |= ( ( rem0 | rem1 ) != 0 );
3120 return roundAndPackFloat64( 0, zExp, zSig );
3126 -------------------------------------------------------------------------------
3127 Returns 1 if the double-precision floating-point value `a' is equal to the
3128 corresponding value `b', and 0 otherwise. The comparison is performed
3129 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3130 -------------------------------------------------------------------------------
3132 flag float64_eq( float64 a, float64 b )
3135 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3136 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3138 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3139 float_raise( float_flag_invalid );
3143 return ( a == b ) ||
3144 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3149 -------------------------------------------------------------------------------
3150 Returns 1 if the double-precision floating-point value `a' is less than or
3151 equal to the corresponding value `b', and 0 otherwise. The comparison is
3152 performed according to the IEC/IEEE Standard for Binary Floating-Point
3154 -------------------------------------------------------------------------------
3156 flag float64_le( float64 a, float64 b )
3160 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3161 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3163 float_raise( float_flag_invalid );
3166 aSign = extractFloat64Sign( a );
3167 bSign = extractFloat64Sign( b );
3168 if ( aSign != bSign )
3170 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3172 return ( a == b ) ||
3173 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3178 -------------------------------------------------------------------------------
3179 Returns 1 if the double-precision floating-point value `a' is less than
3180 the corresponding value `b', and 0 otherwise. The comparison is performed
3181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3182 -------------------------------------------------------------------------------
3184 flag float64_lt( float64 a, float64 b )
3188 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3189 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3191 float_raise( float_flag_invalid );
3194 aSign = extractFloat64Sign( a );
3195 bSign = extractFloat64Sign( b );
3196 if ( aSign != bSign )
3198 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3200 return ( a != b ) &&
3201 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3205 #ifndef SOFTFLOAT_FOR_GCC
3207 -------------------------------------------------------------------------------
3208 Returns 1 if the double-precision floating-point value `a' is equal to the
3209 corresponding value `b', and 0 otherwise. The invalid exception is raised
3210 if either operand is a NaN. Otherwise, the comparison is performed
3211 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3212 -------------------------------------------------------------------------------
3214 flag float64_eq_signaling( float64 a, float64 b )
3217 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3218 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3220 float_raise( float_flag_invalid );
3223 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3228 -------------------------------------------------------------------------------
3229 Returns 1 if the double-precision floating-point value `a' is less than or
3230 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3231 cause an exception. Otherwise, the comparison is performed according to the
3232 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3233 -------------------------------------------------------------------------------
3235 flag float64_le_quiet( float64 a, float64 b )
3239 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3240 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3242 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3243 float_raise( float_flag_invalid );
3247 aSign = extractFloat64Sign( a );
3248 bSign = extractFloat64Sign( b );
3249 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3250 return ( a == b ) || ( aSign ^ ( a < b ) );
3255 -------------------------------------------------------------------------------
3256 Returns 1 if the double-precision floating-point value `a' is less than
3257 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3258 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3259 Standard for Binary Floating-Point Arithmetic.
3260 -------------------------------------------------------------------------------
3262 flag float64_lt_quiet( float64 a, float64 b )
3266 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3267 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3269 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3270 float_raise( float_flag_invalid );
3274 aSign = extractFloat64Sign( a );
3275 bSign = extractFloat64Sign( b );
3276 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3277 return ( a != b ) && ( aSign ^ ( a < b ) );
3285 -------------------------------------------------------------------------------
3286 Returns the result of converting the extended double-precision floating-
3287 point value `a' to the 32-bit two's complement integer format. The
3288 conversion is performed according to the IEC/IEEE Standard for Binary
3289 Floating-Point Arithmetic---which means in particular that the conversion
3290 is rounded according to the current rounding mode. If `a' is a NaN, the
3291 largest positive integer is returned. Otherwise, if the conversion
3292 overflows, the largest integer with the same sign as `a' is returned.
3293 -------------------------------------------------------------------------------
3295 int32 floatx80_to_int32( floatx80 a )
3298 int32 aExp, shiftCount;
3301 aSig = extractFloatx80Frac( a );
3302 aExp = extractFloatx80Exp( a );
3303 aSign = extractFloatx80Sign( a );
3304 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3305 shiftCount = 0x4037 - aExp;
3306 if ( shiftCount <= 0 ) shiftCount = 1;
3307 shift64RightJamming( aSig, shiftCount, &aSig );
3308 return roundAndPackInt32( aSign, aSig );
3313 -------------------------------------------------------------------------------
3314 Returns the result of converting the extended double-precision floating-
3315 point value `a' to the 32-bit two's complement integer format. The
3316 conversion is performed according to the IEC/IEEE Standard for Binary
3317 Floating-Point Arithmetic, except that the conversion is always rounded
3318 toward zero. If `a' is a NaN, the largest positive integer is returned.
3319 Otherwise, if the conversion overflows, the largest integer with the same
3320 sign as `a' is returned.
3321 -------------------------------------------------------------------------------
3323 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3326 int32 aExp, shiftCount;
3327 bits64 aSig, savedASig;
3330 aSig = extractFloatx80Frac( a );
3331 aExp = extractFloatx80Exp( a );
3332 aSign = extractFloatx80Sign( a );
3333 if ( 0x401E < aExp ) {
3334 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3337 else if ( aExp < 0x3FFF ) {
3338 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3341 shiftCount = 0x403E - aExp;
3343 aSig >>= shiftCount;
3345 if ( aSign ) z = - z;
3346 if ( ( z < 0 ) ^ aSign ) {
3348 float_raise( float_flag_invalid );
3349 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3351 if ( ( aSig<<shiftCount ) != savedASig ) {
3352 float_exception_flags |= float_flag_inexact;
3359 -------------------------------------------------------------------------------
3360 Returns the result of converting the extended double-precision floating-
3361 point value `a' to the 64-bit two's complement integer format. The
3362 conversion is performed according to the IEC/IEEE Standard for Binary
3363 Floating-Point Arithmetic---which means in particular that the conversion
3364 is rounded according to the current rounding mode. If `a' is a NaN,
3365 the largest positive integer is returned. Otherwise, if the conversion
3366 overflows, the largest integer with the same sign as `a' is returned.
3367 -------------------------------------------------------------------------------
3369 int64 floatx80_to_int64( floatx80 a )
3372 int32 aExp, shiftCount;
3373 bits64 aSig, aSigExtra;
3375 aSig = extractFloatx80Frac( a );
3376 aExp = extractFloatx80Exp( a );
3377 aSign = extractFloatx80Sign( a );
3378 shiftCount = 0x403E - aExp;
3379 if ( shiftCount <= 0 ) {
3381 float_raise( float_flag_invalid );
3383 || ( ( aExp == 0x7FFF )
3384 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3386 return LIT64( 0x7FFFFFFFFFFFFFFF );
3388 return (sbits64) LIT64( 0x8000000000000000 );
3393 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3395 return roundAndPackInt64( aSign, aSig, aSigExtra );
3400 -------------------------------------------------------------------------------
3401 Returns the result of converting the extended double-precision floating-
3402 point value `a' to the 64-bit two's complement integer format. The
3403 conversion is performed according to the IEC/IEEE Standard for Binary
3404 Floating-Point Arithmetic, except that the conversion is always rounded
3405 toward zero. If `a' is a NaN, the largest positive integer is returned.
3406 Otherwise, if the conversion overflows, the largest integer with the same
3407 sign as `a' is returned.
3408 -------------------------------------------------------------------------------
3410 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3413 int32 aExp, shiftCount;
3417 aSig = extractFloatx80Frac( a );
3418 aExp = extractFloatx80Exp( a );
3419 aSign = extractFloatx80Sign( a );
3420 shiftCount = aExp - 0x403E;
3421 if ( 0 <= shiftCount ) {
3422 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3423 if ( ( a.high != 0xC03E ) || aSig ) {
3424 float_raise( float_flag_invalid );
3425 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3426 return LIT64( 0x7FFFFFFFFFFFFFFF );
3429 return (sbits64) LIT64( 0x8000000000000000 );
3431 else if ( aExp < 0x3FFF ) {
3432 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3435 z = aSig>>( - shiftCount );
3436 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3437 float_exception_flags |= float_flag_inexact;
3439 if ( aSign ) z = - z;
3445 -------------------------------------------------------------------------------
3446 Returns the result of converting the extended double-precision floating-
3447 point value `a' to the single-precision floating-point format. The
3448 conversion is performed according to the IEC/IEEE Standard for Binary
3449 Floating-Point Arithmetic.
3450 -------------------------------------------------------------------------------
3452 float32 floatx80_to_float32( floatx80 a )
3458 aSig = extractFloatx80Frac( a );
3459 aExp = extractFloatx80Exp( a );
3460 aSign = extractFloatx80Sign( a );
3461 if ( aExp == 0x7FFF ) {
3462 if ( (bits64) ( aSig<<1 ) ) {
3463 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3465 return packFloat32( aSign, 0xFF, 0 );
3467 shift64RightJamming( aSig, 33, &aSig );
3468 if ( aExp || aSig ) aExp -= 0x3F81;
3469 return roundAndPackFloat32( aSign, aExp, aSig );
3474 -------------------------------------------------------------------------------
3475 Returns the result of converting the extended double-precision floating-
3476 point value `a' to the double-precision floating-point format. The
3477 conversion is performed according to the IEC/IEEE Standard for Binary
3478 Floating-Point Arithmetic.
3479 -------------------------------------------------------------------------------
3481 float64 floatx80_to_float64( floatx80 a )
3487 aSig = extractFloatx80Frac( a );
3488 aExp = extractFloatx80Exp( a );
3489 aSign = extractFloatx80Sign( a );
3490 if ( aExp == 0x7FFF ) {
3491 if ( (bits64) ( aSig<<1 ) ) {
3492 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3494 return packFloat64( aSign, 0x7FF, 0 );
3496 shift64RightJamming( aSig, 1, &zSig );
3497 if ( aExp || aSig ) aExp -= 0x3C01;
3498 return roundAndPackFloat64( aSign, aExp, zSig );
3505 -------------------------------------------------------------------------------
3506 Returns the result of converting the extended double-precision floating-
3507 point value `a' to the quadruple-precision floating-point format. The
3508 conversion is performed according to the IEC/IEEE Standard for Binary
3509 Floating-Point Arithmetic.
3510 -------------------------------------------------------------------------------
3512 float128 floatx80_to_float128( floatx80 a )
3516 bits64 aSig, zSig0, zSig1;
3518 aSig = extractFloatx80Frac( a );
3519 aExp = extractFloatx80Exp( a );
3520 aSign = extractFloatx80Sign( a );
3521 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3522 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3524 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3525 return packFloat128( aSign, aExp, zSig0, zSig1 );
3532 -------------------------------------------------------------------------------
3533 Rounds the extended double-precision floating-point value `a' to an integer,
3534 and returns the result as an extended quadruple-precision floating-point
3535 value. The operation is performed according to the IEC/IEEE Standard for
3536 Binary Floating-Point Arithmetic.
3537 -------------------------------------------------------------------------------
3539 floatx80 floatx80_round_to_int( floatx80 a )
3543 bits64 lastBitMask, roundBitsMask;
3547 aExp = extractFloatx80Exp( a );
3548 if ( 0x403E <= aExp ) {
3549 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3550 return propagateFloatx80NaN( a, a );
3554 if ( aExp < 0x3FFF ) {
3556 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3559 float_exception_flags |= float_flag_inexact;
3560 aSign = extractFloatx80Sign( a );
3561 switch ( float_rounding_mode ) {
3562 case float_round_nearest_even:
3563 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3566 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3569 case float_round_to_zero:
3571 case float_round_down:
3574 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3575 : packFloatx80( 0, 0, 0 );
3576 case float_round_up:
3578 aSign ? packFloatx80( 1, 0, 0 )
3579 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3581 return packFloatx80( aSign, 0, 0 );
3584 lastBitMask <<= 0x403E - aExp;
3585 roundBitsMask = lastBitMask - 1;
3587 roundingMode = float_rounding_mode;
3588 if ( roundingMode == float_round_nearest_even ) {
3589 z.low += lastBitMask>>1;
3590 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3592 else if ( roundingMode != float_round_to_zero ) {
3593 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3594 z.low += roundBitsMask;
3597 z.low &= ~ roundBitsMask;
3600 z.low = LIT64( 0x8000000000000000 );
3602 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3608 -------------------------------------------------------------------------------
3609 Returns the result of adding the absolute values of the extended double-
3610 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3611 negated before being returned. `zSign' is ignored if the result is a NaN.
3612 The addition is performed according to the IEC/IEEE Standard for Binary
3613 Floating-Point Arithmetic.
3614 -------------------------------------------------------------------------------
3616 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3618 int32 aExp, bExp, zExp;
3619 bits64 aSig, bSig, zSig0, zSig1;
3622 aSig = extractFloatx80Frac( a );
3623 aExp = extractFloatx80Exp( a );
3624 bSig = extractFloatx80Frac( b );
3625 bExp = extractFloatx80Exp( b );
3626 expDiff = aExp - bExp;
3627 if ( 0 < expDiff ) {
3628 if ( aExp == 0x7FFF ) {
3629 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3632 if ( bExp == 0 ) --expDiff;
3633 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3636 else if ( expDiff < 0 ) {
3637 if ( bExp == 0x7FFF ) {
3638 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3639 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3641 if ( aExp == 0 ) ++expDiff;
3642 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3646 if ( aExp == 0x7FFF ) {
3647 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3648 return propagateFloatx80NaN( a, b );
3653 zSig0 = aSig + bSig;
3655 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3661 zSig0 = aSig + bSig;
3662 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3664 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3665 zSig0 |= LIT64( 0x8000000000000000 );
3669 roundAndPackFloatx80(
3670 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3675 -------------------------------------------------------------------------------
3676 Returns the result of subtracting the absolute values of the extended
3677 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3678 difference is negated before being returned. `zSign' is ignored if the
3679 result is a NaN. The subtraction is performed according to the IEC/IEEE
3680 Standard for Binary Floating-Point Arithmetic.
3681 -------------------------------------------------------------------------------
3683 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3685 int32 aExp, bExp, zExp;
3686 bits64 aSig, bSig, zSig0, zSig1;
3690 aSig = extractFloatx80Frac( a );
3691 aExp = extractFloatx80Exp( a );
3692 bSig = extractFloatx80Frac( b );
3693 bExp = extractFloatx80Exp( b );
3694 expDiff = aExp - bExp;
3695 if ( 0 < expDiff ) goto aExpBigger;
3696 if ( expDiff < 0 ) goto bExpBigger;
3697 if ( aExp == 0x7FFF ) {
3698 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3699 return propagateFloatx80NaN( a, b );
3701 float_raise( float_flag_invalid );
3702 z.low = floatx80_default_nan_low;
3703 z.high = floatx80_default_nan_high;
3711 if ( bSig < aSig ) goto aBigger;
3712 if ( aSig < bSig ) goto bBigger;
3713 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3715 if ( bExp == 0x7FFF ) {
3716 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3717 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3719 if ( aExp == 0 ) ++expDiff;
3720 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3722 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3725 goto normalizeRoundAndPack;
3727 if ( aExp == 0x7FFF ) {
3728 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3731 if ( bExp == 0 ) --expDiff;
3732 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3734 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3736 normalizeRoundAndPack:
3738 normalizeRoundAndPackFloatx80(
3739 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3744 -------------------------------------------------------------------------------
3745 Returns the result of adding the extended double-precision floating-point
3746 values `a' and `b'. The operation is performed according to the IEC/IEEE
3747 Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3750 floatx80 floatx80_add( floatx80 a, floatx80 b )
3754 aSign = extractFloatx80Sign( a );
3755 bSign = extractFloatx80Sign( b );
3756 if ( aSign == bSign ) {
3757 return addFloatx80Sigs( a, b, aSign );
3760 return subFloatx80Sigs( a, b, aSign );
3766 -------------------------------------------------------------------------------
3767 Returns the result of subtracting the extended double-precision floating-
3768 point values `a' and `b'. The operation is performed according to the
3769 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3770 -------------------------------------------------------------------------------
3772 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3776 aSign = extractFloatx80Sign( a );
3777 bSign = extractFloatx80Sign( b );
3778 if ( aSign == bSign ) {
3779 return subFloatx80Sigs( a, b, aSign );
3782 return addFloatx80Sigs( a, b, aSign );
3788 -------------------------------------------------------------------------------
3789 Returns the result of multiplying the extended double-precision floating-
3790 point values `a' and `b'. The operation is performed according to the
3791 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3792 -------------------------------------------------------------------------------
3794 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3796 flag aSign, bSign, zSign;
3797 int32 aExp, bExp, zExp;
3798 bits64 aSig, bSig, zSig0, zSig1;
3801 aSig = extractFloatx80Frac( a );
3802 aExp = extractFloatx80Exp( a );
3803 aSign = extractFloatx80Sign( a );
3804 bSig = extractFloatx80Frac( b );
3805 bExp = extractFloatx80Exp( b );
3806 bSign = extractFloatx80Sign( b );
3807 zSign = aSign ^ bSign;
3808 if ( aExp == 0x7FFF ) {
3809 if ( (bits64) ( aSig<<1 )
3810 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3811 return propagateFloatx80NaN( a, b );
3813 if ( ( bExp | bSig ) == 0 ) goto invalid;
3814 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3816 if ( bExp == 0x7FFF ) {
3817 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3818 if ( ( aExp | aSig ) == 0 ) {
3820 float_raise( float_flag_invalid );
3821 z.low = floatx80_default_nan_low;
3822 z.high = floatx80_default_nan_high;
3825 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3828 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3829 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3832 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3833 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3835 zExp = aExp + bExp - 0x3FFE;
3836 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3837 if ( 0 < (sbits64) zSig0 ) {
3838 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3842 roundAndPackFloatx80(
3843 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3848 -------------------------------------------------------------------------------
3849 Returns the result of dividing the extended double-precision floating-point
3850 value `a' by the corresponding value `b'. The operation is performed
3851 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3852 -------------------------------------------------------------------------------
3854 floatx80 floatx80_div( floatx80 a, floatx80 b )
3856 flag aSign, bSign, zSign;
3857 int32 aExp, bExp, zExp;
3858 bits64 aSig, bSig, zSig0, zSig1;
3859 bits64 rem0, rem1, rem2, term0, term1, term2;
3862 aSig = extractFloatx80Frac( a );
3863 aExp = extractFloatx80Exp( a );
3864 aSign = extractFloatx80Sign( a );
3865 bSig = extractFloatx80Frac( b );
3866 bExp = extractFloatx80Exp( b );
3867 bSign = extractFloatx80Sign( b );
3868 zSign = aSign ^ bSign;
3869 if ( aExp == 0x7FFF ) {
3870 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3871 if ( bExp == 0x7FFF ) {
3872 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3875 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3877 if ( bExp == 0x7FFF ) {
3878 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3879 return packFloatx80( zSign, 0, 0 );
3883 if ( ( aExp | aSig ) == 0 ) {
3885 float_raise( float_flag_invalid );
3886 z.low = floatx80_default_nan_low;
3887 z.high = floatx80_default_nan_high;
3890 float_raise( float_flag_divbyzero );
3891 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3893 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3896 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3897 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3899 zExp = aExp - bExp + 0x3FFE;
3901 if ( bSig <= aSig ) {
3902 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3905 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3906 mul64To128( bSig, zSig0, &term0, &term1 );
3907 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3908 while ( (sbits64) rem0 < 0 ) {
3910 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3912 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3913 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3914 mul64To128( bSig, zSig1, &term1, &term2 );
3915 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3916 while ( (sbits64) rem1 < 0 ) {
3918 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3920 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3923 roundAndPackFloatx80(
3924 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3929 -------------------------------------------------------------------------------
3930 Returns the remainder of the extended double-precision floating-point value
3931 `a' with respect to the corresponding value `b'. The operation is performed
3932 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3933 -------------------------------------------------------------------------------
3935 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3937 flag aSign, bSign, zSign;
3938 int32 aExp, bExp, expDiff;
3939 bits64 aSig0, aSig1, bSig;
3940 bits64 q, term0, term1, alternateASig0, alternateASig1;
3943 aSig0 = extractFloatx80Frac( a );
3944 aExp = extractFloatx80Exp( a );
3945 aSign = extractFloatx80Sign( a );
3946 bSig = extractFloatx80Frac( b );
3947 bExp = extractFloatx80Exp( b );
3948 bSign = extractFloatx80Sign( b );
3949 if ( aExp == 0x7FFF ) {
3950 if ( (bits64) ( aSig0<<1 )
3951 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3952 return propagateFloatx80NaN( a, b );
3956 if ( bExp == 0x7FFF ) {
3957 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3963 float_raise( float_flag_invalid );
3964 z.low = floatx80_default_nan_low;
3965 z.high = floatx80_default_nan_high;
3968 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3971 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3972 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3974 bSig |= LIT64( 0x8000000000000000 );
3976 expDiff = aExp - bExp;
3978 if ( expDiff < 0 ) {
3979 if ( expDiff < -1 ) return a;
3980 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3983 q = ( bSig <= aSig0 );
3984 if ( q ) aSig0 -= bSig;
3986 while ( 0 < expDiff ) {
3987 q = estimateDiv128To64( aSig0, aSig1, bSig );
3988 q = ( 2 < q ) ? q - 2 : 0;
3989 mul64To128( bSig, q, &term0, &term1 );
3990 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3991 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3995 if ( 0 < expDiff ) {
3996 q = estimateDiv128To64( aSig0, aSig1, bSig );
3997 q = ( 2 < q ) ? q - 2 : 0;
3999 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4000 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4001 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4002 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4004 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4011 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4012 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4013 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4016 aSig0 = alternateASig0;
4017 aSig1 = alternateASig1;
4021 normalizeRoundAndPackFloatx80(
4022 80, zSign, bExp + expDiff, aSig0, aSig1 );
4027 -------------------------------------------------------------------------------
4028 Returns the square root of the extended double-precision floating-point
4029 value `a'. The operation is performed according to the IEC/IEEE Standard
4030 for Binary Floating-Point Arithmetic.
4031 -------------------------------------------------------------------------------
4033 floatx80 floatx80_sqrt( floatx80 a )
4037 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4038 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4041 aSig0 = extractFloatx80Frac( a );
4042 aExp = extractFloatx80Exp( a );
4043 aSign = extractFloatx80Sign( a );
4044 if ( aExp == 0x7FFF ) {
4045 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4046 if ( ! aSign ) return a;
4050 if ( ( aExp | aSig0 ) == 0 ) return a;
4052 float_raise( float_flag_invalid );
4053 z.low = floatx80_default_nan_low;
4054 z.high = floatx80_default_nan_high;
4058 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4059 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4061 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4062 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4063 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4064 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4065 doubleZSig0 = zSig0<<1;
4066 mul64To128( zSig0, zSig0, &term0, &term1 );
4067 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4068 while ( (sbits64) rem0 < 0 ) {
4071 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4073 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4074 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4075 if ( zSig1 == 0 ) zSig1 = 1;
4076 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4077 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4078 mul64To128( zSig1, zSig1, &term2, &term3 );
4079 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4080 while ( (sbits64) rem1 < 0 ) {
4082 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4084 term2 |= doubleZSig0;
4085 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4087 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4089 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4090 zSig0 |= doubleZSig0;
4092 roundAndPackFloatx80(
4093 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4098 -------------------------------------------------------------------------------
4099 Returns 1 if the extended double-precision floating-point value `a' is
4100 equal to the corresponding value `b', and 0 otherwise. The comparison is
4101 performed according to the IEC/IEEE Standard for Binary Floating-Point
4103 -------------------------------------------------------------------------------
4105 flag floatx80_eq( floatx80 a, floatx80 b )
4108 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4109 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4110 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4111 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4113 if ( floatx80_is_signaling_nan( a )
4114 || floatx80_is_signaling_nan( b ) ) {
4115 float_raise( float_flag_invalid );
4121 && ( ( a.high == b.high )
4123 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4129 -------------------------------------------------------------------------------
4130 Returns 1 if the extended double-precision floating-point value `a' is
4131 less than or equal to the corresponding value `b', and 0 otherwise. The
4132 comparison is performed according to the IEC/IEEE Standard for Binary
4133 Floating-Point Arithmetic.
4134 -------------------------------------------------------------------------------
4136 flag floatx80_le( floatx80 a, floatx80 b )
4140 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4141 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4142 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4143 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4145 float_raise( float_flag_invalid );
4148 aSign = extractFloatx80Sign( a );
4149 bSign = extractFloatx80Sign( b );
4150 if ( aSign != bSign ) {
4153 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4157 aSign ? le128( b.high, b.low, a.high, a.low )
4158 : le128( a.high, a.low, b.high, b.low );
4163 -------------------------------------------------------------------------------
4164 Returns 1 if the extended double-precision floating-point value `a' is
4165 less than the corresponding value `b', and 0 otherwise. The comparison
4166 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4168 -------------------------------------------------------------------------------
4170 flag floatx80_lt( floatx80 a, floatx80 b )
4174 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4175 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4176 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4177 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4179 float_raise( float_flag_invalid );
4182 aSign = extractFloatx80Sign( a );
4183 bSign = extractFloatx80Sign( b );
4184 if ( aSign != bSign ) {
4187 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4191 aSign ? lt128( b.high, b.low, a.high, a.low )
4192 : lt128( a.high, a.low, b.high, b.low );
4197 -------------------------------------------------------------------------------
4198 Returns 1 if the extended double-precision floating-point value `a' is equal
4199 to the corresponding value `b', and 0 otherwise. The invalid exception is
4200 raised if either operand is a NaN. Otherwise, the comparison is performed
4201 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4202 -------------------------------------------------------------------------------
4204 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4207 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4208 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4209 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4210 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4212 float_raise( float_flag_invalid );
4217 && ( ( a.high == b.high )
4219 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4225 -------------------------------------------------------------------------------
4226 Returns 1 if the extended double-precision floating-point value `a' is less
4227 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4228 do not cause an exception. Otherwise, the comparison is performed according
4229 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4230 -------------------------------------------------------------------------------
4232 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4236 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4237 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4238 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4239 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4241 if ( floatx80_is_signaling_nan( a )
4242 || floatx80_is_signaling_nan( b ) ) {
4243 float_raise( float_flag_invalid );
4247 aSign = extractFloatx80Sign( a );
4248 bSign = extractFloatx80Sign( b );
4249 if ( aSign != bSign ) {
4252 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4256 aSign ? le128( b.high, b.low, a.high, a.low )
4257 : le128( a.high, a.low, b.high, b.low );
4262 -------------------------------------------------------------------------------
4263 Returns 1 if the extended double-precision floating-point value `a' is less
4264 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4265 an exception. Otherwise, the comparison is performed according to the
4266 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4267 -------------------------------------------------------------------------------
4269 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4273 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4274 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4275 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4276 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4278 if ( floatx80_is_signaling_nan( a )
4279 || floatx80_is_signaling_nan( b ) ) {
4280 float_raise( float_flag_invalid );
4284 aSign = extractFloatx80Sign( a );
4285 bSign = extractFloatx80Sign( b );
4286 if ( aSign != bSign ) {
4289 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4293 aSign ? lt128( b.high, b.low, a.high, a.low )
4294 : lt128( a.high, a.low, b.high, b.low );
4303 -------------------------------------------------------------------------------
4304 Returns the result of converting the quadruple-precision floating-point
4305 value `a' to the 32-bit two's complement integer format. The conversion
4306 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4307 Arithmetic---which means in particular that the conversion is rounded
4308 according to the current rounding mode. If `a' is a NaN, the largest
4309 positive integer is returned. Otherwise, if the conversion overflows, the
4310 largest integer with the same sign as `a' is returned.
4311 -------------------------------------------------------------------------------
4313 int32 float128_to_int32( float128 a )
4316 int32 aExp, shiftCount;
4317 bits64 aSig0, aSig1;
4319 aSig1 = extractFloat128Frac1( a );
4320 aSig0 = extractFloat128Frac0( a );
4321 aExp = extractFloat128Exp( a );
4322 aSign = extractFloat128Sign( a );
4323 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4324 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4325 aSig0 |= ( aSig1 != 0 );
4326 shiftCount = 0x4028 - aExp;
4327 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4328 return roundAndPackInt32( aSign, aSig0 );
4333 -------------------------------------------------------------------------------
4334 Returns the result of converting the quadruple-precision floating-point
4335 value `a' to the 32-bit two's complement integer format. The conversion
4336 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337 Arithmetic, except that the conversion is always rounded toward zero. If
4338 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4339 conversion overflows, the largest integer with the same sign as `a' is
4341 -------------------------------------------------------------------------------
4343 int32 float128_to_int32_round_to_zero( float128 a )
4346 int32 aExp, shiftCount;
4347 bits64 aSig0, aSig1, savedASig;
4350 aSig1 = extractFloat128Frac1( a );
4351 aSig0 = extractFloat128Frac0( a );
4352 aExp = extractFloat128Exp( a );
4353 aSign = extractFloat128Sign( a );
4354 aSig0 |= ( aSig1 != 0 );
4355 if ( 0x401E < aExp ) {
4356 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4359 else if ( aExp < 0x3FFF ) {
4360 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4363 aSig0 |= LIT64( 0x0001000000000000 );
4364 shiftCount = 0x402F - aExp;
4366 aSig0 >>= shiftCount;
4368 if ( aSign ) z = - z;
4369 if ( ( z < 0 ) ^ aSign ) {
4371 float_raise( float_flag_invalid );
4372 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4374 if ( ( aSig0<<shiftCount ) != savedASig ) {
4375 float_exception_flags |= float_flag_inexact;
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format. The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic---which means in particular that the conversion is rounded
4387 according to the current rounding mode. If `a' is a NaN, the largest
4388 positive integer is returned. Otherwise, if the conversion overflows, the
4389 largest integer with the same sign as `a' is returned.
4390 -------------------------------------------------------------------------------
4392 int64 float128_to_int64( float128 a )
4395 int32 aExp, shiftCount;
4396 bits64 aSig0, aSig1;
4398 aSig1 = extractFloat128Frac1( a );
4399 aSig0 = extractFloat128Frac0( a );
4400 aExp = extractFloat128Exp( a );
4401 aSign = extractFloat128Sign( a );
4402 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4403 shiftCount = 0x402F - aExp;
4404 if ( shiftCount <= 0 ) {
4405 if ( 0x403E < aExp ) {
4406 float_raise( float_flag_invalid );
4408 || ( ( aExp == 0x7FFF )
4409 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4412 return LIT64( 0x7FFFFFFFFFFFFFFF );
4414 return (sbits64) LIT64( 0x8000000000000000 );
4416 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4419 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4421 return roundAndPackInt64( aSign, aSig0, aSig1 );
4426 -------------------------------------------------------------------------------
4427 Returns the result of converting the quadruple-precision floating-point
4428 value `a' to the 64-bit two's complement integer format. The conversion
4429 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4430 Arithmetic, except that the conversion is always rounded toward zero.
4431 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4432 the conversion overflows, the largest integer with the same sign as `a' is
4434 -------------------------------------------------------------------------------
4436 int64 float128_to_int64_round_to_zero( float128 a )
4439 int32 aExp, shiftCount;
4440 bits64 aSig0, aSig1;
4443 aSig1 = extractFloat128Frac1( a );
4444 aSig0 = extractFloat128Frac0( a );
4445 aExp = extractFloat128Exp( a );
4446 aSign = extractFloat128Sign( a );
4447 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4448 shiftCount = aExp - 0x402F;
4449 if ( 0 < shiftCount ) {
4450 if ( 0x403E <= aExp ) {
4451 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4452 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4453 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4454 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4457 float_raise( float_flag_invalid );
4458 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4459 return LIT64( 0x7FFFFFFFFFFFFFFF );
4462 return (sbits64) LIT64( 0x8000000000000000 );
4464 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4465 if ( (bits64) ( aSig1<<shiftCount ) ) {
4466 float_exception_flags |= float_flag_inexact;
4470 if ( aExp < 0x3FFF ) {
4471 if ( aExp | aSig0 | aSig1 ) {
4472 float_exception_flags |= float_flag_inexact;
4476 z = aSig0>>( - shiftCount );
4478 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4479 float_exception_flags |= float_flag_inexact;
4482 if ( aSign ) z = - z;
4487 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4488 && defined(SOFTFLOAT_NEED_FIXUNS)
4490 * just like above - but do not care for overflow of signed results
4492 uint64 float128_to_uint64_round_to_zero( float128 a )
4495 int32 aExp, shiftCount;
4496 bits64 aSig0, aSig1;
4499 aSig1 = extractFloat128Frac1( a );
4500 aSig0 = extractFloat128Frac0( a );
4501 aExp = extractFloat128Exp( a );
4502 aSign = extractFloat128Sign( a );
4503 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4504 shiftCount = aExp - 0x402F;
4505 if ( 0 < shiftCount ) {
4506 if ( 0x403F <= aExp ) {
4507 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4508 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4509 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4510 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4513 float_raise( float_flag_invalid );
4515 return LIT64( 0xFFFFFFFFFFFFFFFF );
4517 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4518 if ( (bits64) ( aSig1<<shiftCount ) ) {
4519 float_exception_flags |= float_flag_inexact;
4523 if ( aExp < 0x3FFF ) {
4524 if ( aExp | aSig0 | aSig1 ) {
4525 float_exception_flags |= float_flag_inexact;
4529 z = aSig0>>( - shiftCount );
4530 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4531 float_exception_flags |= float_flag_inexact;
4534 if ( aSign ) z = - z;
4538 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4541 -------------------------------------------------------------------------------
4542 Returns the result of converting the quadruple-precision floating-point
4543 value `a' to the single-precision floating-point format. The conversion
4544 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4546 -------------------------------------------------------------------------------
4548 float32 float128_to_float32( float128 a )
4552 bits64 aSig0, aSig1;
4555 aSig1 = extractFloat128Frac1( a );
4556 aSig0 = extractFloat128Frac0( a );
4557 aExp = extractFloat128Exp( a );
4558 aSign = extractFloat128Sign( a );
4559 if ( aExp == 0x7FFF ) {
4560 if ( aSig0 | aSig1 ) {
4561 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4563 return packFloat32( aSign, 0xFF, 0 );
4565 aSig0 |= ( aSig1 != 0 );
4566 shift64RightJamming( aSig0, 18, &aSig0 );
4568 if ( aExp || zSig ) {
4572 return roundAndPackFloat32( aSign, aExp, zSig );
4577 -------------------------------------------------------------------------------
4578 Returns the result of converting the quadruple-precision floating-point
4579 value `a' to the double-precision floating-point format. The conversion
4580 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4582 -------------------------------------------------------------------------------
4584 float64 float128_to_float64( float128 a )
4588 bits64 aSig0, aSig1;
4590 aSig1 = extractFloat128Frac1( a );
4591 aSig0 = extractFloat128Frac0( a );
4592 aExp = extractFloat128Exp( a );
4593 aSign = extractFloat128Sign( a );
4594 if ( aExp == 0x7FFF ) {
4595 if ( aSig0 | aSig1 ) {
4596 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4598 return packFloat64( aSign, 0x7FF, 0 );
4600 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4601 aSig0 |= ( aSig1 != 0 );
4602 if ( aExp || aSig0 ) {
4603 aSig0 |= LIT64( 0x4000000000000000 );
4606 return roundAndPackFloat64( aSign, aExp, aSig0 );
4613 -------------------------------------------------------------------------------
4614 Returns the result of converting the quadruple-precision floating-point
4615 value `a' to the extended double-precision floating-point format. The
4616 conversion is performed according to the IEC/IEEE Standard for Binary
4617 Floating-Point Arithmetic.
4618 -------------------------------------------------------------------------------
4620 floatx80 float128_to_floatx80( float128 a )
4624 bits64 aSig0, aSig1;
4626 aSig1 = extractFloat128Frac1( a );
4627 aSig0 = extractFloat128Frac0( a );
4628 aExp = extractFloat128Exp( a );
4629 aSign = extractFloat128Sign( a );
4630 if ( aExp == 0x7FFF ) {
4631 if ( aSig0 | aSig1 ) {
4632 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4634 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4637 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4638 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4641 aSig0 |= LIT64( 0x0001000000000000 );
4643 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4644 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4651 -------------------------------------------------------------------------------
4652 Rounds the quadruple-precision floating-point value `a' to an integer, and
4653 returns the result as a quadruple-precision floating-point value. The
4654 operation is performed according to the IEC/IEEE Standard for Binary
4655 Floating-Point Arithmetic.
4656 -------------------------------------------------------------------------------
4658 float128 float128_round_to_int( float128 a )
4662 bits64 lastBitMask, roundBitsMask;
4666 aExp = extractFloat128Exp( a );
4667 if ( 0x402F <= aExp ) {
4668 if ( 0x406F <= aExp ) {
4669 if ( ( aExp == 0x7FFF )
4670 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4672 return propagateFloat128NaN( a, a );
4677 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4678 roundBitsMask = lastBitMask - 1;
4680 roundingMode = float_rounding_mode;
4681 if ( roundingMode == float_round_nearest_even ) {
4682 if ( lastBitMask ) {
4683 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4684 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4687 if ( (sbits64) z.low < 0 ) {
4689 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4693 else if ( roundingMode != float_round_to_zero ) {
4694 if ( extractFloat128Sign( z )
4695 ^ ( roundingMode == float_round_up ) ) {
4696 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4699 z.low &= ~ roundBitsMask;
4702 if ( aExp < 0x3FFF ) {
4703 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4704 float_exception_flags |= float_flag_inexact;
4705 aSign = extractFloat128Sign( a );
4706 switch ( float_rounding_mode ) {
4707 case float_round_nearest_even:
4708 if ( ( aExp == 0x3FFE )
4709 && ( extractFloat128Frac0( a )
4710 | extractFloat128Frac1( a ) )
4712 return packFloat128( aSign, 0x3FFF, 0, 0 );
4715 case float_round_to_zero:
4717 case float_round_down:
4719 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4720 : packFloat128( 0, 0, 0, 0 );
4721 case float_round_up:
4723 aSign ? packFloat128( 1, 0, 0, 0 )
4724 : packFloat128( 0, 0x3FFF, 0, 0 );
4726 return packFloat128( aSign, 0, 0, 0 );
4729 lastBitMask <<= 0x402F - aExp;
4730 roundBitsMask = lastBitMask - 1;
4733 roundingMode = float_rounding_mode;
4734 if ( roundingMode == float_round_nearest_even ) {
4735 z.high += lastBitMask>>1;
4736 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4737 z.high &= ~ lastBitMask;
4740 else if ( roundingMode != float_round_to_zero ) {
4741 if ( extractFloat128Sign( z )
4742 ^ ( roundingMode == float_round_up ) ) {
4743 z.high |= ( a.low != 0 );
4744 z.high += roundBitsMask;
4747 z.high &= ~ roundBitsMask;
4749 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4750 float_exception_flags |= float_flag_inexact;
4757 -------------------------------------------------------------------------------
4758 Returns the result of adding the absolute values of the quadruple-precision
4759 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4760 before being returned. `zSign' is ignored if the result is a NaN.
4761 The addition is performed according to the IEC/IEEE Standard for Binary
4762 Floating-Point Arithmetic.
4763 -------------------------------------------------------------------------------
4765 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4767 int32 aExp, bExp, zExp;
4768 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4771 aSig1 = extractFloat128Frac1( a );
4772 aSig0 = extractFloat128Frac0( a );
4773 aExp = extractFloat128Exp( a );
4774 bSig1 = extractFloat128Frac1( b );
4775 bSig0 = extractFloat128Frac0( b );
4776 bExp = extractFloat128Exp( b );
4777 expDiff = aExp - bExp;
4778 if ( 0 < expDiff ) {
4779 if ( aExp == 0x7FFF ) {
4780 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4787 bSig0 |= LIT64( 0x0001000000000000 );
4789 shift128ExtraRightJamming(
4790 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4793 else if ( expDiff < 0 ) {
4794 if ( bExp == 0x7FFF ) {
4795 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4796 return packFloat128( zSign, 0x7FFF, 0, 0 );
4802 aSig0 |= LIT64( 0x0001000000000000 );
4804 shift128ExtraRightJamming(
4805 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4809 if ( aExp == 0x7FFF ) {
4810 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4811 return propagateFloat128NaN( a, b );
4815 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4816 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4818 zSig0 |= LIT64( 0x0002000000000000 );
4822 aSig0 |= LIT64( 0x0001000000000000 );
4823 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4825 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4828 shift128ExtraRightJamming(
4829 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4831 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4836 -------------------------------------------------------------------------------
4837 Returns the result of subtracting the absolute values of the quadruple-
4838 precision floating-point values `a' and `b'. If `zSign' is 1, the
4839 difference is negated before being returned. `zSign' is ignored if the
4840 result is a NaN. The subtraction is performed according to the IEC/IEEE
4841 Standard for Binary Floating-Point Arithmetic.
4842 -------------------------------------------------------------------------------
4844 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4846 int32 aExp, bExp, zExp;
4847 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4851 aSig1 = extractFloat128Frac1( a );
4852 aSig0 = extractFloat128Frac0( a );
4853 aExp = extractFloat128Exp( a );
4854 bSig1 = extractFloat128Frac1( b );
4855 bSig0 = extractFloat128Frac0( b );
4856 bExp = extractFloat128Exp( b );
4857 expDiff = aExp - bExp;
4858 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4859 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4860 if ( 0 < expDiff ) goto aExpBigger;
4861 if ( expDiff < 0 ) goto bExpBigger;
4862 if ( aExp == 0x7FFF ) {
4863 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4864 return propagateFloat128NaN( a, b );
4866 float_raise( float_flag_invalid );
4867 z.low = float128_default_nan_low;
4868 z.high = float128_default_nan_high;
4875 if ( bSig0 < aSig0 ) goto aBigger;
4876 if ( aSig0 < bSig0 ) goto bBigger;
4877 if ( bSig1 < aSig1 ) goto aBigger;
4878 if ( aSig1 < bSig1 ) goto bBigger;
4879 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4881 if ( bExp == 0x7FFF ) {
4882 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4883 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4889 aSig0 |= LIT64( 0x4000000000000000 );
4891 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4892 bSig0 |= LIT64( 0x4000000000000000 );
4894 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4897 goto normalizeRoundAndPack;
4899 if ( aExp == 0x7FFF ) {
4900 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4907 bSig0 |= LIT64( 0x4000000000000000 );
4909 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4910 aSig0 |= LIT64( 0x4000000000000000 );
4912 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4914 normalizeRoundAndPack:
4916 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4921 -------------------------------------------------------------------------------
4922 Returns the result of adding the quadruple-precision floating-point values
4923 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4924 for Binary Floating-Point Arithmetic.
4925 -------------------------------------------------------------------------------
4927 float128 float128_add( float128 a, float128 b )
4931 aSign = extractFloat128Sign( a );
4932 bSign = extractFloat128Sign( b );
4933 if ( aSign == bSign ) {
4934 return addFloat128Sigs( a, b, aSign );
4937 return subFloat128Sigs( a, b, aSign );
4943 -------------------------------------------------------------------------------
4944 Returns the result of subtracting the quadruple-precision floating-point
4945 values `a' and `b'. The operation is performed according to the IEC/IEEE
4946 Standard for Binary Floating-Point Arithmetic.
4947 -------------------------------------------------------------------------------
4949 float128 float128_sub( float128 a, float128 b )
4953 aSign = extractFloat128Sign( a );
4954 bSign = extractFloat128Sign( b );
4955 if ( aSign == bSign ) {
4956 return subFloat128Sigs( a, b, aSign );
4959 return addFloat128Sigs( a, b, aSign );
4965 -------------------------------------------------------------------------------
4966 Returns the result of multiplying the quadruple-precision floating-point
4967 values `a' and `b'. The operation is performed according to the IEC/IEEE
4968 Standard for Binary Floating-Point Arithmetic.
4969 -------------------------------------------------------------------------------
4971 float128 float128_mul( float128 a, float128 b )
4973 flag aSign, bSign, zSign;
4974 int32 aExp, bExp, zExp;
4975 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4978 aSig1 = extractFloat128Frac1( a );
4979 aSig0 = extractFloat128Frac0( a );
4980 aExp = extractFloat128Exp( a );
4981 aSign = extractFloat128Sign( a );
4982 bSig1 = extractFloat128Frac1( b );
4983 bSig0 = extractFloat128Frac0( b );
4984 bExp = extractFloat128Exp( b );
4985 bSign = extractFloat128Sign( b );
4986 zSign = aSign ^ bSign;
4987 if ( aExp == 0x7FFF ) {
4988 if ( ( aSig0 | aSig1 )
4989 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4990 return propagateFloat128NaN( a, b );
4992 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4993 return packFloat128( zSign, 0x7FFF, 0, 0 );
4995 if ( bExp == 0x7FFF ) {
4996 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4997 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4999 float_raise( float_flag_invalid );
5000 z.low = float128_default_nan_low;
5001 z.high = float128_default_nan_high;
5004 return packFloat128( zSign, 0x7FFF, 0, 0 );
5007 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5008 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5011 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5012 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5014 zExp = aExp + bExp - 0x4000;
5015 aSig0 |= LIT64( 0x0001000000000000 );
5016 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5017 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5018 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5019 zSig2 |= ( zSig3 != 0 );
5020 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5021 shift128ExtraRightJamming(
5022 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5025 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5030 -------------------------------------------------------------------------------
5031 Returns the result of dividing the quadruple-precision floating-point value
5032 `a' by the corresponding value `b'. The operation is performed according to
5033 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5034 -------------------------------------------------------------------------------
5036 float128 float128_div( float128 a, float128 b )
5038 flag aSign, bSign, zSign;
5039 int32 aExp, bExp, zExp;
5040 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5041 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5044 aSig1 = extractFloat128Frac1( a );
5045 aSig0 = extractFloat128Frac0( a );
5046 aExp = extractFloat128Exp( a );
5047 aSign = extractFloat128Sign( a );
5048 bSig1 = extractFloat128Frac1( b );
5049 bSig0 = extractFloat128Frac0( b );
5050 bExp = extractFloat128Exp( b );
5051 bSign = extractFloat128Sign( b );
5052 zSign = aSign ^ bSign;
5053 if ( aExp == 0x7FFF ) {
5054 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5055 if ( bExp == 0x7FFF ) {
5056 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5059 return packFloat128( zSign, 0x7FFF, 0, 0 );
5061 if ( bExp == 0x7FFF ) {
5062 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5063 return packFloat128( zSign, 0, 0, 0 );
5066 if ( ( bSig0 | bSig1 ) == 0 ) {
5067 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5069 float_raise( float_flag_invalid );
5070 z.low = float128_default_nan_low;
5071 z.high = float128_default_nan_high;
5074 float_raise( float_flag_divbyzero );
5075 return packFloat128( zSign, 0x7FFF, 0, 0 );
5077 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5080 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5081 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5083 zExp = aExp - bExp + 0x3FFD;
5085 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5087 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5088 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5089 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5092 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5093 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5094 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5095 while ( (sbits64) rem0 < 0 ) {
5097 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5099 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5100 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5101 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5102 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5103 while ( (sbits64) rem1 < 0 ) {
5105 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5107 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5109 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5110 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5115 -------------------------------------------------------------------------------
5116 Returns the remainder of the quadruple-precision floating-point value `a'
5117 with respect to the corresponding value `b'. The operation is performed
5118 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5119 -------------------------------------------------------------------------------
5121 float128 float128_rem( float128 a, float128 b )
5123 flag aSign, bSign, zSign;
5124 int32 aExp, bExp, expDiff;
5125 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5126 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5130 aSig1 = extractFloat128Frac1( a );
5131 aSig0 = extractFloat128Frac0( a );
5132 aExp = extractFloat128Exp( a );
5133 aSign = extractFloat128Sign( a );
5134 bSig1 = extractFloat128Frac1( b );
5135 bSig0 = extractFloat128Frac0( b );
5136 bExp = extractFloat128Exp( b );
5137 bSign = extractFloat128Sign( b );
5138 if ( aExp == 0x7FFF ) {
5139 if ( ( aSig0 | aSig1 )
5140 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5141 return propagateFloat128NaN( a, b );
5145 if ( bExp == 0x7FFF ) {
5146 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5150 if ( ( bSig0 | bSig1 ) == 0 ) {
5152 float_raise( float_flag_invalid );
5153 z.low = float128_default_nan_low;
5154 z.high = float128_default_nan_high;
5157 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5160 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5161 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5163 expDiff = aExp - bExp;
5164 if ( expDiff < -1 ) return a;
5166 aSig0 | LIT64( 0x0001000000000000 ),
5168 15 - ( expDiff < 0 ),
5173 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5174 q = le128( bSig0, bSig1, aSig0, aSig1 );
5175 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5177 while ( 0 < expDiff ) {
5178 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5179 q = ( 4 < q ) ? q - 4 : 0;
5180 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5181 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5182 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5183 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5186 if ( -64 < expDiff ) {
5187 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5188 q = ( 4 < q ) ? q - 4 : 0;
5190 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5192 if ( expDiff < 0 ) {
5193 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5196 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5198 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5199 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5202 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5203 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5206 alternateASig0 = aSig0;
5207 alternateASig1 = aSig1;
5209 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5210 } while ( 0 <= (sbits64) aSig0 );
5212 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5213 if ( ( sigMean0 < 0 )
5214 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5215 aSig0 = alternateASig0;
5216 aSig1 = alternateASig1;
5218 zSign = ( (sbits64) aSig0 < 0 );
5219 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5221 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5226 -------------------------------------------------------------------------------
5227 Returns the square root of the quadruple-precision floating-point value `a'.
5228 The operation is performed according to the IEC/IEEE Standard for Binary
5229 Floating-Point Arithmetic.
5230 -------------------------------------------------------------------------------
5232 float128 float128_sqrt( float128 a )
5236 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5237 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5240 aSig1 = extractFloat128Frac1( a );
5241 aSig0 = extractFloat128Frac0( a );
5242 aExp = extractFloat128Exp( a );
5243 aSign = extractFloat128Sign( a );
5244 if ( aExp == 0x7FFF ) {
5245 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5246 if ( ! aSign ) return a;
5250 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5252 float_raise( float_flag_invalid );
5253 z.low = float128_default_nan_low;
5254 z.high = float128_default_nan_high;
5258 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5259 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5261 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5262 aSig0 |= LIT64( 0x0001000000000000 );
5263 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5264 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5265 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5266 doubleZSig0 = zSig0<<1;
5267 mul64To128( zSig0, zSig0, &term0, &term1 );
5268 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5269 while ( (sbits64) rem0 < 0 ) {
5272 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5274 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5275 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5276 if ( zSig1 == 0 ) zSig1 = 1;
5277 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5278 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5279 mul64To128( zSig1, zSig1, &term2, &term3 );
5280 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5281 while ( (sbits64) rem1 < 0 ) {
5283 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5285 term2 |= doubleZSig0;
5286 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5288 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5290 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5291 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5296 -------------------------------------------------------------------------------
5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5298 the corresponding value `b', and 0 otherwise. The comparison is performed
5299 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5300 -------------------------------------------------------------------------------
5302 flag float128_eq( float128 a, float128 b )
5305 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5306 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5307 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5308 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5310 if ( float128_is_signaling_nan( a )
5311 || float128_is_signaling_nan( b ) ) {
5312 float_raise( float_flag_invalid );
5318 && ( ( a.high == b.high )
5320 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5326 -------------------------------------------------------------------------------
5327 Returns 1 if the quadruple-precision floating-point value `a' is less than
5328 or equal to the corresponding value `b', and 0 otherwise. The comparison
5329 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5331 -------------------------------------------------------------------------------
5333 flag float128_le( float128 a, float128 b )
5337 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5338 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5339 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5340 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5342 float_raise( float_flag_invalid );
5345 aSign = extractFloat128Sign( a );
5346 bSign = extractFloat128Sign( b );
5347 if ( aSign != bSign ) {
5350 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5354 aSign ? le128( b.high, b.low, a.high, a.low )
5355 : le128( a.high, a.low, b.high, b.low );
5360 -------------------------------------------------------------------------------
5361 Returns 1 if the quadruple-precision floating-point value `a' is less than
5362 the corresponding value `b', and 0 otherwise. The comparison is performed
5363 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5364 -------------------------------------------------------------------------------
5366 flag float128_lt( float128 a, float128 b )
5370 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5371 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5373 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5375 float_raise( float_flag_invalid );
5378 aSign = extractFloat128Sign( a );
5379 bSign = extractFloat128Sign( b );
5380 if ( aSign != bSign ) {
5383 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387 aSign ? lt128( b.high, b.low, a.high, a.low )
5388 : lt128( a.high, a.low, b.high, b.low );
5393 -------------------------------------------------------------------------------
5394 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5395 the corresponding value `b', and 0 otherwise. The invalid exception is
5396 raised if either operand is a NaN. Otherwise, the comparison is performed
5397 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5398 -------------------------------------------------------------------------------
5400 flag float128_eq_signaling( float128 a, float128 b )
5403 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5404 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5405 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5406 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5408 float_raise( float_flag_invalid );
5413 && ( ( a.high == b.high )
5415 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5421 -------------------------------------------------------------------------------
5422 Returns 1 if the quadruple-precision floating-point value `a' is less than
5423 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5424 cause an exception. Otherwise, the comparison is performed according to the
5425 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5426 -------------------------------------------------------------------------------
5428 flag float128_le_quiet( float128 a, float128 b )
5432 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5433 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5434 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5435 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5437 if ( float128_is_signaling_nan( a )
5438 || float128_is_signaling_nan( b ) ) {
5439 float_raise( float_flag_invalid );
5443 aSign = extractFloat128Sign( a );
5444 bSign = extractFloat128Sign( b );
5445 if ( aSign != bSign ) {
5448 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5452 aSign ? le128( b.high, b.low, a.high, a.low )
5453 : le128( a.high, a.low, b.high, b.low );
5458 -------------------------------------------------------------------------------
5459 Returns 1 if the quadruple-precision floating-point value `a' is less than
5460 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5461 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5462 Standard for Binary Floating-Point Arithmetic.
5463 -------------------------------------------------------------------------------
5465 flag float128_lt_quiet( float128 a, float128 b )
5469 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5470 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5471 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5472 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5474 if ( float128_is_signaling_nan( a )
5475 || float128_is_signaling_nan( b ) ) {
5476 float_raise( float_flag_invalid );
5480 aSign = extractFloat128Sign( a );
5481 bSign = extractFloat128Sign( b );
5482 if ( aSign != bSign ) {
5485 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5489 aSign ? lt128( b.high, b.low, a.high, a.low )
5490 : lt128( a.high, a.low, b.high, b.low );
5497 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5500 * These two routines are not part of the original softfloat distribution.
5502 * They are based on the corresponding conversions to integer but return
5503 * unsigned numbers instead since these functions are required by GCC.
5505 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5507 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5511 -------------------------------------------------------------------------------
5512 Returns the result of converting the double-precision floating-point value
5513 `a' to the 32-bit unsigned integer format. The conversion is
5514 performed according to the IEC/IEEE Standard for Binary Floating-point
5515 Arithmetic, except that the conversion is always rounded toward zero. If
5516 `a' is a NaN, the largest positive integer is returned. If the conversion
5517 overflows, the largest integer positive is returned.
5518 -------------------------------------------------------------------------------
5520 uint32 float64_to_uint32_round_to_zero( float64 a )
5523 int16 aExp, shiftCount;
5524 bits64 aSig, savedASig;
5527 aSig = extractFloat64Frac( a );
5528 aExp = extractFloat64Exp( a );
5529 aSign = extractFloat64Sign( a );
5532 float_raise( float_flag_invalid );
5536 if ( 0x41E < aExp ) {
5537 float_raise( float_flag_invalid );
5540 else if ( aExp < 0x3FF ) {
5541 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5544 aSig |= LIT64( 0x0010000000000000 );
5545 shiftCount = 0x433 - aExp;
5547 aSig >>= shiftCount;
5549 if ( ( aSig<<shiftCount ) != savedASig ) {
5550 float_exception_flags |= float_flag_inexact;
5557 -------------------------------------------------------------------------------
5558 Returns the result of converting the single-precision floating-point value
5559 `a' to the 32-bit unsigned integer format. The conversion is
5560 performed according to the IEC/IEEE Standard for Binary Floating-point
5561 Arithmetic, except that the conversion is always rounded toward zero. If
5562 `a' is a NaN, the largest positive integer is returned. If the conversion
5563 overflows, the largest positive integer is returned.
5564 -------------------------------------------------------------------------------
5566 uint32 float32_to_uint32_round_to_zero( float32 a )
5569 int16 aExp, shiftCount;
5573 aSig = extractFloat32Frac( a );
5574 aExp = extractFloat32Exp( a );
5575 aSign = extractFloat32Sign( a );
5576 shiftCount = aExp - 0x9E;
5579 float_raise( float_flag_invalid );
5582 if ( 0 < shiftCount ) {
5583 float_raise( float_flag_invalid );
5586 else if ( aExp <= 0x7E ) {
5587 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5590 aSig = ( aSig | 0x800000 )<<8;
5591 z = aSig>>( - shiftCount );
5592 if ( aSig<<( shiftCount & 31 ) ) {
5593 float_exception_flags |= float_flag_inexact;