1 /* $NetBSD: softfloat.c,v 1.2 2003/07/26 19:24:52 salo Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
50 #ifdef SOFTFLOAT_FOR_GCC
51 #include "softfloat-for-gcc.h"
55 #include "softfloat.h"
58 * Conversions between floats as stored in memory and floats as
61 #ifndef FLOAT64_DEMANGLE
62 #define FLOAT64_DEMANGLE(a) (a)
64 #ifndef FLOAT64_MANGLE
65 #define FLOAT64_MANGLE(a) (a)
69 -------------------------------------------------------------------------------
70 Floating-point rounding mode, extended double-precision rounding precision,
72 -------------------------------------------------------------------------------
74 fp_rnd_t float_rounding_mode = float_round_nearest_even;
75 fp_except float_exception_flags = 0;
77 int8 floatx80_rounding_precision = 80;
81 -------------------------------------------------------------------------------
82 Primitive arithmetic functions, including multi-word arithmetic, and
83 division and square root approximations. (Can be specialized to target if
85 -------------------------------------------------------------------------------
87 #include "softfloat-macros"
90 -------------------------------------------------------------------------------
91 Functions and definitions to determine: (1) whether tininess for underflow
92 is detected before or after rounding by default, (2) what (if anything)
93 happens when exceptions are raised, (3) how signaling NaNs are distinguished
94 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
95 are propagated from function inputs to output. These details are target-
97 -------------------------------------------------------------------------------
99 #include "softfloat-specialize"
101 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
103 -------------------------------------------------------------------------------
104 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
105 and 7, and returns the properly rounded 32-bit integer corresponding to the
106 input. If `zSign' is 1, the input is negated before being converted to an
107 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
108 is simply rounded to an integer, with the inexact exception raised if the
109 input cannot be represented exactly as an integer. However, if the fixed-
110 point input is too large, the invalid exception is raised and the largest
111 positive or negative integer is returned.
112 -------------------------------------------------------------------------------
114 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 flag roundNearestEven;
118 int8 roundIncrement, roundBits;
121 roundingMode = float_rounding_mode;
122 roundNearestEven = ( roundingMode == float_round_nearest_even );
123 roundIncrement = 0x40;
124 if ( ! roundNearestEven ) {
125 if ( roundingMode == float_round_to_zero ) {
129 roundIncrement = 0x7F;
131 if ( roundingMode == float_round_up ) roundIncrement = 0;
134 if ( roundingMode == float_round_down ) roundIncrement = 0;
138 roundBits = absZ & 0x7F;
139 absZ = ( absZ + roundIncrement )>>7;
140 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142 if ( zSign ) z = - z;
143 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
144 float_raise( float_flag_invalid );
145 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
147 if ( roundBits ) float_exception_flags |= float_flag_inexact;
153 -------------------------------------------------------------------------------
154 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 `absZ1', with binary point between bits 63 and 64 (between the input words),
156 and returns the properly rounded 64-bit integer corresponding to the input.
157 If `zSign' is 1, the input is negated before being converted to an integer.
158 Ordinarily, the fixed-point input is simply rounded to an integer, with
159 the inexact exception raised if the input cannot be represented exactly as
160 an integer. However, if the fixed-point input is too large, the invalid
161 exception is raised and the largest positive or negative integer is
163 -------------------------------------------------------------------------------
165 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
168 flag roundNearestEven, increment;
171 roundingMode = float_rounding_mode;
172 roundNearestEven = ( roundingMode == float_round_nearest_even );
173 increment = ( (sbits64) absZ1 < 0 );
174 if ( ! roundNearestEven ) {
175 if ( roundingMode == float_round_to_zero ) {
180 increment = ( roundingMode == float_round_down ) && absZ1;
183 increment = ( roundingMode == float_round_up ) && absZ1;
189 if ( absZ0 == 0 ) goto overflow;
190 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193 if ( zSign ) z = - z;
194 if ( z && ( ( z < 0 ) ^ zSign ) ) {
196 float_raise( float_flag_invalid );
198 zSign ? (sbits64) LIT64( 0x8000000000000000 )
199 : LIT64( 0x7FFFFFFFFFFFFFFF );
201 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
208 -------------------------------------------------------------------------------
209 Returns the fraction bits of the single-precision floating-point value `a'.
210 -------------------------------------------------------------------------------
212 INLINE bits32 extractFloat32Frac( float32 a )
215 return a & 0x007FFFFF;
220 -------------------------------------------------------------------------------
221 Returns the exponent bits of the single-precision floating-point value `a'.
222 -------------------------------------------------------------------------------
224 INLINE int16 extractFloat32Exp( float32 a )
227 return ( a>>23 ) & 0xFF;
232 -------------------------------------------------------------------------------
233 Returns the sign bit of the single-precision floating-point value `a'.
234 -------------------------------------------------------------------------------
236 INLINE flag extractFloat32Sign( float32 a )
244 -------------------------------------------------------------------------------
245 Normalizes the subnormal single-precision floating-point value represented
246 by the denormalized significand `aSig'. The normalized exponent and
247 significand are stored at the locations pointed to by `zExpPtr' and
248 `zSigPtr', respectively.
249 -------------------------------------------------------------------------------
252 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
256 shiftCount = countLeadingZeros32( aSig ) - 8;
257 *zSigPtr = aSig<<shiftCount;
258 *zExpPtr = 1 - shiftCount;
263 -------------------------------------------------------------------------------
264 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
265 single-precision floating-point value, returning the result. After being
266 shifted into the proper positions, the three fields are simply added
267 together to form the result. This means that any integer portion of `zSig'
268 will be added into the exponent. Since a properly normalized significand
269 will have an integer portion equal to 1, the `zExp' input should be 1 less
270 than the desired result exponent whenever `zSig' is a complete, normalized
272 -------------------------------------------------------------------------------
274 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
277 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
282 -------------------------------------------------------------------------------
283 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
284 and significand `zSig', and returns the proper single-precision floating-
285 point value corresponding to the abstract input. Ordinarily, the abstract
286 value is simply rounded and packed into the single-precision format, with
287 the inexact exception raised if the abstract input cannot be represented
288 exactly. However, if the abstract value is too large, the overflow and
289 inexact exceptions are raised and an infinity or maximal finite value is
290 returned. If the abstract value is too small, the input value is rounded to
291 a subnormal number, and the underflow and inexact exceptions are raised if
292 the abstract input cannot be represented exactly as a subnormal single-
293 precision floating-point number.
294 The input significand `zSig' has its binary point between bits 30
295 and 29, which is 7 bits to the left of the usual location. This shifted
296 significand must be normalized or smaller. If `zSig' is not normalized,
297 `zExp' must be 0; in that case, the result returned is a subnormal number,
298 and it must not require rounding. In the usual case that `zSig' is
299 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
300 The handling of underflow and overflow follows the IEC/IEEE Standard for
301 Binary Floating-Point Arithmetic.
302 -------------------------------------------------------------------------------
304 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
307 flag roundNearestEven;
308 int8 roundIncrement, roundBits;
311 roundingMode = float_rounding_mode;
312 roundNearestEven = ( roundingMode == float_round_nearest_even );
313 roundIncrement = 0x40;
314 if ( ! roundNearestEven ) {
315 if ( roundingMode == float_round_to_zero ) {
319 roundIncrement = 0x7F;
321 if ( roundingMode == float_round_up ) roundIncrement = 0;
324 if ( roundingMode == float_round_down ) roundIncrement = 0;
328 roundBits = zSig & 0x7F;
329 if ( 0xFD <= (bits16) zExp ) {
331 || ( ( zExp == 0xFD )
332 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
334 float_raise( float_flag_overflow | float_flag_inexact );
335 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
339 ( float_detect_tininess == float_tininess_before_rounding )
341 || ( zSig + roundIncrement < 0x80000000 );
342 shift32RightJamming( zSig, - zExp, &zSig );
344 roundBits = zSig & 0x7F;
345 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
348 if ( roundBits ) float_exception_flags |= float_flag_inexact;
349 zSig = ( zSig + roundIncrement )>>7;
350 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
351 if ( zSig == 0 ) zExp = 0;
352 return packFloat32( zSign, zExp, zSig );
357 -------------------------------------------------------------------------------
358 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
359 and significand `zSig', and returns the proper single-precision floating-
360 point value corresponding to the abstract input. This routine is just like
361 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
362 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
363 floating-point exponent.
364 -------------------------------------------------------------------------------
367 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
371 shiftCount = countLeadingZeros32( zSig ) - 1;
372 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
377 -------------------------------------------------------------------------------
378 Returns the fraction bits of the double-precision floating-point value `a'.
379 -------------------------------------------------------------------------------
381 INLINE bits64 extractFloat64Frac( float64 a )
384 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
389 -------------------------------------------------------------------------------
390 Returns the exponent bits of the double-precision floating-point value `a'.
391 -------------------------------------------------------------------------------
393 INLINE int16 extractFloat64Exp( float64 a )
396 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
401 -------------------------------------------------------------------------------
402 Returns the sign bit of the double-precision floating-point value `a'.
403 -------------------------------------------------------------------------------
405 INLINE flag extractFloat64Sign( float64 a )
408 return FLOAT64_DEMANGLE(a)>>63;
413 -------------------------------------------------------------------------------
414 Normalizes the subnormal double-precision floating-point value represented
415 by the denormalized significand `aSig'. The normalized exponent and
416 significand are stored at the locations pointed to by `zExpPtr' and
417 `zSigPtr', respectively.
418 -------------------------------------------------------------------------------
421 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
425 shiftCount = countLeadingZeros64( aSig ) - 11;
426 *zSigPtr = aSig<<shiftCount;
427 *zExpPtr = 1 - shiftCount;
432 -------------------------------------------------------------------------------
433 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
434 double-precision floating-point value, returning the result. After being
435 shifted into the proper positions, the three fields are simply added
436 together to form the result. This means that any integer portion of `zSig'
437 will be added into the exponent. Since a properly normalized significand
438 will have an integer portion equal to 1, the `zExp' input should be 1 less
439 than the desired result exponent whenever `zSig' is a complete, normalized
441 -------------------------------------------------------------------------------
443 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
446 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
447 ( ( (bits64) zExp )<<52 ) + zSig );
452 -------------------------------------------------------------------------------
453 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
454 and significand `zSig', and returns the proper double-precision floating-
455 point value corresponding to the abstract input. Ordinarily, the abstract
456 value is simply rounded and packed into the double-precision format, with
457 the inexact exception raised if the abstract input cannot be represented
458 exactly. However, if the abstract value is too large, the overflow and
459 inexact exceptions are raised and an infinity or maximal finite value is
460 returned. If the abstract value is too small, the input value is rounded to
461 a subnormal number, and the underflow and inexact exceptions are raised if
462 the abstract input cannot be represented exactly as a subnormal double-
463 precision floating-point number.
464 The input significand `zSig' has its binary point between bits 62
465 and 61, which is 10 bits to the left of the usual location. This shifted
466 significand must be normalized or smaller. If `zSig' is not normalized,
467 `zExp' must be 0; in that case, the result returned is a subnormal number,
468 and it must not require rounding. In the usual case that `zSig' is
469 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
470 The handling of underflow and overflow follows the IEC/IEEE Standard for
471 Binary Floating-Point Arithmetic.
472 -------------------------------------------------------------------------------
474 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
477 flag roundNearestEven;
478 int16 roundIncrement, roundBits;
481 roundingMode = float_rounding_mode;
482 roundNearestEven = ( roundingMode == float_round_nearest_even );
483 roundIncrement = 0x200;
484 if ( ! roundNearestEven ) {
485 if ( roundingMode == float_round_to_zero ) {
489 roundIncrement = 0x3FF;
491 if ( roundingMode == float_round_up ) roundIncrement = 0;
494 if ( roundingMode == float_round_down ) roundIncrement = 0;
498 roundBits = zSig & 0x3FF;
499 if ( 0x7FD <= (bits16) zExp ) {
500 if ( ( 0x7FD < zExp )
501 || ( ( zExp == 0x7FD )
502 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
504 float_raise( float_flag_overflow | float_flag_inexact );
505 return FLOAT64_MANGLE(
506 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
507 ( roundIncrement == 0 ));
511 ( float_detect_tininess == float_tininess_before_rounding )
513 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
514 shift64RightJamming( zSig, - zExp, &zSig );
516 roundBits = zSig & 0x3FF;
517 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
520 if ( roundBits ) float_exception_flags |= float_flag_inexact;
521 zSig = ( zSig + roundIncrement )>>10;
522 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
523 if ( zSig == 0 ) zExp = 0;
524 return packFloat64( zSign, zExp, zSig );
529 -------------------------------------------------------------------------------
530 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
531 and significand `zSig', and returns the proper double-precision floating-
532 point value corresponding to the abstract input. This routine is just like
533 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
534 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
535 floating-point exponent.
536 -------------------------------------------------------------------------------
539 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
543 shiftCount = countLeadingZeros64( zSig ) - 1;
544 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
551 -------------------------------------------------------------------------------
552 Returns the fraction bits of the extended double-precision floating-point
554 -------------------------------------------------------------------------------
556 INLINE bits64 extractFloatx80Frac( floatx80 a )
564 -------------------------------------------------------------------------------
565 Returns the exponent bits of the extended double-precision floating-point
567 -------------------------------------------------------------------------------
569 INLINE int32 extractFloatx80Exp( floatx80 a )
572 return a.high & 0x7FFF;
577 -------------------------------------------------------------------------------
578 Returns the sign bit of the extended double-precision floating-point value
580 -------------------------------------------------------------------------------
582 INLINE flag extractFloatx80Sign( floatx80 a )
590 -------------------------------------------------------------------------------
591 Normalizes the subnormal extended double-precision floating-point value
592 represented by the denormalized significand `aSig'. The normalized exponent
593 and significand are stored at the locations pointed to by `zExpPtr' and
594 `zSigPtr', respectively.
595 -------------------------------------------------------------------------------
598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
602 shiftCount = countLeadingZeros64( aSig );
603 *zSigPtr = aSig<<shiftCount;
604 *zExpPtr = 1 - shiftCount;
609 -------------------------------------------------------------------------------
610 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
611 extended double-precision floating-point value, returning the result.
612 -------------------------------------------------------------------------------
614 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
619 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
625 -------------------------------------------------------------------------------
626 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
627 and extended significand formed by the concatenation of `zSig0' and `zSig1',
628 and returns the proper extended double-precision floating-point value
629 corresponding to the abstract input. Ordinarily, the abstract value is
630 rounded and packed into the extended double-precision format, with the
631 inexact exception raised if the abstract input cannot be represented
632 exactly. However, if the abstract value is too large, the overflow and
633 inexact exceptions are raised and an infinity or maximal finite value is
634 returned. If the abstract value is too small, the input value is rounded to
635 a subnormal number, and the underflow and inexact exceptions are raised if
636 the abstract input cannot be represented exactly as a subnormal extended
637 double-precision floating-point number.
638 If `roundingPrecision' is 32 or 64, the result is rounded to the same
639 number of bits as single or double precision, respectively. Otherwise, the
640 result is rounded to the full precision of the extended double-precision
642 The input significand must be normalized or smaller. If the input
643 significand is not normalized, `zExp' must be 0; in that case, the result
644 returned is a subnormal number, and it must not require rounding. The
645 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
646 Floating-Point Arithmetic.
647 -------------------------------------------------------------------------------
650 roundAndPackFloatx80(
651 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
655 flag roundNearestEven, increment, isTiny;
656 int64 roundIncrement, roundMask, roundBits;
658 roundingMode = float_rounding_mode;
659 roundNearestEven = ( roundingMode == float_round_nearest_even );
660 if ( roundingPrecision == 80 ) goto precision80;
661 if ( roundingPrecision == 64 ) {
662 roundIncrement = LIT64( 0x0000000000000400 );
663 roundMask = LIT64( 0x00000000000007FF );
665 else if ( roundingPrecision == 32 ) {
666 roundIncrement = LIT64( 0x0000008000000000 );
667 roundMask = LIT64( 0x000000FFFFFFFFFF );
672 zSig0 |= ( zSig1 != 0 );
673 if ( ! roundNearestEven ) {
674 if ( roundingMode == float_round_to_zero ) {
678 roundIncrement = roundMask;
680 if ( roundingMode == float_round_up ) roundIncrement = 0;
683 if ( roundingMode == float_round_down ) roundIncrement = 0;
687 roundBits = zSig0 & roundMask;
688 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
689 if ( ( 0x7FFE < zExp )
690 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
696 ( float_detect_tininess == float_tininess_before_rounding )
698 || ( zSig0 <= zSig0 + roundIncrement );
699 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
701 roundBits = zSig0 & roundMask;
702 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
703 if ( roundBits ) float_exception_flags |= float_flag_inexact;
704 zSig0 += roundIncrement;
705 if ( (sbits64) zSig0 < 0 ) zExp = 1;
706 roundIncrement = roundMask + 1;
707 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
708 roundMask |= roundIncrement;
710 zSig0 &= ~ roundMask;
711 return packFloatx80( zSign, zExp, zSig0 );
714 if ( roundBits ) float_exception_flags |= float_flag_inexact;
715 zSig0 += roundIncrement;
716 if ( zSig0 < roundIncrement ) {
718 zSig0 = LIT64( 0x8000000000000000 );
720 roundIncrement = roundMask + 1;
721 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
722 roundMask |= roundIncrement;
724 zSig0 &= ~ roundMask;
725 if ( zSig0 == 0 ) zExp = 0;
726 return packFloatx80( zSign, zExp, zSig0 );
728 increment = ( (sbits64) zSig1 < 0 );
729 if ( ! roundNearestEven ) {
730 if ( roundingMode == float_round_to_zero ) {
735 increment = ( roundingMode == float_round_down ) && zSig1;
738 increment = ( roundingMode == float_round_up ) && zSig1;
742 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
743 if ( ( 0x7FFE < zExp )
744 || ( ( zExp == 0x7FFE )
745 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
751 float_raise( float_flag_overflow | float_flag_inexact );
752 if ( ( roundingMode == float_round_to_zero )
753 || ( zSign && ( roundingMode == float_round_up ) )
754 || ( ! zSign && ( roundingMode == float_round_down ) )
756 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
758 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
762 ( float_detect_tininess == float_tininess_before_rounding )
765 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
766 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
768 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
769 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
770 if ( roundNearestEven ) {
771 increment = ( (sbits64) zSig1 < 0 );
775 increment = ( roundingMode == float_round_down ) && zSig1;
778 increment = ( roundingMode == float_round_up ) && zSig1;
784 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
785 if ( (sbits64) zSig0 < 0 ) zExp = 1;
787 return packFloatx80( zSign, zExp, zSig0 );
790 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
795 zSig0 = LIT64( 0x8000000000000000 );
798 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
802 if ( zSig0 == 0 ) zExp = 0;
804 return packFloatx80( zSign, zExp, zSig0 );
809 -------------------------------------------------------------------------------
810 Takes an abstract floating-point value having sign `zSign', exponent
811 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
812 and returns the proper extended double-precision floating-point value
813 corresponding to the abstract input. This routine is just like
814 `roundAndPackFloatx80' except that the input significand does not have to be
816 -------------------------------------------------------------------------------
819 normalizeRoundAndPackFloatx80(
820 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
830 shiftCount = countLeadingZeros64( zSig0 );
831 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
843 -------------------------------------------------------------------------------
844 Returns the least-significant 64 fraction bits of the quadruple-precision
845 floating-point value `a'.
846 -------------------------------------------------------------------------------
848 INLINE bits64 extractFloat128Frac1( float128 a )
856 -------------------------------------------------------------------------------
857 Returns the most-significant 48 fraction bits of the quadruple-precision
858 floating-point value `a'.
859 -------------------------------------------------------------------------------
861 INLINE bits64 extractFloat128Frac0( float128 a )
864 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
869 -------------------------------------------------------------------------------
870 Returns the exponent bits of the quadruple-precision floating-point value
872 -------------------------------------------------------------------------------
874 INLINE int32 extractFloat128Exp( float128 a )
877 return ( a.high>>48 ) & 0x7FFF;
882 -------------------------------------------------------------------------------
883 Returns the sign bit of the quadruple-precision floating-point value `a'.
884 -------------------------------------------------------------------------------
886 INLINE flag extractFloat128Sign( float128 a )
894 -------------------------------------------------------------------------------
895 Normalizes the subnormal quadruple-precision floating-point value
896 represented by the denormalized significand formed by the concatenation of
897 `aSig0' and `aSig1'. The normalized exponent is stored at the location
898 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
899 significand are stored at the location pointed to by `zSig0Ptr', and the
900 least significant 64 bits of the normalized significand are stored at the
901 location pointed to by `zSig1Ptr'.
902 -------------------------------------------------------------------------------
905 normalizeFloat128Subnormal(
916 shiftCount = countLeadingZeros64( aSig1 ) - 15;
917 if ( shiftCount < 0 ) {
918 *zSig0Ptr = aSig1>>( - shiftCount );
919 *zSig1Ptr = aSig1<<( shiftCount & 63 );
922 *zSig0Ptr = aSig1<<shiftCount;
925 *zExpPtr = - shiftCount - 63;
928 shiftCount = countLeadingZeros64( aSig0 ) - 15;
929 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
930 *zExpPtr = 1 - shiftCount;
936 -------------------------------------------------------------------------------
937 Packs the sign `zSign', the exponent `zExp', and the significand formed
938 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
939 floating-point value, returning the result. After being shifted into the
940 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
941 added together to form the most significant 32 bits of the result. This
942 means that any integer portion of `zSig0' will be added into the exponent.
943 Since a properly normalized significand will have an integer portion equal
944 to 1, the `zExp' input should be 1 less than the desired result exponent
945 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947 -------------------------------------------------------------------------------
950 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
955 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
961 -------------------------------------------------------------------------------
962 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963 and extended significand formed by the concatenation of `zSig0', `zSig1',
964 and `zSig2', and returns the proper quadruple-precision floating-point value
965 corresponding to the abstract input. Ordinarily, the abstract value is
966 simply rounded and packed into the quadruple-precision format, with the
967 inexact exception raised if the abstract input cannot be represented
968 exactly. However, if the abstract value is too large, the overflow and
969 inexact exceptions are raised and an infinity or maximal finite value is
970 returned. If the abstract value is too small, the input value is rounded to
971 a subnormal number, and the underflow and inexact exceptions are raised if
972 the abstract input cannot be represented exactly as a subnormal quadruple-
973 precision floating-point number.
974 The input significand must be normalized or smaller. If the input
975 significand is not normalized, `zExp' must be 0; in that case, the result
976 returned is a subnormal number, and it must not require rounding. In the
977 usual case that the input significand is normalized, `zExp' must be 1 less
978 than the ``true'' floating-point exponent. The handling of underflow and
979 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980 -------------------------------------------------------------------------------
983 roundAndPackFloat128(
984 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
987 flag roundNearestEven, increment, isTiny;
989 roundingMode = float_rounding_mode;
990 roundNearestEven = ( roundingMode == float_round_nearest_even );
991 increment = ( (sbits64) zSig2 < 0 );
992 if ( ! roundNearestEven ) {
993 if ( roundingMode == float_round_to_zero ) {
998 increment = ( roundingMode == float_round_down ) && zSig2;
1001 increment = ( roundingMode == float_round_up ) && zSig2;
1005 if ( 0x7FFD <= (bits32) zExp ) {
1006 if ( ( 0x7FFD < zExp )
1007 || ( ( zExp == 0x7FFD )
1009 LIT64( 0x0001FFFFFFFFFFFF ),
1010 LIT64( 0xFFFFFFFFFFFFFFFF ),
1017 float_raise( float_flag_overflow | float_flag_inexact );
1018 if ( ( roundingMode == float_round_to_zero )
1019 || ( zSign && ( roundingMode == float_round_up ) )
1020 || ( ! zSign && ( roundingMode == float_round_down ) )
1026 LIT64( 0x0000FFFFFFFFFFFF ),
1027 LIT64( 0xFFFFFFFFFFFFFFFF )
1030 return packFloat128( zSign, 0x7FFF, 0, 0 );
1034 ( float_detect_tininess == float_tininess_before_rounding )
1040 LIT64( 0x0001FFFFFFFFFFFF ),
1041 LIT64( 0xFFFFFFFFFFFFFFFF )
1043 shift128ExtraRightJamming(
1044 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1047 if ( roundNearestEven ) {
1048 increment = ( (sbits64) zSig2 < 0 );
1052 increment = ( roundingMode == float_round_down ) && zSig2;
1055 increment = ( roundingMode == float_round_up ) && zSig2;
1060 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1062 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1063 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068 return packFloat128( zSign, zExp, zSig0, zSig1 );
1073 -------------------------------------------------------------------------------
1074 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 returns the proper quadruple-precision floating-point value corresponding
1077 to the abstract input. This routine is just like `roundAndPackFloat128'
1078 except that the input significand has fewer bits and does not have to be
1079 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1081 -------------------------------------------------------------------------------
1084 normalizeRoundAndPackFloat128(
1085 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1095 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096 if ( 0 <= shiftCount ) {
1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101 shift128ExtraRightJamming(
1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1112 -------------------------------------------------------------------------------
1113 Returns the result of converting the 32-bit two's complement integer `a'
1114 to the single-precision floating-point format. The conversion is performed
1115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1116 -------------------------------------------------------------------------------
1118 float32 int32_to_float32( int32 a )
1122 if ( a == 0 ) return 0;
1123 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1125 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1130 -------------------------------------------------------------------------------
1131 Returns the result of converting the 32-bit two's complement integer `a'
1132 to the double-precision floating-point format. The conversion is performed
1133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134 -------------------------------------------------------------------------------
1136 float64 int32_to_float64( int32 a )
1143 if ( a == 0 ) return 0;
1145 absA = zSign ? - a : a;
1146 shiftCount = countLeadingZeros32( absA ) + 21;
1148 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1155 -------------------------------------------------------------------------------
1156 Returns the result of converting the 32-bit two's complement integer `a'
1157 to the extended double-precision floating-point format. The conversion
1158 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1160 -------------------------------------------------------------------------------
1162 floatx80 int32_to_floatx80( int32 a )
1169 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1171 absA = zSign ? - a : a;
1172 shiftCount = countLeadingZeros32( absA ) + 32;
1174 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1183 -------------------------------------------------------------------------------
1184 Returns the result of converting the 32-bit two's complement integer `a' to
1185 the quadruple-precision floating-point format. The conversion is performed
1186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187 -------------------------------------------------------------------------------
1189 float128 int32_to_float128( int32 a )
1196 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1198 absA = zSign ? - a : a;
1199 shiftCount = countLeadingZeros32( absA ) + 17;
1201 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1207 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1209 -------------------------------------------------------------------------------
1210 Returns the result of converting the 64-bit two's complement integer `a'
1211 to the single-precision floating-point format. The conversion is performed
1212 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1213 -------------------------------------------------------------------------------
1215 float32 int64_to_float32( int64 a )
1221 if ( a == 0 ) return 0;
1223 absA = zSign ? - a : a;
1224 shiftCount = countLeadingZeros64( absA ) - 40;
1225 if ( 0 <= shiftCount ) {
1226 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1230 if ( shiftCount < 0 ) {
1231 shift64RightJamming( absA, - shiftCount, &absA );
1234 absA <<= shiftCount;
1236 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1242 -------------------------------------------------------------------------------
1243 Returns the result of converting the 64-bit two's complement integer `a'
1244 to the double-precision floating-point format. The conversion is performed
1245 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1246 -------------------------------------------------------------------------------
1248 float64 int64_to_float64( int64 a )
1252 if ( a == 0 ) return 0;
1253 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1254 return packFloat64( 1, 0x43E, 0 );
1257 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1264 -------------------------------------------------------------------------------
1265 Returns the result of converting the 64-bit two's complement integer `a'
1266 to the extended double-precision floating-point format. The conversion
1267 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1269 -------------------------------------------------------------------------------
1271 floatx80 int64_to_floatx80( int64 a )
1277 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1279 absA = zSign ? - a : a;
1280 shiftCount = countLeadingZeros64( absA );
1281 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1287 #endif /* !SOFTFLOAT_FOR_GCC */
1292 -------------------------------------------------------------------------------
1293 Returns the result of converting the 64-bit two's complement integer `a' to
1294 the quadruple-precision floating-point format. The conversion is performed
1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 -------------------------------------------------------------------------------
1298 float128 int64_to_float128( int64 a )
1304 bits64 zSig0, zSig1;
1306 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1308 absA = zSign ? - a : a;
1309 shiftCount = countLeadingZeros64( absA ) + 49;
1310 zExp = 0x406E - shiftCount;
1311 if ( 64 <= shiftCount ) {
1320 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1321 return packFloat128( zSign, zExp, zSig0, zSig1 );
1327 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1329 -------------------------------------------------------------------------------
1330 Returns the result of converting the single-precision floating-point value
1331 `a' to the 32-bit two's complement integer format. The conversion is
1332 performed according to the IEC/IEEE Standard for Binary Floating-Point
1333 Arithmetic---which means in particular that the conversion is rounded
1334 according to the current rounding mode. If `a' is a NaN, the largest
1335 positive integer is returned. Otherwise, if the conversion overflows, the
1336 largest integer with the same sign as `a' is returned.
1337 -------------------------------------------------------------------------------
1339 int32 float32_to_int32( float32 a )
1342 int16 aExp, shiftCount;
1346 aSig = extractFloat32Frac( a );
1347 aExp = extractFloat32Exp( a );
1348 aSign = extractFloat32Sign( a );
1349 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1350 if ( aExp ) aSig |= 0x00800000;
1351 shiftCount = 0xAF - aExp;
1354 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1355 return roundAndPackInt32( aSign, aSig64 );
1358 #endif /* !SOFTFLOAT_FOR_GCC */
1361 -------------------------------------------------------------------------------
1362 Returns the result of converting the single-precision floating-point value
1363 `a' to the 32-bit two's complement integer format. The conversion is
1364 performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 Arithmetic, except that the conversion is always rounded toward zero.
1366 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1367 the conversion overflows, the largest integer with the same sign as `a' is
1369 -------------------------------------------------------------------------------
1371 int32 float32_to_int32_round_to_zero( float32 a )
1374 int16 aExp, shiftCount;
1378 aSig = extractFloat32Frac( a );
1379 aExp = extractFloat32Exp( a );
1380 aSign = extractFloat32Sign( a );
1381 shiftCount = aExp - 0x9E;
1382 if ( 0 <= shiftCount ) {
1383 if ( a != 0xCF000000 ) {
1384 float_raise( float_flag_invalid );
1385 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1387 return (sbits32) 0x80000000;
1389 else if ( aExp <= 0x7E ) {
1390 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1393 aSig = ( aSig | 0x00800000 )<<8;
1394 z = aSig>>( - shiftCount );
1395 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1396 float_exception_flags |= float_flag_inexact;
1398 if ( aSign ) z = - z;
1403 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the single-precision floating-point value
1407 `a' to the 64-bit two's complement integer format. The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic---which means in particular that the conversion is rounded
1410 according to the current rounding mode. If `a' is a NaN, the largest
1411 positive integer is returned. Otherwise, if the conversion overflows, the
1412 largest integer with the same sign as `a' is returned.
1413 -------------------------------------------------------------------------------
1415 int64 float32_to_int64( float32 a )
1418 int16 aExp, shiftCount;
1420 bits64 aSig64, aSigExtra;
1422 aSig = extractFloat32Frac( a );
1423 aExp = extractFloat32Exp( a );
1424 aSign = extractFloat32Sign( a );
1425 shiftCount = 0xBE - aExp;
1426 if ( shiftCount < 0 ) {
1427 float_raise( float_flag_invalid );
1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429 return LIT64( 0x7FFFFFFFFFFFFFFF );
1431 return (sbits64) LIT64( 0x8000000000000000 );
1433 if ( aExp ) aSig |= 0x00800000;
1436 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1437 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1442 -------------------------------------------------------------------------------
1443 Returns the result of converting the single-precision floating-point value
1444 `a' to the 64-bit two's complement integer format. The conversion is
1445 performed according to the IEC/IEEE Standard for Binary Floating-Point
1446 Arithmetic, except that the conversion is always rounded toward zero. If
1447 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1448 conversion overflows, the largest integer with the same sign as `a' is
1450 -------------------------------------------------------------------------------
1452 int64 float32_to_int64_round_to_zero( float32 a )
1455 int16 aExp, shiftCount;
1460 aSig = extractFloat32Frac( a );
1461 aExp = extractFloat32Exp( a );
1462 aSign = extractFloat32Sign( a );
1463 shiftCount = aExp - 0xBE;
1464 if ( 0 <= shiftCount ) {
1465 if ( a != 0xDF000000 ) {
1466 float_raise( float_flag_invalid );
1467 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1468 return LIT64( 0x7FFFFFFFFFFFFFFF );
1471 return (sbits64) LIT64( 0x8000000000000000 );
1473 else if ( aExp <= 0x7E ) {
1474 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1477 aSig64 = aSig | 0x00800000;
1479 z = aSig64>>( - shiftCount );
1480 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1481 float_exception_flags |= float_flag_inexact;
1483 if ( aSign ) z = - z;
1487 #endif /* !SOFTFLOAT_FOR_GCC */
1490 -------------------------------------------------------------------------------
1491 Returns the result of converting the single-precision floating-point value
1492 `a' to the double-precision floating-point format. The conversion is
1493 performed according to the IEC/IEEE Standard for Binary Floating-Point
1495 -------------------------------------------------------------------------------
1497 float64 float32_to_float64( float32 a )
1503 aSig = extractFloat32Frac( a );
1504 aExp = extractFloat32Exp( a );
1505 aSign = extractFloat32Sign( a );
1506 if ( aExp == 0xFF ) {
1507 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1508 return packFloat64( aSign, 0x7FF, 0 );
1511 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1512 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1515 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1522 -------------------------------------------------------------------------------
1523 Returns the result of converting the single-precision floating-point value
1524 `a' to the extended double-precision floating-point format. The conversion
1525 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1527 -------------------------------------------------------------------------------
1529 floatx80 float32_to_floatx80( float32 a )
1535 aSig = extractFloat32Frac( a );
1536 aExp = extractFloat32Exp( a );
1537 aSign = extractFloat32Sign( a );
1538 if ( aExp == 0xFF ) {
1539 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1540 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1543 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1544 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1547 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1556 -------------------------------------------------------------------------------
1557 Returns the result of converting the single-precision floating-point value
1558 `a' to the double-precision floating-point format. The conversion is
1559 performed according to the IEC/IEEE Standard for Binary Floating-Point
1561 -------------------------------------------------------------------------------
1563 float128 float32_to_float128( float32 a )
1569 aSig = extractFloat32Frac( a );
1570 aExp = extractFloat32Exp( a );
1571 aSign = extractFloat32Sign( a );
1572 if ( aExp == 0xFF ) {
1573 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1574 return packFloat128( aSign, 0x7FFF, 0, 0 );
1577 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1578 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1587 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1589 -------------------------------------------------------------------------------
1590 Rounds the single-precision floating-point value `a' to an integer, and
1591 returns the result as a single-precision floating-point value. The
1592 operation is performed according to the IEC/IEEE Standard for Binary
1593 Floating-Point Arithmetic.
1594 -------------------------------------------------------------------------------
1596 float32 float32_round_to_int( float32 a )
1600 bits32 lastBitMask, roundBitsMask;
1604 aExp = extractFloat32Exp( a );
1605 if ( 0x96 <= aExp ) {
1606 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1607 return propagateFloat32NaN( a, a );
1611 if ( aExp <= 0x7E ) {
1612 if ( (bits32) ( a<<1 ) == 0 ) return a;
1613 float_exception_flags |= float_flag_inexact;
1614 aSign = extractFloat32Sign( a );
1615 switch ( float_rounding_mode ) {
1616 case float_round_nearest_even:
1617 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1618 return packFloat32( aSign, 0x7F, 0 );
1621 case float_round_to_zero:
1623 case float_round_down:
1624 return aSign ? 0xBF800000 : 0;
1625 case float_round_up:
1626 return aSign ? 0x80000000 : 0x3F800000;
1628 return packFloat32( aSign, 0, 0 );
1631 lastBitMask <<= 0x96 - aExp;
1632 roundBitsMask = lastBitMask - 1;
1634 roundingMode = float_rounding_mode;
1635 if ( roundingMode == float_round_nearest_even ) {
1636 z += lastBitMask>>1;
1637 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1639 else if ( roundingMode != float_round_to_zero ) {
1640 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1644 z &= ~ roundBitsMask;
1645 if ( z != a ) float_exception_flags |= float_flag_inexact;
1649 #endif /* !SOFTFLOAT_FOR_GCC */
1652 -------------------------------------------------------------------------------
1653 Returns the result of adding the absolute values of the single-precision
1654 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1655 before being returned. `zSign' is ignored if the result is a NaN.
1656 The addition is performed according to the IEC/IEEE Standard for Binary
1657 Floating-Point Arithmetic.
1658 -------------------------------------------------------------------------------
1660 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1662 int16 aExp, bExp, zExp;
1663 bits32 aSig, bSig, zSig;
1666 aSig = extractFloat32Frac( a );
1667 aExp = extractFloat32Exp( a );
1668 bSig = extractFloat32Frac( b );
1669 bExp = extractFloat32Exp( b );
1670 expDiff = aExp - bExp;
1673 if ( 0 < expDiff ) {
1674 if ( aExp == 0xFF ) {
1675 if ( aSig ) return propagateFloat32NaN( a, b );
1684 shift32RightJamming( bSig, expDiff, &bSig );
1687 else if ( expDiff < 0 ) {
1688 if ( bExp == 0xFF ) {
1689 if ( bSig ) return propagateFloat32NaN( a, b );
1690 return packFloat32( zSign, 0xFF, 0 );
1698 shift32RightJamming( aSig, - expDiff, &aSig );
1702 if ( aExp == 0xFF ) {
1703 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1706 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1707 zSig = 0x40000000 + aSig + bSig;
1712 zSig = ( aSig + bSig )<<1;
1714 if ( (sbits32) zSig < 0 ) {
1719 return roundAndPackFloat32( zSign, zExp, zSig );
1724 -------------------------------------------------------------------------------
1725 Returns the result of subtracting the absolute values of the single-
1726 precision floating-point values `a' and `b'. If `zSign' is 1, the
1727 difference is negated before being returned. `zSign' is ignored if the
1728 result is a NaN. The subtraction is performed according to the IEC/IEEE
1729 Standard for Binary Floating-Point Arithmetic.
1730 -------------------------------------------------------------------------------
1732 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1734 int16 aExp, bExp, zExp;
1735 bits32 aSig, bSig, zSig;
1738 aSig = extractFloat32Frac( a );
1739 aExp = extractFloat32Exp( a );
1740 bSig = extractFloat32Frac( b );
1741 bExp = extractFloat32Exp( b );
1742 expDiff = aExp - bExp;
1745 if ( 0 < expDiff ) goto aExpBigger;
1746 if ( expDiff < 0 ) goto bExpBigger;
1747 if ( aExp == 0xFF ) {
1748 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1749 float_raise( float_flag_invalid );
1750 return float32_default_nan;
1756 if ( bSig < aSig ) goto aBigger;
1757 if ( aSig < bSig ) goto bBigger;
1758 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1760 if ( bExp == 0xFF ) {
1761 if ( bSig ) return propagateFloat32NaN( a, b );
1762 return packFloat32( zSign ^ 1, 0xFF, 0 );
1770 shift32RightJamming( aSig, - expDiff, &aSig );
1776 goto normalizeRoundAndPack;
1778 if ( aExp == 0xFF ) {
1779 if ( aSig ) return propagateFloat32NaN( a, b );
1788 shift32RightJamming( bSig, expDiff, &bSig );
1793 normalizeRoundAndPack:
1795 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1800 -------------------------------------------------------------------------------
1801 Returns the result of adding the single-precision floating-point values `a'
1802 and `b'. The operation is performed according to the IEC/IEEE Standard for
1803 Binary Floating-Point Arithmetic.
1804 -------------------------------------------------------------------------------
1806 float32 float32_add( float32 a, float32 b )
1810 aSign = extractFloat32Sign( a );
1811 bSign = extractFloat32Sign( b );
1812 if ( aSign == bSign ) {
1813 return addFloat32Sigs( a, b, aSign );
1816 return subFloat32Sigs( a, b, aSign );
1822 -------------------------------------------------------------------------------
1823 Returns the result of subtracting the single-precision floating-point values
1824 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1825 for Binary Floating-Point Arithmetic.
1826 -------------------------------------------------------------------------------
1828 float32 float32_sub( float32 a, float32 b )
1832 aSign = extractFloat32Sign( a );
1833 bSign = extractFloat32Sign( b );
1834 if ( aSign == bSign ) {
1835 return subFloat32Sigs( a, b, aSign );
1838 return addFloat32Sigs( a, b, aSign );
1844 -------------------------------------------------------------------------------
1845 Returns the result of multiplying the single-precision floating-point values
1846 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1847 for Binary Floating-Point Arithmetic.
1848 -------------------------------------------------------------------------------
1850 float32 float32_mul( float32 a, float32 b )
1852 flag aSign, bSign, zSign;
1853 int16 aExp, bExp, zExp;
1858 aSig = extractFloat32Frac( a );
1859 aExp = extractFloat32Exp( a );
1860 aSign = extractFloat32Sign( a );
1861 bSig = extractFloat32Frac( b );
1862 bExp = extractFloat32Exp( b );
1863 bSign = extractFloat32Sign( b );
1864 zSign = aSign ^ bSign;
1865 if ( aExp == 0xFF ) {
1866 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1867 return propagateFloat32NaN( a, b );
1869 if ( ( bExp | bSig ) == 0 ) {
1870 float_raise( float_flag_invalid );
1871 return float32_default_nan;
1873 return packFloat32( zSign, 0xFF, 0 );
1875 if ( bExp == 0xFF ) {
1876 if ( bSig ) return propagateFloat32NaN( a, b );
1877 if ( ( aExp | aSig ) == 0 ) {
1878 float_raise( float_flag_invalid );
1879 return float32_default_nan;
1881 return packFloat32( zSign, 0xFF, 0 );
1884 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1885 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1888 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1889 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1891 zExp = aExp + bExp - 0x7F;
1892 aSig = ( aSig | 0x00800000 )<<7;
1893 bSig = ( bSig | 0x00800000 )<<8;
1894 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1896 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1900 return roundAndPackFloat32( zSign, zExp, zSig );
1905 -------------------------------------------------------------------------------
1906 Returns the result of dividing the single-precision floating-point value `a'
1907 by the corresponding value `b'. The operation is performed according to the
1908 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 -------------------------------------------------------------------------------
1911 float32 float32_div( float32 a, float32 b )
1913 flag aSign, bSign, zSign;
1914 int16 aExp, bExp, zExp;
1915 bits32 aSig, bSig, zSig;
1917 aSig = extractFloat32Frac( a );
1918 aExp = extractFloat32Exp( a );
1919 aSign = extractFloat32Sign( a );
1920 bSig = extractFloat32Frac( b );
1921 bExp = extractFloat32Exp( b );
1922 bSign = extractFloat32Sign( b );
1923 zSign = aSign ^ bSign;
1924 if ( aExp == 0xFF ) {
1925 if ( aSig ) return propagateFloat32NaN( a, b );
1926 if ( bExp == 0xFF ) {
1927 if ( bSig ) return propagateFloat32NaN( a, b );
1928 float_raise( float_flag_invalid );
1929 return float32_default_nan;
1931 return packFloat32( zSign, 0xFF, 0 );
1933 if ( bExp == 0xFF ) {
1934 if ( bSig ) return propagateFloat32NaN( a, b );
1935 return packFloat32( zSign, 0, 0 );
1939 if ( ( aExp | aSig ) == 0 ) {
1940 float_raise( float_flag_invalid );
1941 return float32_default_nan;
1943 float_raise( float_flag_divbyzero );
1944 return packFloat32( zSign, 0xFF, 0 );
1946 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1949 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1950 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1952 zExp = aExp - bExp + 0x7D;
1953 aSig = ( aSig | 0x00800000 )<<7;
1954 bSig = ( bSig | 0x00800000 )<<8;
1955 if ( bSig <= ( aSig + aSig ) ) {
1959 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1960 if ( ( zSig & 0x3F ) == 0 ) {
1961 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1963 return roundAndPackFloat32( zSign, zExp, zSig );
1967 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1969 -------------------------------------------------------------------------------
1970 Returns the remainder of the single-precision floating-point value `a'
1971 with respect to the corresponding value `b'. The operation is performed
1972 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1973 -------------------------------------------------------------------------------
1975 float32 float32_rem( float32 a, float32 b )
1977 flag aSign, bSign, zSign;
1978 int16 aExp, bExp, expDiff;
1981 bits64 aSig64, bSig64, q64;
1982 bits32 alternateASig;
1985 aSig = extractFloat32Frac( a );
1986 aExp = extractFloat32Exp( a );
1987 aSign = extractFloat32Sign( a );
1988 bSig = extractFloat32Frac( b );
1989 bExp = extractFloat32Exp( b );
1990 bSign = extractFloat32Sign( b );
1991 if ( aExp == 0xFF ) {
1992 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1993 return propagateFloat32NaN( a, b );
1995 float_raise( float_flag_invalid );
1996 return float32_default_nan;
1998 if ( bExp == 0xFF ) {
1999 if ( bSig ) return propagateFloat32NaN( a, b );
2004 float_raise( float_flag_invalid );
2005 return float32_default_nan;
2007 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2010 if ( aSig == 0 ) return a;
2011 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2013 expDiff = aExp - bExp;
2016 if ( expDiff < 32 ) {
2019 if ( expDiff < 0 ) {
2020 if ( expDiff < -1 ) return a;
2023 q = ( bSig <= aSig );
2024 if ( q ) aSig -= bSig;
2025 if ( 0 < expDiff ) {
2026 q = ( ( (bits64) aSig )<<32 ) / bSig;
2029 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2037 if ( bSig <= aSig ) aSig -= bSig;
2038 aSig64 = ( (bits64) aSig )<<40;
2039 bSig64 = ( (bits64) bSig )<<40;
2041 while ( 0 < expDiff ) {
2042 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2043 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2044 aSig64 = - ( ( bSig * q64 )<<38 );
2048 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2049 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2050 q = q64>>( 64 - expDiff );
2052 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2055 alternateASig = aSig;
2058 } while ( 0 <= (sbits32) aSig );
2059 sigMean = aSig + alternateASig;
2060 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2061 aSig = alternateASig;
2063 zSign = ( (sbits32) aSig < 0 );
2064 if ( zSign ) aSig = - aSig;
2065 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2068 #endif /* !SOFTFLOAT_FOR_GCC */
2070 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2072 -------------------------------------------------------------------------------
2073 Returns the square root of the single-precision floating-point value `a'.
2074 The operation is performed according to the IEC/IEEE Standard for Binary
2075 Floating-Point Arithmetic.
2076 -------------------------------------------------------------------------------
2078 float32 float32_sqrt( float32 a )
2085 aSig = extractFloat32Frac( a );
2086 aExp = extractFloat32Exp( a );
2087 aSign = extractFloat32Sign( a );
2088 if ( aExp == 0xFF ) {
2089 if ( aSig ) return propagateFloat32NaN( a, 0 );
2090 if ( ! aSign ) return a;
2091 float_raise( float_flag_invalid );
2092 return float32_default_nan;
2095 if ( ( aExp | aSig ) == 0 ) return a;
2096 float_raise( float_flag_invalid );
2097 return float32_default_nan;
2100 if ( aSig == 0 ) return 0;
2101 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2103 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2104 aSig = ( aSig | 0x00800000 )<<8;
2105 zSig = estimateSqrt32( aExp, aSig ) + 2;
2106 if ( ( zSig & 0x7F ) <= 5 ) {
2112 term = ( (bits64) zSig ) * zSig;
2113 rem = ( ( (bits64) aSig )<<32 ) - term;
2114 while ( (sbits64) rem < 0 ) {
2116 rem += ( ( (bits64) zSig )<<1 ) | 1;
2118 zSig |= ( rem != 0 );
2120 shift32RightJamming( zSig, 1, &zSig );
2122 return roundAndPackFloat32( 0, zExp, zSig );
2125 #endif /* !SOFTFLOAT_FOR_GCC */
2128 -------------------------------------------------------------------------------
2129 Returns 1 if the single-precision floating-point value `a' is equal to
2130 the corresponding value `b', and 0 otherwise. The comparison is performed
2131 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2132 -------------------------------------------------------------------------------
2134 flag float32_eq( float32 a, float32 b )
2137 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2138 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2140 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2141 float_raise( float_flag_invalid );
2145 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2150 -------------------------------------------------------------------------------
2151 Returns 1 if the single-precision floating-point value `a' is less than
2152 or equal to the corresponding value `b', and 0 otherwise. The comparison
2153 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2155 -------------------------------------------------------------------------------
2157 flag float32_le( float32 a, float32 b )
2161 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2162 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2164 float_raise( float_flag_invalid );
2167 aSign = extractFloat32Sign( a );
2168 bSign = extractFloat32Sign( b );
2169 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2170 return ( a == b ) || ( aSign ^ ( a < b ) );
2175 -------------------------------------------------------------------------------
2176 Returns 1 if the single-precision floating-point value `a' is less than
2177 the corresponding value `b', and 0 otherwise. The comparison is performed
2178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2179 -------------------------------------------------------------------------------
2181 flag float32_lt( float32 a, float32 b )
2185 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2186 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2188 float_raise( float_flag_invalid );
2191 aSign = extractFloat32Sign( a );
2192 bSign = extractFloat32Sign( b );
2193 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2194 return ( a != b ) && ( aSign ^ ( a < b ) );
2198 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2200 -------------------------------------------------------------------------------
2201 Returns 1 if the single-precision floating-point value `a' is equal to
2202 the corresponding value `b', and 0 otherwise. The invalid exception is
2203 raised if either operand is a NaN. Otherwise, the comparison is performed
2204 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2205 -------------------------------------------------------------------------------
2207 flag float32_eq_signaling( float32 a, float32 b )
2210 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2211 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2213 float_raise( float_flag_invalid );
2216 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2221 -------------------------------------------------------------------------------
2222 Returns 1 if the single-precision floating-point value `a' is less than or
2223 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2224 cause an exception. Otherwise, the comparison is performed according to the
2225 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2226 -------------------------------------------------------------------------------
2228 flag float32_le_quiet( float32 a, float32 b )
2232 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2233 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2235 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2236 float_raise( float_flag_invalid );
2240 aSign = extractFloat32Sign( a );
2241 bSign = extractFloat32Sign( b );
2242 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2243 return ( a == b ) || ( aSign ^ ( a < b ) );
2248 -------------------------------------------------------------------------------
2249 Returns 1 if the single-precision floating-point value `a' is less than
2250 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2251 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2252 Standard for Binary Floating-Point Arithmetic.
2253 -------------------------------------------------------------------------------
2255 flag float32_lt_quiet( float32 a, float32 b )
2259 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2260 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2262 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2263 float_raise( float_flag_invalid );
2267 aSign = extractFloat32Sign( a );
2268 bSign = extractFloat32Sign( b );
2269 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2270 return ( a != b ) && ( aSign ^ ( a < b ) );
2273 #endif /* !SOFTFLOAT_FOR_GCC */
2275 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2277 -------------------------------------------------------------------------------
2278 Returns the result of converting the double-precision floating-point value
2279 `a' to the 32-bit two's complement integer format. The conversion is
2280 performed according to the IEC/IEEE Standard for Binary Floating-Point
2281 Arithmetic---which means in particular that the conversion is rounded
2282 according to the current rounding mode. If `a' is a NaN, the largest
2283 positive integer is returned. Otherwise, if the conversion overflows, the
2284 largest integer with the same sign as `a' is returned.
2285 -------------------------------------------------------------------------------
2287 int32 float64_to_int32( float64 a )
2290 int16 aExp, shiftCount;
2293 aSig = extractFloat64Frac( a );
2294 aExp = extractFloat64Exp( a );
2295 aSign = extractFloat64Sign( a );
2296 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2297 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2298 shiftCount = 0x42C - aExp;
2299 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2300 return roundAndPackInt32( aSign, aSig );
2303 #endif /* !SOFTFLOAT_FOR_GCC */
2306 -------------------------------------------------------------------------------
2307 Returns the result of converting the double-precision floating-point value
2308 `a' to the 32-bit two's complement integer format. The conversion is
2309 performed according to the IEC/IEEE Standard for Binary Floating-Point
2310 Arithmetic, except that the conversion is always rounded toward zero.
2311 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2312 the conversion overflows, the largest integer with the same sign as `a' is
2314 -------------------------------------------------------------------------------
2316 int32 float64_to_int32_round_to_zero( float64 a )
2319 int16 aExp, shiftCount;
2320 bits64 aSig, savedASig;
2323 aSig = extractFloat64Frac( a );
2324 aExp = extractFloat64Exp( a );
2325 aSign = extractFloat64Sign( a );
2326 if ( 0x41E < aExp ) {
2327 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2330 else if ( aExp < 0x3FF ) {
2331 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2334 aSig |= LIT64( 0x0010000000000000 );
2335 shiftCount = 0x433 - aExp;
2337 aSig >>= shiftCount;
2339 if ( aSign ) z = - z;
2340 if ( ( z < 0 ) ^ aSign ) {
2342 float_raise( float_flag_invalid );
2343 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2345 if ( ( aSig<<shiftCount ) != savedASig ) {
2346 float_exception_flags |= float_flag_inexact;
2352 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2354 -------------------------------------------------------------------------------
2355 Returns the result of converting the double-precision floating-point value
2356 `a' to the 64-bit two's complement integer format. The conversion is
2357 performed according to the IEC/IEEE Standard for Binary Floating-Point
2358 Arithmetic---which means in particular that the conversion is rounded
2359 according to the current rounding mode. If `a' is a NaN, the largest
2360 positive integer is returned. Otherwise, if the conversion overflows, the
2361 largest integer with the same sign as `a' is returned.
2362 -------------------------------------------------------------------------------
2364 int64 float64_to_int64( float64 a )
2367 int16 aExp, shiftCount;
2368 bits64 aSig, aSigExtra;
2370 aSig = extractFloat64Frac( a );
2371 aExp = extractFloat64Exp( a );
2372 aSign = extractFloat64Sign( a );
2373 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2374 shiftCount = 0x433 - aExp;
2375 if ( shiftCount <= 0 ) {
2376 if ( 0x43E < aExp ) {
2377 float_raise( float_flag_invalid );
2379 || ( ( aExp == 0x7FF )
2380 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2382 return LIT64( 0x7FFFFFFFFFFFFFFF );
2384 return (sbits64) LIT64( 0x8000000000000000 );
2387 aSig <<= - shiftCount;
2390 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2392 return roundAndPackInt64( aSign, aSig, aSigExtra );
2397 -------------------------------------------------------------------------------
2398 Returns the result of converting the double-precision floating-point value
2399 `a' to the 64-bit two's complement integer format. The conversion is
2400 performed according to the IEC/IEEE Standard for Binary Floating-Point
2401 Arithmetic, except that the conversion is always rounded toward zero.
2402 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2403 the conversion overflows, the largest integer with the same sign as `a' is
2405 -------------------------------------------------------------------------------
2407 int64 float64_to_int64_round_to_zero( float64 a )
2410 int16 aExp, shiftCount;
2414 aSig = extractFloat64Frac( a );
2415 aExp = extractFloat64Exp( a );
2416 aSign = extractFloat64Sign( a );
2417 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418 shiftCount = aExp - 0x433;
2419 if ( 0 <= shiftCount ) {
2420 if ( 0x43E <= aExp ) {
2421 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2422 float_raise( float_flag_invalid );
2424 || ( ( aExp == 0x7FF )
2425 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2427 return LIT64( 0x7FFFFFFFFFFFFFFF );
2430 return (sbits64) LIT64( 0x8000000000000000 );
2432 z = aSig<<shiftCount;
2435 if ( aExp < 0x3FE ) {
2436 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2439 z = aSig>>( - shiftCount );
2440 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2441 float_exception_flags |= float_flag_inexact;
2444 if ( aSign ) z = - z;
2448 #endif /* !SOFTFLOAT_FOR_GCC */
2451 -------------------------------------------------------------------------------
2452 Returns the result of converting the double-precision floating-point value
2453 `a' to the single-precision floating-point format. The conversion is
2454 performed according to the IEC/IEEE Standard for Binary Floating-Point
2456 -------------------------------------------------------------------------------
2458 float32 float64_to_float32( float64 a )
2465 aSig = extractFloat64Frac( a );
2466 aExp = extractFloat64Exp( a );
2467 aSign = extractFloat64Sign( a );
2468 if ( aExp == 0x7FF ) {
2469 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2470 return packFloat32( aSign, 0xFF, 0 );
2472 shift64RightJamming( aSig, 22, &aSig );
2474 if ( aExp || zSig ) {
2478 return roundAndPackFloat32( aSign, aExp, zSig );
2485 -------------------------------------------------------------------------------
2486 Returns the result of converting the double-precision floating-point value
2487 `a' to the extended double-precision floating-point format. The conversion
2488 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2490 -------------------------------------------------------------------------------
2492 floatx80 float64_to_floatx80( float64 a )
2498 aSig = extractFloat64Frac( a );
2499 aExp = extractFloat64Exp( a );
2500 aSign = extractFloat64Sign( a );
2501 if ( aExp == 0x7FF ) {
2502 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2503 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2506 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2507 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2511 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2520 -------------------------------------------------------------------------------
2521 Returns the result of converting the double-precision floating-point value
2522 `a' to the quadruple-precision floating-point format. The conversion is
2523 performed according to the IEC/IEEE Standard for Binary Floating-Point
2525 -------------------------------------------------------------------------------
2527 float128 float64_to_float128( float64 a )
2531 bits64 aSig, zSig0, zSig1;
2533 aSig = extractFloat64Frac( a );
2534 aExp = extractFloat64Exp( a );
2535 aSign = extractFloat64Sign( a );
2536 if ( aExp == 0x7FF ) {
2537 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2538 return packFloat128( aSign, 0x7FFF, 0, 0 );
2541 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2542 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2545 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2546 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2552 #ifndef SOFTFLOAT_FOR_GCC
2554 -------------------------------------------------------------------------------
2555 Rounds the double-precision floating-point value `a' to an integer, and
2556 returns the result as a double-precision floating-point value. The
2557 operation is performed according to the IEC/IEEE Standard for Binary
2558 Floating-Point Arithmetic.
2559 -------------------------------------------------------------------------------
2561 float64 float64_round_to_int( float64 a )
2565 bits64 lastBitMask, roundBitsMask;
2569 aExp = extractFloat64Exp( a );
2570 if ( 0x433 <= aExp ) {
2571 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2572 return propagateFloat64NaN( a, a );
2576 if ( aExp < 0x3FF ) {
2577 if ( (bits64) ( a<<1 ) == 0 ) return a;
2578 float_exception_flags |= float_flag_inexact;
2579 aSign = extractFloat64Sign( a );
2580 switch ( float_rounding_mode ) {
2581 case float_round_nearest_even:
2582 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2583 return packFloat64( aSign, 0x3FF, 0 );
2586 case float_round_to_zero:
2588 case float_round_down:
2589 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2590 case float_round_up:
2592 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2594 return packFloat64( aSign, 0, 0 );
2597 lastBitMask <<= 0x433 - aExp;
2598 roundBitsMask = lastBitMask - 1;
2600 roundingMode = float_rounding_mode;
2601 if ( roundingMode == float_round_nearest_even ) {
2602 z += lastBitMask>>1;
2603 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2605 else if ( roundingMode != float_round_to_zero ) {
2606 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2610 z &= ~ roundBitsMask;
2611 if ( z != a ) float_exception_flags |= float_flag_inexact;
2618 -------------------------------------------------------------------------------
2619 Returns the result of adding the absolute values of the double-precision
2620 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2621 before being returned. `zSign' is ignored if the result is a NaN.
2622 The addition is performed according to the IEC/IEEE Standard for Binary
2623 Floating-Point Arithmetic.
2624 -------------------------------------------------------------------------------
2626 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2628 int16 aExp, bExp, zExp;
2629 bits64 aSig, bSig, zSig;
2632 aSig = extractFloat64Frac( a );
2633 aExp = extractFloat64Exp( a );
2634 bSig = extractFloat64Frac( b );
2635 bExp = extractFloat64Exp( b );
2636 expDiff = aExp - bExp;
2639 if ( 0 < expDiff ) {
2640 if ( aExp == 0x7FF ) {
2641 if ( aSig ) return propagateFloat64NaN( a, b );
2648 bSig |= LIT64( 0x2000000000000000 );
2650 shift64RightJamming( bSig, expDiff, &bSig );
2653 else if ( expDiff < 0 ) {
2654 if ( bExp == 0x7FF ) {
2655 if ( bSig ) return propagateFloat64NaN( a, b );
2656 return packFloat64( zSign, 0x7FF, 0 );
2662 aSig |= LIT64( 0x2000000000000000 );
2664 shift64RightJamming( aSig, - expDiff, &aSig );
2668 if ( aExp == 0x7FF ) {
2669 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2672 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2673 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2677 aSig |= LIT64( 0x2000000000000000 );
2678 zSig = ( aSig + bSig )<<1;
2680 if ( (sbits64) zSig < 0 ) {
2685 return roundAndPackFloat64( zSign, zExp, zSig );
2690 -------------------------------------------------------------------------------
2691 Returns the result of subtracting the absolute values of the double-
2692 precision floating-point values `a' and `b'. If `zSign' is 1, the
2693 difference is negated before being returned. `zSign' is ignored if the
2694 result is a NaN. The subtraction is performed according to the IEC/IEEE
2695 Standard for Binary Floating-Point Arithmetic.
2696 -------------------------------------------------------------------------------
2698 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2700 int16 aExp, bExp, zExp;
2701 bits64 aSig, bSig, zSig;
2704 aSig = extractFloat64Frac( a );
2705 aExp = extractFloat64Exp( a );
2706 bSig = extractFloat64Frac( b );
2707 bExp = extractFloat64Exp( b );
2708 expDiff = aExp - bExp;
2711 if ( 0 < expDiff ) goto aExpBigger;
2712 if ( expDiff < 0 ) goto bExpBigger;
2713 if ( aExp == 0x7FF ) {
2714 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2715 float_raise( float_flag_invalid );
2716 return float64_default_nan;
2722 if ( bSig < aSig ) goto aBigger;
2723 if ( aSig < bSig ) goto bBigger;
2724 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2726 if ( bExp == 0x7FF ) {
2727 if ( bSig ) return propagateFloat64NaN( a, b );
2728 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2734 aSig |= LIT64( 0x4000000000000000 );
2736 shift64RightJamming( aSig, - expDiff, &aSig );
2737 bSig |= LIT64( 0x4000000000000000 );
2742 goto normalizeRoundAndPack;
2744 if ( aExp == 0x7FF ) {
2745 if ( aSig ) return propagateFloat64NaN( a, b );
2752 bSig |= LIT64( 0x4000000000000000 );
2754 shift64RightJamming( bSig, expDiff, &bSig );
2755 aSig |= LIT64( 0x4000000000000000 );
2759 normalizeRoundAndPack:
2761 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2766 -------------------------------------------------------------------------------
2767 Returns the result of adding the double-precision floating-point values `a'
2768 and `b'. The operation is performed according to the IEC/IEEE Standard for
2769 Binary Floating-Point Arithmetic.
2770 -------------------------------------------------------------------------------
2772 float64 float64_add( float64 a, float64 b )
2776 aSign = extractFloat64Sign( a );
2777 bSign = extractFloat64Sign( b );
2778 if ( aSign == bSign ) {
2779 return addFloat64Sigs( a, b, aSign );
2782 return subFloat64Sigs( a, b, aSign );
2788 -------------------------------------------------------------------------------
2789 Returns the result of subtracting the double-precision floating-point values
2790 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2791 for Binary Floating-Point Arithmetic.
2792 -------------------------------------------------------------------------------
2794 float64 float64_sub( float64 a, float64 b )
2798 aSign = extractFloat64Sign( a );
2799 bSign = extractFloat64Sign( b );
2800 if ( aSign == bSign ) {
2801 return subFloat64Sigs( a, b, aSign );
2804 return addFloat64Sigs( a, b, aSign );
2810 -------------------------------------------------------------------------------
2811 Returns the result of multiplying the double-precision floating-point values
2812 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2813 for Binary Floating-Point Arithmetic.
2814 -------------------------------------------------------------------------------
2816 float64 float64_mul( float64 a, float64 b )
2818 flag aSign, bSign, zSign;
2819 int16 aExp, bExp, zExp;
2820 bits64 aSig, bSig, zSig0, zSig1;
2822 aSig = extractFloat64Frac( a );
2823 aExp = extractFloat64Exp( a );
2824 aSign = extractFloat64Sign( a );
2825 bSig = extractFloat64Frac( b );
2826 bExp = extractFloat64Exp( b );
2827 bSign = extractFloat64Sign( b );
2828 zSign = aSign ^ bSign;
2829 if ( aExp == 0x7FF ) {
2830 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2831 return propagateFloat64NaN( a, b );
2833 if ( ( bExp | bSig ) == 0 ) {
2834 float_raise( float_flag_invalid );
2835 return float64_default_nan;
2837 return packFloat64( zSign, 0x7FF, 0 );
2839 if ( bExp == 0x7FF ) {
2840 if ( bSig ) return propagateFloat64NaN( a, b );
2841 if ( ( aExp | aSig ) == 0 ) {
2842 float_raise( float_flag_invalid );
2843 return float64_default_nan;
2845 return packFloat64( zSign, 0x7FF, 0 );
2848 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2849 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2852 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2853 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2855 zExp = aExp + bExp - 0x3FF;
2856 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2857 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2858 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2859 zSig0 |= ( zSig1 != 0 );
2860 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2864 return roundAndPackFloat64( zSign, zExp, zSig0 );
2869 -------------------------------------------------------------------------------
2870 Returns the result of dividing the double-precision floating-point value `a'
2871 by the corresponding value `b'. The operation is performed according to
2872 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2873 -------------------------------------------------------------------------------
2875 float64 float64_div( float64 a, float64 b )
2877 flag aSign, bSign, zSign;
2878 int16 aExp, bExp, zExp;
2879 bits64 aSig, bSig, zSig;
2881 bits64 term0, term1;
2883 aSig = extractFloat64Frac( a );
2884 aExp = extractFloat64Exp( a );
2885 aSign = extractFloat64Sign( a );
2886 bSig = extractFloat64Frac( b );
2887 bExp = extractFloat64Exp( b );
2888 bSign = extractFloat64Sign( b );
2889 zSign = aSign ^ bSign;
2890 if ( aExp == 0x7FF ) {
2891 if ( aSig ) return propagateFloat64NaN( a, b );
2892 if ( bExp == 0x7FF ) {
2893 if ( bSig ) return propagateFloat64NaN( a, b );
2894 float_raise( float_flag_invalid );
2895 return float64_default_nan;
2897 return packFloat64( zSign, 0x7FF, 0 );
2899 if ( bExp == 0x7FF ) {
2900 if ( bSig ) return propagateFloat64NaN( a, b );
2901 return packFloat64( zSign, 0, 0 );
2905 if ( ( aExp | aSig ) == 0 ) {
2906 float_raise( float_flag_invalid );
2907 return float64_default_nan;
2909 float_raise( float_flag_divbyzero );
2910 return packFloat64( zSign, 0x7FF, 0 );
2912 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2915 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2916 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2918 zExp = aExp - bExp + 0x3FD;
2919 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2920 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2921 if ( bSig <= ( aSig + aSig ) ) {
2925 zSig = estimateDiv128To64( aSig, 0, bSig );
2926 if ( ( zSig & 0x1FF ) <= 2 ) {
2927 mul64To128( bSig, zSig, &term0, &term1 );
2928 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2929 while ( (sbits64) rem0 < 0 ) {
2931 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2933 zSig |= ( rem1 != 0 );
2935 return roundAndPackFloat64( zSign, zExp, zSig );
2939 #ifndef SOFTFLOAT_FOR_GCC
2941 -------------------------------------------------------------------------------
2942 Returns the remainder of the double-precision floating-point value `a'
2943 with respect to the corresponding value `b'. The operation is performed
2944 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2945 -------------------------------------------------------------------------------
2947 float64 float64_rem( float64 a, float64 b )
2949 flag aSign, bSign, zSign;
2950 int16 aExp, bExp, expDiff;
2952 bits64 q, alternateASig;
2955 aSig = extractFloat64Frac( a );
2956 aExp = extractFloat64Exp( a );
2957 aSign = extractFloat64Sign( a );
2958 bSig = extractFloat64Frac( b );
2959 bExp = extractFloat64Exp( b );
2960 bSign = extractFloat64Sign( b );
2961 if ( aExp == 0x7FF ) {
2962 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2963 return propagateFloat64NaN( a, b );
2965 float_raise( float_flag_invalid );
2966 return float64_default_nan;
2968 if ( bExp == 0x7FF ) {
2969 if ( bSig ) return propagateFloat64NaN( a, b );
2974 float_raise( float_flag_invalid );
2975 return float64_default_nan;
2977 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2980 if ( aSig == 0 ) return a;
2981 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2983 expDiff = aExp - bExp;
2984 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2985 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2986 if ( expDiff < 0 ) {
2987 if ( expDiff < -1 ) return a;
2990 q = ( bSig <= aSig );
2991 if ( q ) aSig -= bSig;
2993 while ( 0 < expDiff ) {
2994 q = estimateDiv128To64( aSig, 0, bSig );
2995 q = ( 2 < q ) ? q - 2 : 0;
2996 aSig = - ( ( bSig>>2 ) * q );
3000 if ( 0 < expDiff ) {
3001 q = estimateDiv128To64( aSig, 0, bSig );
3002 q = ( 2 < q ) ? q - 2 : 0;
3005 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3012 alternateASig = aSig;
3015 } while ( 0 <= (sbits64) aSig );
3016 sigMean = aSig + alternateASig;
3017 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3018 aSig = alternateASig;
3020 zSign = ( (sbits64) aSig < 0 );
3021 if ( zSign ) aSig = - aSig;
3022 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3027 -------------------------------------------------------------------------------
3028 Returns the square root of the double-precision floating-point value `a'.
3029 The operation is performed according to the IEC/IEEE Standard for Binary
3030 Floating-Point Arithmetic.
3031 -------------------------------------------------------------------------------
3033 float64 float64_sqrt( float64 a )
3037 bits64 aSig, zSig, doubleZSig;
3038 bits64 rem0, rem1, term0, term1;
3040 aSig = extractFloat64Frac( a );
3041 aExp = extractFloat64Exp( a );
3042 aSign = extractFloat64Sign( a );
3043 if ( aExp == 0x7FF ) {
3044 if ( aSig ) return propagateFloat64NaN( a, a );
3045 if ( ! aSign ) return a;
3046 float_raise( float_flag_invalid );
3047 return float64_default_nan;
3050 if ( ( aExp | aSig ) == 0 ) return a;
3051 float_raise( float_flag_invalid );
3052 return float64_default_nan;
3055 if ( aSig == 0 ) return 0;
3056 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3058 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3059 aSig |= LIT64( 0x0010000000000000 );
3060 zSig = estimateSqrt32( aExp, aSig>>21 );
3061 aSig <<= 9 - ( aExp & 1 );
3062 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3063 if ( ( zSig & 0x1FF ) <= 5 ) {
3064 doubleZSig = zSig<<1;
3065 mul64To128( zSig, zSig, &term0, &term1 );
3066 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3067 while ( (sbits64) rem0 < 0 ) {
3070 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3072 zSig |= ( ( rem0 | rem1 ) != 0 );
3074 return roundAndPackFloat64( 0, zExp, zSig );
3080 -------------------------------------------------------------------------------
3081 Returns 1 if the double-precision floating-point value `a' is equal to the
3082 corresponding value `b', and 0 otherwise. The comparison is performed
3083 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3084 -------------------------------------------------------------------------------
3086 flag float64_eq( float64 a, float64 b )
3089 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3090 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3092 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3093 float_raise( float_flag_invalid );
3097 return ( a == b ) ||
3098 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3103 -------------------------------------------------------------------------------
3104 Returns 1 if the double-precision floating-point value `a' is less than or
3105 equal to the corresponding value `b', and 0 otherwise. The comparison is
3106 performed according to the IEC/IEEE Standard for Binary Floating-Point
3108 -------------------------------------------------------------------------------
3110 flag float64_le( float64 a, float64 b )
3114 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3115 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3117 float_raise( float_flag_invalid );
3120 aSign = extractFloat64Sign( a );
3121 bSign = extractFloat64Sign( b );
3122 if ( aSign != bSign )
3124 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3126 return ( a == b ) ||
3127 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3132 -------------------------------------------------------------------------------
3133 Returns 1 if the double-precision floating-point value `a' is less than
3134 the corresponding value `b', and 0 otherwise. The comparison is performed
3135 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3136 -------------------------------------------------------------------------------
3138 flag float64_lt( float64 a, float64 b )
3142 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3143 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3145 float_raise( float_flag_invalid );
3148 aSign = extractFloat64Sign( a );
3149 bSign = extractFloat64Sign( b );
3150 if ( aSign != bSign )
3152 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3154 return ( a != b ) &&
3155 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3159 #ifndef SOFTFLOAT_FOR_GCC
3161 -------------------------------------------------------------------------------
3162 Returns 1 if the double-precision floating-point value `a' is equal to the
3163 corresponding value `b', and 0 otherwise. The invalid exception is raised
3164 if either operand is a NaN. Otherwise, the comparison is performed
3165 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3166 -------------------------------------------------------------------------------
3168 flag float64_eq_signaling( float64 a, float64 b )
3171 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3172 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3174 float_raise( float_flag_invalid );
3177 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3182 -------------------------------------------------------------------------------
3183 Returns 1 if the double-precision floating-point value `a' is less than or
3184 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3185 cause an exception. Otherwise, the comparison is performed according to the
3186 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3187 -------------------------------------------------------------------------------
3189 flag float64_le_quiet( float64 a, float64 b )
3193 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3194 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3196 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3197 float_raise( float_flag_invalid );
3201 aSign = extractFloat64Sign( a );
3202 bSign = extractFloat64Sign( b );
3203 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3204 return ( a == b ) || ( aSign ^ ( a < b ) );
3209 -------------------------------------------------------------------------------
3210 Returns 1 if the double-precision floating-point value `a' is less than
3211 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3212 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3213 Standard for Binary Floating-Point Arithmetic.
3214 -------------------------------------------------------------------------------
3216 flag float64_lt_quiet( float64 a, float64 b )
3220 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3221 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3223 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3224 float_raise( float_flag_invalid );
3228 aSign = extractFloat64Sign( a );
3229 bSign = extractFloat64Sign( b );
3230 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3231 return ( a != b ) && ( aSign ^ ( a < b ) );
3239 -------------------------------------------------------------------------------
3240 Returns the result of converting the extended double-precision floating-
3241 point value `a' to the 32-bit two's complement integer format. The
3242 conversion is performed according to the IEC/IEEE Standard for Binary
3243 Floating-Point Arithmetic---which means in particular that the conversion
3244 is rounded according to the current rounding mode. If `a' is a NaN, the
3245 largest positive integer is returned. Otherwise, if the conversion
3246 overflows, the largest integer with the same sign as `a' is returned.
3247 -------------------------------------------------------------------------------
3249 int32 floatx80_to_int32( floatx80 a )
3252 int32 aExp, shiftCount;
3255 aSig = extractFloatx80Frac( a );
3256 aExp = extractFloatx80Exp( a );
3257 aSign = extractFloatx80Sign( a );
3258 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3259 shiftCount = 0x4037 - aExp;
3260 if ( shiftCount <= 0 ) shiftCount = 1;
3261 shift64RightJamming( aSig, shiftCount, &aSig );
3262 return roundAndPackInt32( aSign, aSig );
3267 -------------------------------------------------------------------------------
3268 Returns the result of converting the extended double-precision floating-
3269 point value `a' to the 32-bit two's complement integer format. The
3270 conversion is performed according to the IEC/IEEE Standard for Binary
3271 Floating-Point Arithmetic, except that the conversion is always rounded
3272 toward zero. If `a' is a NaN, the largest positive integer is returned.
3273 Otherwise, if the conversion overflows, the largest integer with the same
3274 sign as `a' is returned.
3275 -------------------------------------------------------------------------------
3277 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3280 int32 aExp, shiftCount;
3281 bits64 aSig, savedASig;
3284 aSig = extractFloatx80Frac( a );
3285 aExp = extractFloatx80Exp( a );
3286 aSign = extractFloatx80Sign( a );
3287 if ( 0x401E < aExp ) {
3288 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3291 else if ( aExp < 0x3FFF ) {
3292 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3295 shiftCount = 0x403E - aExp;
3297 aSig >>= shiftCount;
3299 if ( aSign ) z = - z;
3300 if ( ( z < 0 ) ^ aSign ) {
3302 float_raise( float_flag_invalid );
3303 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3305 if ( ( aSig<<shiftCount ) != savedASig ) {
3306 float_exception_flags |= float_flag_inexact;
3313 -------------------------------------------------------------------------------
3314 Returns the result of converting the extended double-precision floating-
3315 point value `a' to the 64-bit two's complement integer format. The
3316 conversion is performed according to the IEC/IEEE Standard for Binary
3317 Floating-Point Arithmetic---which means in particular that the conversion
3318 is rounded according to the current rounding mode. If `a' is a NaN,
3319 the largest positive integer is returned. Otherwise, if the conversion
3320 overflows, the largest integer with the same sign as `a' is returned.
3321 -------------------------------------------------------------------------------
3323 int64 floatx80_to_int64( floatx80 a )
3326 int32 aExp, shiftCount;
3327 bits64 aSig, aSigExtra;
3329 aSig = extractFloatx80Frac( a );
3330 aExp = extractFloatx80Exp( a );
3331 aSign = extractFloatx80Sign( a );
3332 shiftCount = 0x403E - aExp;
3333 if ( shiftCount <= 0 ) {
3335 float_raise( float_flag_invalid );
3337 || ( ( aExp == 0x7FFF )
3338 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3340 return LIT64( 0x7FFFFFFFFFFFFFFF );
3342 return (sbits64) LIT64( 0x8000000000000000 );
3347 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3349 return roundAndPackInt64( aSign, aSig, aSigExtra );
3354 -------------------------------------------------------------------------------
3355 Returns the result of converting the extended double-precision floating-
3356 point value `a' to the 64-bit two's complement integer format. The
3357 conversion is performed according to the IEC/IEEE Standard for Binary
3358 Floating-Point Arithmetic, except that the conversion is always rounded
3359 toward zero. If `a' is a NaN, the largest positive integer is returned.
3360 Otherwise, if the conversion overflows, the largest integer with the same
3361 sign as `a' is returned.
3362 -------------------------------------------------------------------------------
3364 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3367 int32 aExp, shiftCount;
3371 aSig = extractFloatx80Frac( a );
3372 aExp = extractFloatx80Exp( a );
3373 aSign = extractFloatx80Sign( a );
3374 shiftCount = aExp - 0x403E;
3375 if ( 0 <= shiftCount ) {
3376 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3377 if ( ( a.high != 0xC03E ) || aSig ) {
3378 float_raise( float_flag_invalid );
3379 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3380 return LIT64( 0x7FFFFFFFFFFFFFFF );
3383 return (sbits64) LIT64( 0x8000000000000000 );
3385 else if ( aExp < 0x3FFF ) {
3386 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3389 z = aSig>>( - shiftCount );
3390 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3391 float_exception_flags |= float_flag_inexact;
3393 if ( aSign ) z = - z;
3399 -------------------------------------------------------------------------------
3400 Returns the result of converting the extended double-precision floating-
3401 point value `a' to the single-precision floating-point format. The
3402 conversion is performed according to the IEC/IEEE Standard for Binary
3403 Floating-Point Arithmetic.
3404 -------------------------------------------------------------------------------
3406 float32 floatx80_to_float32( floatx80 a )
3412 aSig = extractFloatx80Frac( a );
3413 aExp = extractFloatx80Exp( a );
3414 aSign = extractFloatx80Sign( a );
3415 if ( aExp == 0x7FFF ) {
3416 if ( (bits64) ( aSig<<1 ) ) {
3417 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3419 return packFloat32( aSign, 0xFF, 0 );
3421 shift64RightJamming( aSig, 33, &aSig );
3422 if ( aExp || aSig ) aExp -= 0x3F81;
3423 return roundAndPackFloat32( aSign, aExp, aSig );
3428 -------------------------------------------------------------------------------
3429 Returns the result of converting the extended double-precision floating-
3430 point value `a' to the double-precision floating-point format. The
3431 conversion is performed according to the IEC/IEEE Standard for Binary
3432 Floating-Point Arithmetic.
3433 -------------------------------------------------------------------------------
3435 float64 floatx80_to_float64( floatx80 a )
3441 aSig = extractFloatx80Frac( a );
3442 aExp = extractFloatx80Exp( a );
3443 aSign = extractFloatx80Sign( a );
3444 if ( aExp == 0x7FFF ) {
3445 if ( (bits64) ( aSig<<1 ) ) {
3446 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3448 return packFloat64( aSign, 0x7FF, 0 );
3450 shift64RightJamming( aSig, 1, &zSig );
3451 if ( aExp || aSig ) aExp -= 0x3C01;
3452 return roundAndPackFloat64( aSign, aExp, zSig );
3459 -------------------------------------------------------------------------------
3460 Returns the result of converting the extended double-precision floating-
3461 point value `a' to the quadruple-precision floating-point format. The
3462 conversion is performed according to the IEC/IEEE Standard for Binary
3463 Floating-Point Arithmetic.
3464 -------------------------------------------------------------------------------
3466 float128 floatx80_to_float128( floatx80 a )
3470 bits64 aSig, zSig0, zSig1;
3472 aSig = extractFloatx80Frac( a );
3473 aExp = extractFloatx80Exp( a );
3474 aSign = extractFloatx80Sign( a );
3475 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3476 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3478 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3479 return packFloat128( aSign, aExp, zSig0, zSig1 );
3486 -------------------------------------------------------------------------------
3487 Rounds the extended double-precision floating-point value `a' to an integer,
3488 and returns the result as an extended quadruple-precision floating-point
3489 value. The operation is performed according to the IEC/IEEE Standard for
3490 Binary Floating-Point Arithmetic.
3491 -------------------------------------------------------------------------------
3493 floatx80 floatx80_round_to_int( floatx80 a )
3497 bits64 lastBitMask, roundBitsMask;
3501 aExp = extractFloatx80Exp( a );
3502 if ( 0x403E <= aExp ) {
3503 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3504 return propagateFloatx80NaN( a, a );
3508 if ( aExp < 0x3FFF ) {
3510 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3513 float_exception_flags |= float_flag_inexact;
3514 aSign = extractFloatx80Sign( a );
3515 switch ( float_rounding_mode ) {
3516 case float_round_nearest_even:
3517 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3520 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3523 case float_round_to_zero:
3525 case float_round_down:
3528 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3529 : packFloatx80( 0, 0, 0 );
3530 case float_round_up:
3532 aSign ? packFloatx80( 1, 0, 0 )
3533 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3535 return packFloatx80( aSign, 0, 0 );
3538 lastBitMask <<= 0x403E - aExp;
3539 roundBitsMask = lastBitMask - 1;
3541 roundingMode = float_rounding_mode;
3542 if ( roundingMode == float_round_nearest_even ) {
3543 z.low += lastBitMask>>1;
3544 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3546 else if ( roundingMode != float_round_to_zero ) {
3547 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3548 z.low += roundBitsMask;
3551 z.low &= ~ roundBitsMask;
3554 z.low = LIT64( 0x8000000000000000 );
3556 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3562 -------------------------------------------------------------------------------
3563 Returns the result of adding the absolute values of the extended double-
3564 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3565 negated before being returned. `zSign' is ignored if the result is a NaN.
3566 The addition is performed according to the IEC/IEEE Standard for Binary
3567 Floating-Point Arithmetic.
3568 -------------------------------------------------------------------------------
3570 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3572 int32 aExp, bExp, zExp;
3573 bits64 aSig, bSig, zSig0, zSig1;
3576 aSig = extractFloatx80Frac( a );
3577 aExp = extractFloatx80Exp( a );
3578 bSig = extractFloatx80Frac( b );
3579 bExp = extractFloatx80Exp( b );
3580 expDiff = aExp - bExp;
3581 if ( 0 < expDiff ) {
3582 if ( aExp == 0x7FFF ) {
3583 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3586 if ( bExp == 0 ) --expDiff;
3587 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3590 else if ( expDiff < 0 ) {
3591 if ( bExp == 0x7FFF ) {
3592 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3593 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3595 if ( aExp == 0 ) ++expDiff;
3596 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3600 if ( aExp == 0x7FFF ) {
3601 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3602 return propagateFloatx80NaN( a, b );
3607 zSig0 = aSig + bSig;
3609 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3615 zSig0 = aSig + bSig;
3616 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3618 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3619 zSig0 |= LIT64( 0x8000000000000000 );
3623 roundAndPackFloatx80(
3624 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3629 -------------------------------------------------------------------------------
3630 Returns the result of subtracting the absolute values of the extended
3631 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3632 difference is negated before being returned. `zSign' is ignored if the
3633 result is a NaN. The subtraction is performed according to the IEC/IEEE
3634 Standard for Binary Floating-Point Arithmetic.
3635 -------------------------------------------------------------------------------
3637 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3639 int32 aExp, bExp, zExp;
3640 bits64 aSig, bSig, zSig0, zSig1;
3644 aSig = extractFloatx80Frac( a );
3645 aExp = extractFloatx80Exp( a );
3646 bSig = extractFloatx80Frac( b );
3647 bExp = extractFloatx80Exp( b );
3648 expDiff = aExp - bExp;
3649 if ( 0 < expDiff ) goto aExpBigger;
3650 if ( expDiff < 0 ) goto bExpBigger;
3651 if ( aExp == 0x7FFF ) {
3652 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3653 return propagateFloatx80NaN( a, b );
3655 float_raise( float_flag_invalid );
3656 z.low = floatx80_default_nan_low;
3657 z.high = floatx80_default_nan_high;
3665 if ( bSig < aSig ) goto aBigger;
3666 if ( aSig < bSig ) goto bBigger;
3667 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3669 if ( bExp == 0x7FFF ) {
3670 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3671 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3673 if ( aExp == 0 ) ++expDiff;
3674 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3676 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3679 goto normalizeRoundAndPack;
3681 if ( aExp == 0x7FFF ) {
3682 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3685 if ( bExp == 0 ) --expDiff;
3686 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3688 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3690 normalizeRoundAndPack:
3692 normalizeRoundAndPackFloatx80(
3693 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3698 -------------------------------------------------------------------------------
3699 Returns the result of adding the extended double-precision floating-point
3700 values `a' and `b'. The operation is performed according to the IEC/IEEE
3701 Standard for Binary Floating-Point Arithmetic.
3702 -------------------------------------------------------------------------------
3704 floatx80 floatx80_add( floatx80 a, floatx80 b )
3708 aSign = extractFloatx80Sign( a );
3709 bSign = extractFloatx80Sign( b );
3710 if ( aSign == bSign ) {
3711 return addFloatx80Sigs( a, b, aSign );
3714 return subFloatx80Sigs( a, b, aSign );
3720 -------------------------------------------------------------------------------
3721 Returns the result of subtracting the extended double-precision floating-
3722 point values `a' and `b'. The operation is performed according to the
3723 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3724 -------------------------------------------------------------------------------
3726 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3730 aSign = extractFloatx80Sign( a );
3731 bSign = extractFloatx80Sign( b );
3732 if ( aSign == bSign ) {
3733 return subFloatx80Sigs( a, b, aSign );
3736 return addFloatx80Sigs( a, b, aSign );
3742 -------------------------------------------------------------------------------
3743 Returns the result of multiplying the extended double-precision floating-
3744 point values `a' and `b'. The operation is performed according to the
3745 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3746 -------------------------------------------------------------------------------
3748 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3750 flag aSign, bSign, zSign;
3751 int32 aExp, bExp, zExp;
3752 bits64 aSig, bSig, zSig0, zSig1;
3755 aSig = extractFloatx80Frac( a );
3756 aExp = extractFloatx80Exp( a );
3757 aSign = extractFloatx80Sign( a );
3758 bSig = extractFloatx80Frac( b );
3759 bExp = extractFloatx80Exp( b );
3760 bSign = extractFloatx80Sign( b );
3761 zSign = aSign ^ bSign;
3762 if ( aExp == 0x7FFF ) {
3763 if ( (bits64) ( aSig<<1 )
3764 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3765 return propagateFloatx80NaN( a, b );
3767 if ( ( bExp | bSig ) == 0 ) goto invalid;
3768 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3770 if ( bExp == 0x7FFF ) {
3771 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3772 if ( ( aExp | aSig ) == 0 ) {
3774 float_raise( float_flag_invalid );
3775 z.low = floatx80_default_nan_low;
3776 z.high = floatx80_default_nan_high;
3779 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3782 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3783 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3786 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3787 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3789 zExp = aExp + bExp - 0x3FFE;
3790 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3791 if ( 0 < (sbits64) zSig0 ) {
3792 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3796 roundAndPackFloatx80(
3797 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3802 -------------------------------------------------------------------------------
3803 Returns the result of dividing the extended double-precision floating-point
3804 value `a' by the corresponding value `b'. The operation is performed
3805 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3806 -------------------------------------------------------------------------------
3808 floatx80 floatx80_div( floatx80 a, floatx80 b )
3810 flag aSign, bSign, zSign;
3811 int32 aExp, bExp, zExp;
3812 bits64 aSig, bSig, zSig0, zSig1;
3813 bits64 rem0, rem1, rem2, term0, term1, term2;
3816 aSig = extractFloatx80Frac( a );
3817 aExp = extractFloatx80Exp( a );
3818 aSign = extractFloatx80Sign( a );
3819 bSig = extractFloatx80Frac( b );
3820 bExp = extractFloatx80Exp( b );
3821 bSign = extractFloatx80Sign( b );
3822 zSign = aSign ^ bSign;
3823 if ( aExp == 0x7FFF ) {
3824 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3825 if ( bExp == 0x7FFF ) {
3826 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3831 if ( bExp == 0x7FFF ) {
3832 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3833 return packFloatx80( zSign, 0, 0 );
3837 if ( ( aExp | aSig ) == 0 ) {
3839 float_raise( float_flag_invalid );
3840 z.low = floatx80_default_nan_low;
3841 z.high = floatx80_default_nan_high;
3844 float_raise( float_flag_divbyzero );
3845 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3847 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3850 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3851 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3853 zExp = aExp - bExp + 0x3FFE;
3855 if ( bSig <= aSig ) {
3856 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3859 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3860 mul64To128( bSig, zSig0, &term0, &term1 );
3861 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3862 while ( (sbits64) rem0 < 0 ) {
3864 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3866 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3867 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3868 mul64To128( bSig, zSig1, &term1, &term2 );
3869 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3870 while ( (sbits64) rem1 < 0 ) {
3872 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3874 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3877 roundAndPackFloatx80(
3878 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3883 -------------------------------------------------------------------------------
3884 Returns the remainder of the extended double-precision floating-point value
3885 `a' with respect to the corresponding value `b'. The operation is performed
3886 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3887 -------------------------------------------------------------------------------
3889 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3891 flag aSign, bSign, zSign;
3892 int32 aExp, bExp, expDiff;
3893 bits64 aSig0, aSig1, bSig;
3894 bits64 q, term0, term1, alternateASig0, alternateASig1;
3897 aSig0 = extractFloatx80Frac( a );
3898 aExp = extractFloatx80Exp( a );
3899 aSign = extractFloatx80Sign( a );
3900 bSig = extractFloatx80Frac( b );
3901 bExp = extractFloatx80Exp( b );
3902 bSign = extractFloatx80Sign( b );
3903 if ( aExp == 0x7FFF ) {
3904 if ( (bits64) ( aSig0<<1 )
3905 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3906 return propagateFloatx80NaN( a, b );
3910 if ( bExp == 0x7FFF ) {
3911 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3917 float_raise( float_flag_invalid );
3918 z.low = floatx80_default_nan_low;
3919 z.high = floatx80_default_nan_high;
3922 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3925 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3926 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3928 bSig |= LIT64( 0x8000000000000000 );
3930 expDiff = aExp - bExp;
3932 if ( expDiff < 0 ) {
3933 if ( expDiff < -1 ) return a;
3934 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3937 q = ( bSig <= aSig0 );
3938 if ( q ) aSig0 -= bSig;
3940 while ( 0 < expDiff ) {
3941 q = estimateDiv128To64( aSig0, aSig1, bSig );
3942 q = ( 2 < q ) ? q - 2 : 0;
3943 mul64To128( bSig, q, &term0, &term1 );
3944 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3945 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3949 if ( 0 < expDiff ) {
3950 q = estimateDiv128To64( aSig0, aSig1, bSig );
3951 q = ( 2 < q ) ? q - 2 : 0;
3953 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3954 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3955 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3956 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3958 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3965 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3966 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3967 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3970 aSig0 = alternateASig0;
3971 aSig1 = alternateASig1;
3975 normalizeRoundAndPackFloatx80(
3976 80, zSign, bExp + expDiff, aSig0, aSig1 );
3981 -------------------------------------------------------------------------------
3982 Returns the square root of the extended double-precision floating-point
3983 value `a'. The operation is performed according to the IEC/IEEE Standard
3984 for Binary Floating-Point Arithmetic.
3985 -------------------------------------------------------------------------------
3987 floatx80 floatx80_sqrt( floatx80 a )
3991 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3992 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3995 aSig0 = extractFloatx80Frac( a );
3996 aExp = extractFloatx80Exp( a );
3997 aSign = extractFloatx80Sign( a );
3998 if ( aExp == 0x7FFF ) {
3999 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4000 if ( ! aSign ) return a;
4004 if ( ( aExp | aSig0 ) == 0 ) return a;
4006 float_raise( float_flag_invalid );
4007 z.low = floatx80_default_nan_low;
4008 z.high = floatx80_default_nan_high;
4012 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4013 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4015 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4016 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4017 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4018 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4019 doubleZSig0 = zSig0<<1;
4020 mul64To128( zSig0, zSig0, &term0, &term1 );
4021 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4022 while ( (sbits64) rem0 < 0 ) {
4025 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4027 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4028 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4029 if ( zSig1 == 0 ) zSig1 = 1;
4030 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4031 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4032 mul64To128( zSig1, zSig1, &term2, &term3 );
4033 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4034 while ( (sbits64) rem1 < 0 ) {
4036 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4038 term2 |= doubleZSig0;
4039 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4041 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4043 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4044 zSig0 |= doubleZSig0;
4046 roundAndPackFloatx80(
4047 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4052 -------------------------------------------------------------------------------
4053 Returns 1 if the extended double-precision floating-point value `a' is
4054 equal to the corresponding value `b', and 0 otherwise. The comparison is
4055 performed according to the IEC/IEEE Standard for Binary Floating-Point
4057 -------------------------------------------------------------------------------
4059 flag floatx80_eq( floatx80 a, floatx80 b )
4062 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4063 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4064 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4065 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4067 if ( floatx80_is_signaling_nan( a )
4068 || floatx80_is_signaling_nan( b ) ) {
4069 float_raise( float_flag_invalid );
4075 && ( ( a.high == b.high )
4077 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4083 -------------------------------------------------------------------------------
4084 Returns 1 if the extended double-precision floating-point value `a' is
4085 less than or equal to the corresponding value `b', and 0 otherwise. The
4086 comparison is performed according to the IEC/IEEE Standard for Binary
4087 Floating-Point Arithmetic.
4088 -------------------------------------------------------------------------------
4090 flag floatx80_le( floatx80 a, floatx80 b )
4094 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4095 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4096 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4097 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4099 float_raise( float_flag_invalid );
4102 aSign = extractFloatx80Sign( a );
4103 bSign = extractFloatx80Sign( b );
4104 if ( aSign != bSign ) {
4107 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4111 aSign ? le128( b.high, b.low, a.high, a.low )
4112 : le128( a.high, a.low, b.high, b.low );
4117 -------------------------------------------------------------------------------
4118 Returns 1 if the extended double-precision floating-point value `a' is
4119 less than the corresponding value `b', and 0 otherwise. The comparison
4120 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4122 -------------------------------------------------------------------------------
4124 flag floatx80_lt( floatx80 a, floatx80 b )
4128 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4129 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4130 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4131 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4133 float_raise( float_flag_invalid );
4136 aSign = extractFloatx80Sign( a );
4137 bSign = extractFloatx80Sign( b );
4138 if ( aSign != bSign ) {
4141 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4145 aSign ? lt128( b.high, b.low, a.high, a.low )
4146 : lt128( a.high, a.low, b.high, b.low );
4151 -------------------------------------------------------------------------------
4152 Returns 1 if the extended double-precision floating-point value `a' is equal
4153 to the corresponding value `b', and 0 otherwise. The invalid exception is
4154 raised if either operand is a NaN. Otherwise, the comparison is performed
4155 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4156 -------------------------------------------------------------------------------
4158 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4161 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4162 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4163 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4164 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4166 float_raise( float_flag_invalid );
4171 && ( ( a.high == b.high )
4173 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4179 -------------------------------------------------------------------------------
4180 Returns 1 if the extended double-precision floating-point value `a' is less
4181 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4182 do not cause an exception. Otherwise, the comparison is performed according
4183 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184 -------------------------------------------------------------------------------
4186 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4190 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4191 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4192 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4193 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4195 if ( floatx80_is_signaling_nan( a )
4196 || floatx80_is_signaling_nan( b ) ) {
4197 float_raise( float_flag_invalid );
4201 aSign = extractFloatx80Sign( a );
4202 bSign = extractFloatx80Sign( b );
4203 if ( aSign != bSign ) {
4206 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4210 aSign ? le128( b.high, b.low, a.high, a.low )
4211 : le128( a.high, a.low, b.high, b.low );
4216 -------------------------------------------------------------------------------
4217 Returns 1 if the extended double-precision floating-point value `a' is less
4218 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4219 an exception. Otherwise, the comparison is performed according to the
4220 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4221 -------------------------------------------------------------------------------
4223 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4227 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4228 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4229 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4230 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4232 if ( floatx80_is_signaling_nan( a )
4233 || floatx80_is_signaling_nan( b ) ) {
4234 float_raise( float_flag_invalid );
4238 aSign = extractFloatx80Sign( a );
4239 bSign = extractFloatx80Sign( b );
4240 if ( aSign != bSign ) {
4243 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4247 aSign ? lt128( b.high, b.low, a.high, a.low )
4248 : lt128( a.high, a.low, b.high, b.low );
4257 -------------------------------------------------------------------------------
4258 Returns the result of converting the quadruple-precision floating-point
4259 value `a' to the 32-bit two's complement integer format. The conversion
4260 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4261 Arithmetic---which means in particular that the conversion is rounded
4262 according to the current rounding mode. If `a' is a NaN, the largest
4263 positive integer is returned. Otherwise, if the conversion overflows, the
4264 largest integer with the same sign as `a' is returned.
4265 -------------------------------------------------------------------------------
4267 int32 float128_to_int32( float128 a )
4270 int32 aExp, shiftCount;
4271 bits64 aSig0, aSig1;
4273 aSig1 = extractFloat128Frac1( a );
4274 aSig0 = extractFloat128Frac0( a );
4275 aExp = extractFloat128Exp( a );
4276 aSign = extractFloat128Sign( a );
4277 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4278 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4279 aSig0 |= ( aSig1 != 0 );
4280 shiftCount = 0x4028 - aExp;
4281 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4282 return roundAndPackInt32( aSign, aSig0 );
4287 -------------------------------------------------------------------------------
4288 Returns the result of converting the quadruple-precision floating-point
4289 value `a' to the 32-bit two's complement integer format. The conversion
4290 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4291 Arithmetic, except that the conversion is always rounded toward zero. If
4292 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4293 conversion overflows, the largest integer with the same sign as `a' is
4295 -------------------------------------------------------------------------------
4297 int32 float128_to_int32_round_to_zero( float128 a )
4300 int32 aExp, shiftCount;
4301 bits64 aSig0, aSig1, savedASig;
4304 aSig1 = extractFloat128Frac1( a );
4305 aSig0 = extractFloat128Frac0( a );
4306 aExp = extractFloat128Exp( a );
4307 aSign = extractFloat128Sign( a );
4308 aSig0 |= ( aSig1 != 0 );
4309 if ( 0x401E < aExp ) {
4310 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4313 else if ( aExp < 0x3FFF ) {
4314 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4317 aSig0 |= LIT64( 0x0001000000000000 );
4318 shiftCount = 0x402F - aExp;
4320 aSig0 >>= shiftCount;
4322 if ( aSign ) z = - z;
4323 if ( ( z < 0 ) ^ aSign ) {
4325 float_raise( float_flag_invalid );
4326 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4328 if ( ( aSig0<<shiftCount ) != savedASig ) {
4329 float_exception_flags |= float_flag_inexact;
4336 -------------------------------------------------------------------------------
4337 Returns the result of converting the quadruple-precision floating-point
4338 value `a' to the 64-bit two's complement integer format. The conversion
4339 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4340 Arithmetic---which means in particular that the conversion is rounded
4341 according to the current rounding mode. If `a' is a NaN, the largest
4342 positive integer is returned. Otherwise, if the conversion overflows, the
4343 largest integer with the same sign as `a' is returned.
4344 -------------------------------------------------------------------------------
4346 int64 float128_to_int64( float128 a )
4349 int32 aExp, shiftCount;
4350 bits64 aSig0, aSig1;
4352 aSig1 = extractFloat128Frac1( a );
4353 aSig0 = extractFloat128Frac0( a );
4354 aExp = extractFloat128Exp( a );
4355 aSign = extractFloat128Sign( a );
4356 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4357 shiftCount = 0x402F - aExp;
4358 if ( shiftCount <= 0 ) {
4359 if ( 0x403E < aExp ) {
4360 float_raise( float_flag_invalid );
4362 || ( ( aExp == 0x7FFF )
4363 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4366 return LIT64( 0x7FFFFFFFFFFFFFFF );
4368 return (sbits64) LIT64( 0x8000000000000000 );
4370 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4373 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4375 return roundAndPackInt64( aSign, aSig0, aSig1 );
4380 -------------------------------------------------------------------------------
4381 Returns the result of converting the quadruple-precision floating-point
4382 value `a' to the 64-bit two's complement integer format. The conversion
4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 Arithmetic, except that the conversion is always rounded toward zero.
4385 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4386 the conversion overflows, the largest integer with the same sign as `a' is
4388 -------------------------------------------------------------------------------
4390 int64 float128_to_int64_round_to_zero( float128 a )
4393 int32 aExp, shiftCount;
4394 bits64 aSig0, aSig1;
4397 aSig1 = extractFloat128Frac1( a );
4398 aSig0 = extractFloat128Frac0( a );
4399 aExp = extractFloat128Exp( a );
4400 aSign = extractFloat128Sign( a );
4401 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4402 shiftCount = aExp - 0x402F;
4403 if ( 0 < shiftCount ) {
4404 if ( 0x403E <= aExp ) {
4405 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4406 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4407 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4408 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4411 float_raise( float_flag_invalid );
4412 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4413 return LIT64( 0x7FFFFFFFFFFFFFFF );
4416 return (sbits64) LIT64( 0x8000000000000000 );
4418 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4419 if ( (bits64) ( aSig1<<shiftCount ) ) {
4420 float_exception_flags |= float_flag_inexact;
4424 if ( aExp < 0x3FFF ) {
4425 if ( aExp | aSig0 | aSig1 ) {
4426 float_exception_flags |= float_flag_inexact;
4430 z = aSig0>>( - shiftCount );
4432 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4433 float_exception_flags |= float_flag_inexact;
4436 if ( aSign ) z = - z;
4442 -------------------------------------------------------------------------------
4443 Returns the result of converting the quadruple-precision floating-point
4444 value `a' to the single-precision floating-point format. The conversion
4445 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4447 -------------------------------------------------------------------------------
4449 float32 float128_to_float32( float128 a )
4453 bits64 aSig0, aSig1;
4456 aSig1 = extractFloat128Frac1( a );
4457 aSig0 = extractFloat128Frac0( a );
4458 aExp = extractFloat128Exp( a );
4459 aSign = extractFloat128Sign( a );
4460 if ( aExp == 0x7FFF ) {
4461 if ( aSig0 | aSig1 ) {
4462 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4464 return packFloat32( aSign, 0xFF, 0 );
4466 aSig0 |= ( aSig1 != 0 );
4467 shift64RightJamming( aSig0, 18, &aSig0 );
4469 if ( aExp || zSig ) {
4473 return roundAndPackFloat32( aSign, aExp, zSig );
4478 -------------------------------------------------------------------------------
4479 Returns the result of converting the quadruple-precision floating-point
4480 value `a' to the double-precision floating-point format. The conversion
4481 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4483 -------------------------------------------------------------------------------
4485 float64 float128_to_float64( float128 a )
4489 bits64 aSig0, aSig1;
4491 aSig1 = extractFloat128Frac1( a );
4492 aSig0 = extractFloat128Frac0( a );
4493 aExp = extractFloat128Exp( a );
4494 aSign = extractFloat128Sign( a );
4495 if ( aExp == 0x7FFF ) {
4496 if ( aSig0 | aSig1 ) {
4497 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4499 return packFloat64( aSign, 0x7FF, 0 );
4501 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4502 aSig0 |= ( aSig1 != 0 );
4503 if ( aExp || aSig0 ) {
4504 aSig0 |= LIT64( 0x4000000000000000 );
4507 return roundAndPackFloat64( aSign, aExp, aSig0 );
4514 -------------------------------------------------------------------------------
4515 Returns the result of converting the quadruple-precision floating-point
4516 value `a' to the extended double-precision floating-point format. The
4517 conversion is performed according to the IEC/IEEE Standard for Binary
4518 Floating-Point Arithmetic.
4519 -------------------------------------------------------------------------------
4521 floatx80 float128_to_floatx80( float128 a )
4525 bits64 aSig0, aSig1;
4527 aSig1 = extractFloat128Frac1( a );
4528 aSig0 = extractFloat128Frac0( a );
4529 aExp = extractFloat128Exp( a );
4530 aSign = extractFloat128Sign( a );
4531 if ( aExp == 0x7FFF ) {
4532 if ( aSig0 | aSig1 ) {
4533 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4535 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4538 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4539 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4542 aSig0 |= LIT64( 0x0001000000000000 );
4544 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4545 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4552 -------------------------------------------------------------------------------
4553 Rounds the quadruple-precision floating-point value `a' to an integer, and
4554 returns the result as a quadruple-precision floating-point value. The
4555 operation is performed according to the IEC/IEEE Standard for Binary
4556 Floating-Point Arithmetic.
4557 -------------------------------------------------------------------------------
4559 float128 float128_round_to_int( float128 a )
4563 bits64 lastBitMask, roundBitsMask;
4567 aExp = extractFloat128Exp( a );
4568 if ( 0x402F <= aExp ) {
4569 if ( 0x406F <= aExp ) {
4570 if ( ( aExp == 0x7FFF )
4571 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4573 return propagateFloat128NaN( a, a );
4578 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4579 roundBitsMask = lastBitMask - 1;
4581 roundingMode = float_rounding_mode;
4582 if ( roundingMode == float_round_nearest_even ) {
4583 if ( lastBitMask ) {
4584 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4585 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4588 if ( (sbits64) z.low < 0 ) {
4590 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4594 else if ( roundingMode != float_round_to_zero ) {
4595 if ( extractFloat128Sign( z )
4596 ^ ( roundingMode == float_round_up ) ) {
4597 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4600 z.low &= ~ roundBitsMask;
4603 if ( aExp < 0x3FFF ) {
4604 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4605 float_exception_flags |= float_flag_inexact;
4606 aSign = extractFloat128Sign( a );
4607 switch ( float_rounding_mode ) {
4608 case float_round_nearest_even:
4609 if ( ( aExp == 0x3FFE )
4610 && ( extractFloat128Frac0( a )
4611 | extractFloat128Frac1( a ) )
4613 return packFloat128( aSign, 0x3FFF, 0, 0 );
4616 case float_round_to_zero:
4618 case float_round_down:
4620 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4621 : packFloat128( 0, 0, 0, 0 );
4622 case float_round_up:
4624 aSign ? packFloat128( 1, 0, 0, 0 )
4625 : packFloat128( 0, 0x3FFF, 0, 0 );
4627 return packFloat128( aSign, 0, 0, 0 );
4630 lastBitMask <<= 0x402F - aExp;
4631 roundBitsMask = lastBitMask - 1;
4634 roundingMode = float_rounding_mode;
4635 if ( roundingMode == float_round_nearest_even ) {
4636 z.high += lastBitMask>>1;
4637 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4638 z.high &= ~ lastBitMask;
4641 else if ( roundingMode != float_round_to_zero ) {
4642 if ( extractFloat128Sign( z )
4643 ^ ( roundingMode == float_round_up ) ) {
4644 z.high |= ( a.low != 0 );
4645 z.high += roundBitsMask;
4648 z.high &= ~ roundBitsMask;
4650 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4651 float_exception_flags |= float_flag_inexact;
4658 -------------------------------------------------------------------------------
4659 Returns the result of adding the absolute values of the quadruple-precision
4660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4661 before being returned. `zSign' is ignored if the result is a NaN.
4662 The addition is performed according to the IEC/IEEE Standard for Binary
4663 Floating-Point Arithmetic.
4664 -------------------------------------------------------------------------------
4666 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4668 int32 aExp, bExp, zExp;
4669 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4672 aSig1 = extractFloat128Frac1( a );
4673 aSig0 = extractFloat128Frac0( a );
4674 aExp = extractFloat128Exp( a );
4675 bSig1 = extractFloat128Frac1( b );
4676 bSig0 = extractFloat128Frac0( b );
4677 bExp = extractFloat128Exp( b );
4678 expDiff = aExp - bExp;
4679 if ( 0 < expDiff ) {
4680 if ( aExp == 0x7FFF ) {
4681 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4688 bSig0 |= LIT64( 0x0001000000000000 );
4690 shift128ExtraRightJamming(
4691 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4694 else if ( expDiff < 0 ) {
4695 if ( bExp == 0x7FFF ) {
4696 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4697 return packFloat128( zSign, 0x7FFF, 0, 0 );
4703 aSig0 |= LIT64( 0x0001000000000000 );
4705 shift128ExtraRightJamming(
4706 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4710 if ( aExp == 0x7FFF ) {
4711 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4712 return propagateFloat128NaN( a, b );
4716 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4717 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4719 zSig0 |= LIT64( 0x0002000000000000 );
4723 aSig0 |= LIT64( 0x0001000000000000 );
4724 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4726 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4729 shift128ExtraRightJamming(
4730 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4732 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4737 -------------------------------------------------------------------------------
4738 Returns the result of subtracting the absolute values of the quadruple-
4739 precision floating-point values `a' and `b'. If `zSign' is 1, the
4740 difference is negated before being returned. `zSign' is ignored if the
4741 result is a NaN. The subtraction is performed according to the IEC/IEEE
4742 Standard for Binary Floating-Point Arithmetic.
4743 -------------------------------------------------------------------------------
4745 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4747 int32 aExp, bExp, zExp;
4748 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4752 aSig1 = extractFloat128Frac1( a );
4753 aSig0 = extractFloat128Frac0( a );
4754 aExp = extractFloat128Exp( a );
4755 bSig1 = extractFloat128Frac1( b );
4756 bSig0 = extractFloat128Frac0( b );
4757 bExp = extractFloat128Exp( b );
4758 expDiff = aExp - bExp;
4759 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4760 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4761 if ( 0 < expDiff ) goto aExpBigger;
4762 if ( expDiff < 0 ) goto bExpBigger;
4763 if ( aExp == 0x7FFF ) {
4764 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4765 return propagateFloat128NaN( a, b );
4767 float_raise( float_flag_invalid );
4768 z.low = float128_default_nan_low;
4769 z.high = float128_default_nan_high;
4776 if ( bSig0 < aSig0 ) goto aBigger;
4777 if ( aSig0 < bSig0 ) goto bBigger;
4778 if ( bSig1 < aSig1 ) goto aBigger;
4779 if ( aSig1 < bSig1 ) goto bBigger;
4780 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4782 if ( bExp == 0x7FFF ) {
4783 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4784 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4790 aSig0 |= LIT64( 0x4000000000000000 );
4792 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4793 bSig0 |= LIT64( 0x4000000000000000 );
4795 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4798 goto normalizeRoundAndPack;
4800 if ( aExp == 0x7FFF ) {
4801 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4808 bSig0 |= LIT64( 0x4000000000000000 );
4810 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4811 aSig0 |= LIT64( 0x4000000000000000 );
4813 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4815 normalizeRoundAndPack:
4817 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4822 -------------------------------------------------------------------------------
4823 Returns the result of adding the quadruple-precision floating-point values
4824 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4825 for Binary Floating-Point Arithmetic.
4826 -------------------------------------------------------------------------------
4828 float128 float128_add( float128 a, float128 b )
4832 aSign = extractFloat128Sign( a );
4833 bSign = extractFloat128Sign( b );
4834 if ( aSign == bSign ) {
4835 return addFloat128Sigs( a, b, aSign );
4838 return subFloat128Sigs( a, b, aSign );
4844 -------------------------------------------------------------------------------
4845 Returns the result of subtracting the quadruple-precision floating-point
4846 values `a' and `b'. The operation is performed according to the IEC/IEEE
4847 Standard for Binary Floating-Point Arithmetic.
4848 -------------------------------------------------------------------------------
4850 float128 float128_sub( float128 a, float128 b )
4854 aSign = extractFloat128Sign( a );
4855 bSign = extractFloat128Sign( b );
4856 if ( aSign == bSign ) {
4857 return subFloat128Sigs( a, b, aSign );
4860 return addFloat128Sigs( a, b, aSign );
4866 -------------------------------------------------------------------------------
4867 Returns the result of multiplying the quadruple-precision floating-point
4868 values `a' and `b'. The operation is performed according to the IEC/IEEE
4869 Standard for Binary Floating-Point Arithmetic.
4870 -------------------------------------------------------------------------------
4872 float128 float128_mul( float128 a, float128 b )
4874 flag aSign, bSign, zSign;
4875 int32 aExp, bExp, zExp;
4876 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4879 aSig1 = extractFloat128Frac1( a );
4880 aSig0 = extractFloat128Frac0( a );
4881 aExp = extractFloat128Exp( a );
4882 aSign = extractFloat128Sign( a );
4883 bSig1 = extractFloat128Frac1( b );
4884 bSig0 = extractFloat128Frac0( b );
4885 bExp = extractFloat128Exp( b );
4886 bSign = extractFloat128Sign( b );
4887 zSign = aSign ^ bSign;
4888 if ( aExp == 0x7FFF ) {
4889 if ( ( aSig0 | aSig1 )
4890 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4891 return propagateFloat128NaN( a, b );
4893 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4894 return packFloat128( zSign, 0x7FFF, 0, 0 );
4896 if ( bExp == 0x7FFF ) {
4897 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4898 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4900 float_raise( float_flag_invalid );
4901 z.low = float128_default_nan_low;
4902 z.high = float128_default_nan_high;
4905 return packFloat128( zSign, 0x7FFF, 0, 0 );
4908 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4909 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4912 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4913 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4915 zExp = aExp + bExp - 0x4000;
4916 aSig0 |= LIT64( 0x0001000000000000 );
4917 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4918 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4919 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4920 zSig2 |= ( zSig3 != 0 );
4921 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4922 shift128ExtraRightJamming(
4923 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4926 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4931 -------------------------------------------------------------------------------
4932 Returns the result of dividing the quadruple-precision floating-point value
4933 `a' by the corresponding value `b'. The operation is performed according to
4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935 -------------------------------------------------------------------------------
4937 float128 float128_div( float128 a, float128 b )
4939 flag aSign, bSign, zSign;
4940 int32 aExp, bExp, zExp;
4941 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4942 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4945 aSig1 = extractFloat128Frac1( a );
4946 aSig0 = extractFloat128Frac0( a );
4947 aExp = extractFloat128Exp( a );
4948 aSign = extractFloat128Sign( a );
4949 bSig1 = extractFloat128Frac1( b );
4950 bSig0 = extractFloat128Frac0( b );
4951 bExp = extractFloat128Exp( b );
4952 bSign = extractFloat128Sign( b );
4953 zSign = aSign ^ bSign;
4954 if ( aExp == 0x7FFF ) {
4955 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4956 if ( bExp == 0x7FFF ) {
4957 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4960 return packFloat128( zSign, 0x7FFF, 0, 0 );
4962 if ( bExp == 0x7FFF ) {
4963 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4964 return packFloat128( zSign, 0, 0, 0 );
4967 if ( ( bSig0 | bSig1 ) == 0 ) {
4968 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4970 float_raise( float_flag_invalid );
4971 z.low = float128_default_nan_low;
4972 z.high = float128_default_nan_high;
4975 float_raise( float_flag_divbyzero );
4976 return packFloat128( zSign, 0x7FFF, 0, 0 );
4978 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4981 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4982 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4984 zExp = aExp - bExp + 0x3FFD;
4986 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4988 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4989 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4990 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4993 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4994 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4995 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4996 while ( (sbits64) rem0 < 0 ) {
4998 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5000 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5001 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5002 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5003 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5004 while ( (sbits64) rem1 < 0 ) {
5006 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5008 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5010 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5011 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5016 -------------------------------------------------------------------------------
5017 Returns the remainder of the quadruple-precision floating-point value `a'
5018 with respect to the corresponding value `b'. The operation is performed
5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 -------------------------------------------------------------------------------
5022 float128 float128_rem( float128 a, float128 b )
5024 flag aSign, bSign, zSign;
5025 int32 aExp, bExp, expDiff;
5026 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5027 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5031 aSig1 = extractFloat128Frac1( a );
5032 aSig0 = extractFloat128Frac0( a );
5033 aExp = extractFloat128Exp( a );
5034 aSign = extractFloat128Sign( a );
5035 bSig1 = extractFloat128Frac1( b );
5036 bSig0 = extractFloat128Frac0( b );
5037 bExp = extractFloat128Exp( b );
5038 bSign = extractFloat128Sign( b );
5039 if ( aExp == 0x7FFF ) {
5040 if ( ( aSig0 | aSig1 )
5041 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5042 return propagateFloat128NaN( a, b );
5046 if ( bExp == 0x7FFF ) {
5047 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5051 if ( ( bSig0 | bSig1 ) == 0 ) {
5053 float_raise( float_flag_invalid );
5054 z.low = float128_default_nan_low;
5055 z.high = float128_default_nan_high;
5058 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5061 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5062 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5064 expDiff = aExp - bExp;
5065 if ( expDiff < -1 ) return a;
5067 aSig0 | LIT64( 0x0001000000000000 ),
5069 15 - ( expDiff < 0 ),
5074 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5075 q = le128( bSig0, bSig1, aSig0, aSig1 );
5076 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5078 while ( 0 < expDiff ) {
5079 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5080 q = ( 4 < q ) ? q - 4 : 0;
5081 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5082 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5083 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5084 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5087 if ( -64 < expDiff ) {
5088 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089 q = ( 4 < q ) ? q - 4 : 0;
5091 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5093 if ( expDiff < 0 ) {
5094 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5097 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5099 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5100 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5103 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5104 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5107 alternateASig0 = aSig0;
5108 alternateASig1 = aSig1;
5110 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5111 } while ( 0 <= (sbits64) aSig0 );
5113 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5114 if ( ( sigMean0 < 0 )
5115 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5116 aSig0 = alternateASig0;
5117 aSig1 = alternateASig1;
5119 zSign = ( (sbits64) aSig0 < 0 );
5120 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5122 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5127 -------------------------------------------------------------------------------
5128 Returns the square root of the quadruple-precision floating-point value `a'.
5129 The operation is performed according to the IEC/IEEE Standard for Binary
5130 Floating-Point Arithmetic.
5131 -------------------------------------------------------------------------------
5133 float128 float128_sqrt( float128 a )
5137 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5138 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5141 aSig1 = extractFloat128Frac1( a );
5142 aSig0 = extractFloat128Frac0( a );
5143 aExp = extractFloat128Exp( a );
5144 aSign = extractFloat128Sign( a );
5145 if ( aExp == 0x7FFF ) {
5146 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5147 if ( ! aSign ) return a;
5151 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5153 float_raise( float_flag_invalid );
5154 z.low = float128_default_nan_low;
5155 z.high = float128_default_nan_high;
5159 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5160 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5162 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5163 aSig0 |= LIT64( 0x0001000000000000 );
5164 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5165 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5166 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5167 doubleZSig0 = zSig0<<1;
5168 mul64To128( zSig0, zSig0, &term0, &term1 );
5169 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5170 while ( (sbits64) rem0 < 0 ) {
5173 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5175 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5176 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5177 if ( zSig1 == 0 ) zSig1 = 1;
5178 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5179 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5180 mul64To128( zSig1, zSig1, &term2, &term3 );
5181 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5182 while ( (sbits64) rem1 < 0 ) {
5184 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5186 term2 |= doubleZSig0;
5187 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5189 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5191 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5192 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5197 -------------------------------------------------------------------------------
5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5199 the corresponding value `b', and 0 otherwise. The comparison is performed
5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5201 -------------------------------------------------------------------------------
5203 flag float128_eq( float128 a, float128 b )
5206 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5207 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5208 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5209 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5211 if ( float128_is_signaling_nan( a )
5212 || float128_is_signaling_nan( b ) ) {
5213 float_raise( float_flag_invalid );
5219 && ( ( a.high == b.high )
5221 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5227 -------------------------------------------------------------------------------
5228 Returns 1 if the quadruple-precision floating-point value `a' is less than
5229 or equal to the corresponding value `b', and 0 otherwise. The comparison
5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5232 -------------------------------------------------------------------------------
5234 flag float128_le( float128 a, float128 b )
5238 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5239 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5240 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5241 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5243 float_raise( float_flag_invalid );
5246 aSign = extractFloat128Sign( a );
5247 bSign = extractFloat128Sign( b );
5248 if ( aSign != bSign ) {
5251 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5255 aSign ? le128( b.high, b.low, a.high, a.low )
5256 : le128( a.high, a.low, b.high, b.low );
5261 -------------------------------------------------------------------------------
5262 Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 the corresponding value `b', and 0 otherwise. The comparison is performed
5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 -------------------------------------------------------------------------------
5267 flag float128_lt( float128 a, float128 b )
5271 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5272 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5273 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5274 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5276 float_raise( float_flag_invalid );
5279 aSign = extractFloat128Sign( a );
5280 bSign = extractFloat128Sign( b );
5281 if ( aSign != bSign ) {
5284 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5288 aSign ? lt128( b.high, b.low, a.high, a.low )
5289 : lt128( a.high, a.low, b.high, b.low );
5294 -------------------------------------------------------------------------------
5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 the corresponding value `b', and 0 otherwise. The invalid exception is
5297 raised if either operand is a NaN. Otherwise, the comparison is performed
5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5299 -------------------------------------------------------------------------------
5301 flag float128_eq_signaling( float128 a, float128 b )
5304 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5305 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5306 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5307 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5309 float_raise( float_flag_invalid );
5314 && ( ( a.high == b.high )
5316 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5322 -------------------------------------------------------------------------------
5323 Returns 1 if the quadruple-precision floating-point value `a' is less than
5324 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5325 cause an exception. Otherwise, the comparison is performed according to the
5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 -------------------------------------------------------------------------------
5329 flag float128_le_quiet( float128 a, float128 b )
5333 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5334 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5336 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338 if ( float128_is_signaling_nan( a )
5339 || float128_is_signaling_nan( b ) ) {
5340 float_raise( float_flag_invalid );
5344 aSign = extractFloat128Sign( a );
5345 bSign = extractFloat128Sign( b );
5346 if ( aSign != bSign ) {
5349 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5353 aSign ? le128( b.high, b.low, a.high, a.low )
5354 : le128( a.high, a.low, b.high, b.low );
5359 -------------------------------------------------------------------------------
5360 Returns 1 if the quadruple-precision floating-point value `a' is less than
5361 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5362 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5363 Standard for Binary Floating-Point Arithmetic.
5364 -------------------------------------------------------------------------------
5366 flag float128_lt_quiet( float128 a, float128 b )
5370 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5371 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5373 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5375 if ( float128_is_signaling_nan( a )
5376 || float128_is_signaling_nan( b ) ) {
5377 float_raise( float_flag_invalid );
5381 aSign = extractFloat128Sign( a );
5382 bSign = extractFloat128Sign( b );
5383 if ( aSign != bSign ) {
5386 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5390 aSign ? lt128( b.high, b.low, a.high, a.low )
5391 : lt128( a.high, a.low, b.high, b.low );
5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5401 * These two routines are not part of the original softfloat distribution.
5403 * They are based on the corresponding conversions to integer but return
5404 * unsigned numbers instead since these functions are required by GCC.
5406 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5408 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5412 -------------------------------------------------------------------------------
5413 Returns the result of converting the double-precision floating-point value
5414 `a' to the 32-bit unsigned integer format. The conversion is
5415 performed according to the IEC/IEEE Standard for Binary Floating-point
5416 Arithmetic, except that the conversion is always rounded toward zero. If
5417 `a' is a NaN, the largest positive integer is returned. If the conversion
5418 overflows, the largest integer positive is returned.
5419 -------------------------------------------------------------------------------
5421 uint32 float64_to_uint32_round_to_zero( float64 a )
5424 int16 aExp, shiftCount;
5425 bits64 aSig, savedASig;
5428 aSig = extractFloat64Frac( a );
5429 aExp = extractFloat64Exp( a );
5430 aSign = extractFloat64Sign( a );
5433 float_raise( float_flag_invalid );
5437 if ( 0x41E < aExp ) {
5438 float_raise( float_flag_invalid );
5441 else if ( aExp < 0x3FF ) {
5442 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5445 aSig |= LIT64( 0x0010000000000000 );
5446 shiftCount = 0x433 - aExp;
5448 aSig >>= shiftCount;
5450 if ( ( aSig<<shiftCount ) != savedASig ) {
5451 float_exception_flags |= float_flag_inexact;
5458 -------------------------------------------------------------------------------
5459 Returns the result of converting the single-precision floating-point value
5460 `a' to the 32-bit unsigned integer format. The conversion is
5461 performed according to the IEC/IEEE Standard for Binary Floating-point
5462 Arithmetic, except that the conversion is always rounded toward zero. If
5463 `a' is a NaN, the largest positive integer is returned. If the conversion
5464 overflows, the largest positive integer is returned.
5465 -------------------------------------------------------------------------------
5467 uint32 float32_to_uint32_round_to_zero( float32 a )
5470 int16 aExp, shiftCount;
5474 aSig = extractFloat32Frac( a );
5475 aExp = extractFloat32Exp( a );
5476 aSign = extractFloat32Sign( a );
5477 shiftCount = aExp - 0x9E;
5480 float_raise( float_flag_invalid );
5483 if ( 0 < shiftCount ) {
5484 float_raise( float_flag_invalid );
5487 else if ( aExp <= 0x7E ) {
5488 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5491 aSig = ( aSig | 0x800000 )<<8;
5492 z = aSig>>( - shiftCount );
5493 if ( aSig<<( shiftCount & 31 ) ) {
5494 float_exception_flags |= float_flag_inexact;