lib/libc/softfloat/bits64/softfloat.c

   1 /* $NetBSD: softfloat.c,v 1.2 2003/07/26 19:24:52 salo Exp $ */
   2
   3 /*
   4  * This version hacked for use with gcc -msoft-float by bjh21.
   5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
   6  *  itself).
   7  */
   8
   9 /*
  10  * Things you may want to define:
  11  *
  12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
  13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
  14  *   properly renamed.
  15  */
  16
  17 /*
  18 ===============================================================================
  19
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 #include <sys/cdefs.h>
  48 __FBSDID("$FreeBSD$");
  49
  50 #ifdef SOFTFLOAT_FOR_GCC
  51 #include "softfloat-for-gcc.h"
  52 #endif
  53
  54 #include "milieu.h"
  55 #include "softfloat.h"
  56
  57 /*
  58  * Conversions between floats as stored in memory and floats as
  59  * SoftFloat uses them
  60  */
  61 #ifndef FLOAT64_DEMANGLE
  62 #define FLOAT64_DEMANGLE(a)     (a)
  63 #endif
  64 #ifndef FLOAT64_MANGLE
  65 #define FLOAT64_MANGLE(a)       (a)
  66 #endif
  67
  68 /*
  69 -------------------------------------------------------------------------------
  70 Floating-point rounding mode, extended double-precision rounding precision,
  71 and exception flags.
  72 -------------------------------------------------------------------------------
  73 */
  74 fp_rnd_t float_rounding_mode = float_round_nearest_even;
  75 fp_except float_exception_flags = 0;
  76 #ifdef FLOATX80
  77 int8 floatx80_rounding_precision = 80;
  78 #endif
  79
  80 /*
  81 -------------------------------------------------------------------------------
  82 Primitive arithmetic functions, including multi-word arithmetic, and
  83 division and square root approximations.  (Can be specialized to target if
  84 desired.)
  85 -------------------------------------------------------------------------------
  86 */
  87 #include "softfloat-macros"
  88
  89 /*
  90 -------------------------------------------------------------------------------
  91 Functions and definitions to determine:  (1) whether tininess for underflow
  92 is detected before or after rounding by default, (2) what (if anything)
  93 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  94 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  95 are propagated from function inputs to output.  These details are target-
  96 specific.
  97 -------------------------------------------------------------------------------
  98 */
  99 #include "softfloat-specialize"
 100
 101 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
 102 /*
 103 -------------------------------------------------------------------------------
 104 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 105 and 7, and returns the properly rounded 32-bit integer corresponding to the
 106 input.  If `zSign' is 1, the input is negated before being converted to an
 107 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 108 is simply rounded to an integer, with the inexact exception raised if the
 109 input cannot be represented exactly as an integer.  However, if the fixed-
 110 point input is too large, the invalid exception is raised and the largest
 111 positive or negative integer is returned.
 112 -------------------------------------------------------------------------------
 113 */
 114 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
 115 {
 116     int8 roundingMode;
 117     flag roundNearestEven;
 118     int8 roundIncrement, roundBits;
 119     int32 z;
 120
 121     roundingMode = float_rounding_mode;
 122     roundNearestEven = ( roundingMode == float_round_nearest_even );
 123     roundIncrement = 0x40;
 124     if ( ! roundNearestEven ) {
 125         if ( roundingMode == float_round_to_zero ) {
 126             roundIncrement = 0;
 127         }
 128         else {
 129             roundIncrement = 0x7F;
 130             if ( zSign ) {
 131                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 132             }
 133             else {
 134                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 135             }
 136         }
 137     }
 138     roundBits = absZ & 0x7F;
 139     absZ = ( absZ + roundIncrement )>>7;
 140     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 141     z = absZ;
 142     if ( zSign ) z = - z;
 143     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 144         float_raise( float_flag_invalid );
 145         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 146     }
 147     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 148     return z;
 149
 150 }
 151
 152 /*
 153 -------------------------------------------------------------------------------
 154 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 155 `absZ1', with binary point between bits 63 and 64 (between the input words),
 156 and returns the properly rounded 64-bit integer corresponding to the input.
 157 If `zSign' is 1, the input is negated before being converted to an integer.
 158 Ordinarily, the fixed-point input is simply rounded to an integer, with
 159 the inexact exception raised if the input cannot be represented exactly as
 160 an integer.  However, if the fixed-point input is too large, the invalid
 161 exception is raised and the largest positive or negative integer is
 162 returned.
 163 -------------------------------------------------------------------------------
 164 */
 165 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 166 {
 167     int8 roundingMode;
 168     flag roundNearestEven, increment;
 169     int64 z;
 170
 171     roundingMode = float_rounding_mode;
 172     roundNearestEven = ( roundingMode == float_round_nearest_even );
 173     increment = ( (sbits64) absZ1 < 0 );
 174     if ( ! roundNearestEven ) {
 175         if ( roundingMode == float_round_to_zero ) {
 176             increment = 0;
 177         }
 178         else {
 179             if ( zSign ) {
 180                 increment = ( roundingMode == float_round_down ) && absZ1;
 181             }
 182             else {
 183                 increment = ( roundingMode == float_round_up ) && absZ1;
 184             }
 185         }
 186     }
 187     if ( increment ) {
 188         ++absZ0;
 189         if ( absZ0 == 0 ) goto overflow;
 190         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 191     }
 192     z = absZ0;
 193     if ( zSign ) z = - z;
 194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 195  overflow:
 196         float_raise( float_flag_invalid );
 197         return
 198               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 199             : LIT64( 0x7FFFFFFFFFFFFFFF );
 200     }
 201     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
 202     return z;
 203
 204 }
 205 #endif
 206
 207 /*
 208 -------------------------------------------------------------------------------
 209 Returns the fraction bits of the single-precision floating-point value `a'.
 210 -------------------------------------------------------------------------------
 211 */
 212 INLINE bits32 extractFloat32Frac( float32 a )
 213 {
 214
 215     return a & 0x007FFFFF;
 216
 217 }
 218
 219 /*
 220 -------------------------------------------------------------------------------
 221 Returns the exponent bits of the single-precision floating-point value `a'.
 222 -------------------------------------------------------------------------------
 223 */
 224 INLINE int16 extractFloat32Exp( float32 a )
 225 {
 226
 227     return ( a>>23 ) & 0xFF;
 228
 229 }
 230
 231 /*
 232 -------------------------------------------------------------------------------
 233 Returns the sign bit of the single-precision floating-point value `a'.
 234 -------------------------------------------------------------------------------
 235 */
 236 INLINE flag extractFloat32Sign( float32 a )
 237 {
 238
 239     return a>>31;
 240
 241 }
 242
 243 /*
 244 -------------------------------------------------------------------------------
 245 Normalizes the subnormal single-precision floating-point value represented
 246 by the denormalized significand `aSig'.  The normalized exponent and
 247 significand are stored at the locations pointed to by `zExpPtr' and
 248 `zSigPtr', respectively.
 249 -------------------------------------------------------------------------------
 250 */
 251 static void
 252  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 253 {
 254     int8 shiftCount;
 255
 256     shiftCount = countLeadingZeros32( aSig ) - 8;
 257     *zSigPtr = aSig<<shiftCount;
 258     *zExpPtr = 1 - shiftCount;
 259
 260 }
 261
 262 /*
 263 -------------------------------------------------------------------------------
 264 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 265 single-precision floating-point value, returning the result.  After being
 266 shifted into the proper positions, the three fields are simply added
 267 together to form the result.  This means that any integer portion of `zSig'
 268 will be added into the exponent.  Since a properly normalized significand
 269 will have an integer portion equal to 1, the `zExp' input should be 1 less
 270 than the desired result exponent whenever `zSig' is a complete, normalized
 271 significand.
 272 -------------------------------------------------------------------------------
 273 */
 274 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 275 {
 276
 277     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 278
 279 }
 280
 281 /*
 282 -------------------------------------------------------------------------------
 283 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 284 and significand `zSig', and returns the proper single-precision floating-
 285 point value corresponding to the abstract input.  Ordinarily, the abstract
 286 value is simply rounded and packed into the single-precision format, with
 287 the inexact exception raised if the abstract input cannot be represented
 288 exactly.  However, if the abstract value is too large, the overflow and
 289 inexact exceptions are raised and an infinity or maximal finite value is
 290 returned.  If the abstract value is too small, the input value is rounded to
 291 a subnormal number, and the underflow and inexact exceptions are raised if
 292 the abstract input cannot be represented exactly as a subnormal single-
 293 precision floating-point number.
 294     The input significand `zSig' has its binary point between bits 30
 295 and 29, which is 7 bits to the left of the usual location.  This shifted
 296 significand must be normalized or smaller.  If `zSig' is not normalized,
 297 `zExp' must be 0; in that case, the result returned is a subnormal number,
 298 and it must not require rounding.  In the usual case that `zSig' is
 299 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 300 The handling of underflow and overflow follows the IEC/IEEE Standard for
 301 Binary Floating-Point Arithmetic.
 302 -------------------------------------------------------------------------------
 303 */
 304 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 305 {
 306     int8 roundingMode;
 307     flag roundNearestEven;
 308     int8 roundIncrement, roundBits;
 309     flag isTiny;
 310
 311     roundingMode = float_rounding_mode;
 312     roundNearestEven = ( roundingMode == float_round_nearest_even );
 313     roundIncrement = 0x40;
 314     if ( ! roundNearestEven ) {
 315         if ( roundingMode == float_round_to_zero ) {
 316             roundIncrement = 0;
 317         }
 318         else {
 319             roundIncrement = 0x7F;
 320             if ( zSign ) {
 321                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 322             }
 323             else {
 324                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 325             }
 326         }
 327     }
 328     roundBits = zSig & 0x7F;
 329     if ( 0xFD <= (bits16) zExp ) {
 330         if (    ( 0xFD < zExp )
 331              || (    ( zExp == 0xFD )
 332                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 333            ) {
 334             float_raise( float_flag_overflow | float_flag_inexact );
 335             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 336         }
 337         if ( zExp < 0 ) {
 338             isTiny =
 339                    ( float_detect_tininess == float_tininess_before_rounding )
 340                 || ( zExp < -1 )
 341                 || ( zSig + roundIncrement < 0x80000000 );
 342             shift32RightJamming( zSig, - zExp, &zSig );
 343             zExp = 0;
 344             roundBits = zSig & 0x7F;
 345             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 346         }
 347     }
 348     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 349     zSig = ( zSig + roundIncrement )>>7;
 350     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 351     if ( zSig == 0 ) zExp = 0;
 352     return packFloat32( zSign, zExp, zSig );
 353
 354 }
 355
 356 /*
 357 -------------------------------------------------------------------------------
 358 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 359 and significand `zSig', and returns the proper single-precision floating-
 360 point value corresponding to the abstract input.  This routine is just like
 361 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 362 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 363 floating-point exponent.
 364 -------------------------------------------------------------------------------
 365 */
 366 static float32
 367  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 368 {
 369     int8 shiftCount;
 370
 371     shiftCount = countLeadingZeros32( zSig ) - 1;
 372     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 373
 374 }
 375
 376 /*
 377 -------------------------------------------------------------------------------
 378 Returns the fraction bits of the double-precision floating-point value `a'.
 379 -------------------------------------------------------------------------------
 380 */
 381 INLINE bits64 extractFloat64Frac( float64 a )
 382 {
 383
 384     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
 385
 386 }
 387
 388 /*
 389 -------------------------------------------------------------------------------
 390 Returns the exponent bits of the double-precision floating-point value `a'.
 391 -------------------------------------------------------------------------------
 392 */
 393 INLINE int16 extractFloat64Exp( float64 a )
 394 {
 395
 396     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
 397
 398 }
 399
 400 /*
 401 -------------------------------------------------------------------------------
 402 Returns the sign bit of the double-precision floating-point value `a'.
 403 -------------------------------------------------------------------------------
 404 */
 405 INLINE flag extractFloat64Sign( float64 a )
 406 {
 407
 408     return FLOAT64_DEMANGLE(a)>>63;
 409
 410 }
 411
 412 /*
 413 -------------------------------------------------------------------------------
 414 Normalizes the subnormal double-precision floating-point value represented
 415 by the denormalized significand `aSig'.  The normalized exponent and
 416 significand are stored at the locations pointed to by `zExpPtr' and
 417 `zSigPtr', respectively.
 418 -------------------------------------------------------------------------------
 419 */
 420 static void
 421  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 422 {
 423     int8 shiftCount;
 424
 425     shiftCount = countLeadingZeros64( aSig ) - 11;
 426     *zSigPtr = aSig<<shiftCount;
 427     *zExpPtr = 1 - shiftCount;
 428
 429 }
 430
 431 /*
 432 -------------------------------------------------------------------------------
 433 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 434 double-precision floating-point value, returning the result.  After being
 435 shifted into the proper positions, the three fields are simply added
 436 together to form the result.  This means that any integer portion of `zSig'
 437 will be added into the exponent.  Since a properly normalized significand
 438 will have an integer portion equal to 1, the `zExp' input should be 1 less
 439 than the desired result exponent whenever `zSig' is a complete, normalized
 440 significand.
 441 -------------------------------------------------------------------------------
 442 */
 443 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 444 {
 445
 446     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
 447                            ( ( (bits64) zExp )<<52 ) + zSig );
 448
 449 }
 450
 451 /*
 452 -------------------------------------------------------------------------------
 453 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 454 and significand `zSig', and returns the proper double-precision floating-
 455 point value corresponding to the abstract input.  Ordinarily, the abstract
 456 value is simply rounded and packed into the double-precision format, with
 457 the inexact exception raised if the abstract input cannot be represented
 458 exactly.  However, if the abstract value is too large, the overflow and
 459 inexact exceptions are raised and an infinity or maximal finite value is
 460 returned.  If the abstract value is too small, the input value is rounded to
 461 a subnormal number, and the underflow and inexact exceptions are raised if
 462 the abstract input cannot be represented exactly as a subnormal double-
 463 precision floating-point number.
 464     The input significand `zSig' has its binary point between bits 62
 465 and 61, which is 10 bits to the left of the usual location.  This shifted
 466 significand must be normalized or smaller.  If `zSig' is not normalized,
 467 `zExp' must be 0; in that case, the result returned is a subnormal number,
 468 and it must not require rounding.  In the usual case that `zSig' is
 469 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 470 The handling of underflow and overflow follows the IEC/IEEE Standard for
 471 Binary Floating-Point Arithmetic.
 472 -------------------------------------------------------------------------------
 473 */
 474 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 475 {
 476     int8 roundingMode;
 477     flag roundNearestEven;
 478     int16 roundIncrement, roundBits;
 479     flag isTiny;
 480
 481     roundingMode = float_rounding_mode;
 482     roundNearestEven = ( roundingMode == float_round_nearest_even );
 483     roundIncrement = 0x200;
 484     if ( ! roundNearestEven ) {
 485         if ( roundingMode == float_round_to_zero ) {
 486             roundIncrement = 0;
 487         }
 488         else {
 489             roundIncrement = 0x3FF;
 490             if ( zSign ) {
 491                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 492             }
 493             else {
 494                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 495             }
 496         }
 497     }
 498     roundBits = zSig & 0x3FF;
 499     if ( 0x7FD <= (bits16) zExp ) {
 500         if (    ( 0x7FD < zExp )
 501              || (    ( zExp == 0x7FD )
 502                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 503            ) {
 504             float_raise( float_flag_overflow | float_flag_inexact );
 505             return FLOAT64_MANGLE(
 506                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
 507                 ( roundIncrement == 0 ));
 508         }
 509         if ( zExp < 0 ) {
 510             isTiny =
 511                    ( float_detect_tininess == float_tininess_before_rounding )
 512                 || ( zExp < -1 )
 513                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 514             shift64RightJamming( zSig, - zExp, &zSig );
 515             zExp = 0;
 516             roundBits = zSig & 0x3FF;
 517             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 518         }
 519     }
 520     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 521     zSig = ( zSig + roundIncrement )>>10;
 522     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 523     if ( zSig == 0 ) zExp = 0;
 524     return packFloat64( zSign, zExp, zSig );
 525
 526 }
 527
 528 /*
 529 -------------------------------------------------------------------------------
 530 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 531 and significand `zSig', and returns the proper double-precision floating-
 532 point value corresponding to the abstract input.  This routine is just like
 533 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 534 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 535 floating-point exponent.
 536 -------------------------------------------------------------------------------
 537 */
 538 static float64
 539  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 540 {
 541     int8 shiftCount;
 542
 543     shiftCount = countLeadingZeros64( zSig ) - 1;
 544     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 545
 546 }
 547
 548 #ifdef FLOATX80
 549
 550 /*
 551 -------------------------------------------------------------------------------
 552 Returns the fraction bits of the extended double-precision floating-point
 553 value `a'.
 554 -------------------------------------------------------------------------------
 555 */
 556 INLINE bits64 extractFloatx80Frac( floatx80 a )
 557 {
 558
 559     return a.low;
 560
 561 }
 562
 563 /*
 564 -------------------------------------------------------------------------------
 565 Returns the exponent bits of the extended double-precision floating-point
 566 value `a'.
 567 -------------------------------------------------------------------------------
 568 */
 569 INLINE int32 extractFloatx80Exp( floatx80 a )
 570 {
 571
 572     return a.high & 0x7FFF;
 573
 574 }
 575
 576 /*
 577 -------------------------------------------------------------------------------
 578 Returns the sign bit of the extended double-precision floating-point value
 579 `a'.
 580 -------------------------------------------------------------------------------
 581 */
 582 INLINE flag extractFloatx80Sign( floatx80 a )
 583 {
 584
 585     return a.high>>15;
 586
 587 }
 588
 589 /*
 590 -------------------------------------------------------------------------------
 591 Normalizes the subnormal extended double-precision floating-point value
 592 represented by the denormalized significand `aSig'.  The normalized exponent
 593 and significand are stored at the locations pointed to by `zExpPtr' and
 594 `zSigPtr', respectively.
 595 -------------------------------------------------------------------------------
 596 */
 597 static void
 598  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 599 {
 600     int8 shiftCount;
 601
 602     shiftCount = countLeadingZeros64( aSig );
 603     *zSigPtr = aSig<<shiftCount;
 604     *zExpPtr = 1 - shiftCount;
 605
 606 }
 607
 608 /*
 609 -------------------------------------------------------------------------------
 610 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 611 extended double-precision floating-point value, returning the result.
 612 -------------------------------------------------------------------------------
 613 */
 614 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 615 {
 616     floatx80 z;
 617
 618     z.low = zSig;
 619     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 620     return z;
 621
 622 }
 623
 624 /*
 625 -------------------------------------------------------------------------------
 626 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 627 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 628 and returns the proper extended double-precision floating-point value
 629 corresponding to the abstract input.  Ordinarily, the abstract value is
 630 rounded and packed into the extended double-precision format, with the
 631 inexact exception raised if the abstract input cannot be represented
 632 exactly.  However, if the abstract value is too large, the overflow and
 633 inexact exceptions are raised and an infinity or maximal finite value is
 634 returned.  If the abstract value is too small, the input value is rounded to
 635 a subnormal number, and the underflow and inexact exceptions are raised if
 636 the abstract input cannot be represented exactly as a subnormal extended
 637 double-precision floating-point number.
 638     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 639 number of bits as single or double precision, respectively.  Otherwise, the
 640 result is rounded to the full precision of the extended double-precision
 641 format.
 642     The input significand must be normalized or smaller.  If the input
 643 significand is not normalized, `zExp' must be 0; in that case, the result
 644 returned is a subnormal number, and it must not require rounding.  The
 645 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 646 Floating-Point Arithmetic.
 647 -------------------------------------------------------------------------------
 648 */
 649 static floatx80
 650  roundAndPackFloatx80(
 651      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 652  )
 653 {
 654     int8 roundingMode;
 655     flag roundNearestEven, increment, isTiny;
 656     int64 roundIncrement, roundMask, roundBits;
 657
 658     roundingMode = float_rounding_mode;
 659     roundNearestEven = ( roundingMode == float_round_nearest_even );
 660     if ( roundingPrecision == 80 ) goto precision80;
 661     if ( roundingPrecision == 64 ) {
 662         roundIncrement = LIT64( 0x0000000000000400 );
 663         roundMask = LIT64( 0x00000000000007FF );
 664     }
 665     else if ( roundingPrecision == 32 ) {
 666         roundIncrement = LIT64( 0x0000008000000000 );
 667         roundMask = LIT64( 0x000000FFFFFFFFFF );
 668     }
 669     else {
 670         goto precision80;
 671     }
 672     zSig0 |= ( zSig1 != 0 );
 673     if ( ! roundNearestEven ) {
 674         if ( roundingMode == float_round_to_zero ) {
 675             roundIncrement = 0;
 676         }
 677         else {
 678             roundIncrement = roundMask;
 679             if ( zSign ) {
 680                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 681             }
 682             else {
 683                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 684             }
 685         }
 686     }
 687     roundBits = zSig0 & roundMask;
 688     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 689         if (    ( 0x7FFE < zExp )
 690              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 691            ) {
 692             goto overflow;
 693         }
 694         if ( zExp <= 0 ) {
 695             isTiny =
 696                    ( float_detect_tininess == float_tininess_before_rounding )
 697                 || ( zExp < 0 )
 698                 || ( zSig0 <= zSig0 + roundIncrement );
 699             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 700             zExp = 0;
 701             roundBits = zSig0 & roundMask;
 702             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 703             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 704             zSig0 += roundIncrement;
 705             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 706             roundIncrement = roundMask + 1;
 707             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 708                 roundMask |= roundIncrement;
 709             }
 710             zSig0 &= ~ roundMask;
 711             return packFloatx80( zSign, zExp, zSig0 );
 712         }
 713     }
 714     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 715     zSig0 += roundIncrement;
 716     if ( zSig0 < roundIncrement ) {
 717         ++zExp;
 718         zSig0 = LIT64( 0x8000000000000000 );
 719     }
 720     roundIncrement = roundMask + 1;
 721     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 722         roundMask |= roundIncrement;
 723     }
 724     zSig0 &= ~ roundMask;
 725     if ( zSig0 == 0 ) zExp = 0;
 726     return packFloatx80( zSign, zExp, zSig0 );
 727  precision80:
 728     increment = ( (sbits64) zSig1 < 0 );
 729     if ( ! roundNearestEven ) {
 730         if ( roundingMode == float_round_to_zero ) {
 731             increment = 0;
 732         }
 733         else {
 734             if ( zSign ) {
 735                 increment = ( roundingMode == float_round_down ) && zSig1;
 736             }
 737             else {
 738                 increment = ( roundingMode == float_round_up ) && zSig1;
 739             }
 740         }
 741     }
 742     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 743         if (    ( 0x7FFE < zExp )
 744              || (    ( zExp == 0x7FFE )
 745                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 746                   && increment
 747                 )
 748            ) {
 749             roundMask = 0;
 750  overflow:
 751             float_raise( float_flag_overflow | float_flag_inexact );
 752             if (    ( roundingMode == float_round_to_zero )
 753                  || ( zSign && ( roundingMode == float_round_up ) )
 754                  || ( ! zSign && ( roundingMode == float_round_down ) )
 755                ) {
 756                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 757             }
 758             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 759         }
 760         if ( zExp <= 0 ) {
 761             isTiny =
 762                    ( float_detect_tininess == float_tininess_before_rounding )
 763                 || ( zExp < 0 )
 764                 || ! increment
 765                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 766             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 767             zExp = 0;
 768             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 769             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 770             if ( roundNearestEven ) {
 771                 increment = ( (sbits64) zSig1 < 0 );
 772             }
 773             else {
 774                 if ( zSign ) {
 775                     increment = ( roundingMode == float_round_down ) && zSig1;
 776                 }
 777                 else {
 778                     increment = ( roundingMode == float_round_up ) && zSig1;
 779                 }
 780             }
 781             if ( increment ) {
 782                 ++zSig0;
 783                 zSig0 &=
 784                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 785                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 786             }
 787             return packFloatx80( zSign, zExp, zSig0 );
 788         }
 789     }
 790     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 791     if ( increment ) {
 792         ++zSig0;
 793         if ( zSig0 == 0 ) {
 794             ++zExp;
 795             zSig0 = LIT64( 0x8000000000000000 );
 796         }
 797         else {
 798             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 799         }
 800     }
 801     else {
 802         if ( zSig0 == 0 ) zExp = 0;
 803     }
 804     return packFloatx80( zSign, zExp, zSig0 );
 805
 806 }
 807
 808 /*
 809 -------------------------------------------------------------------------------
 810 Takes an abstract floating-point value having sign `zSign', exponent
 811 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 812 and returns the proper extended double-precision floating-point value
 813 corresponding to the abstract input.  This routine is just like
 814 `roundAndPackFloatx80' except that the input significand does not have to be
 815 normalized.
 816 -------------------------------------------------------------------------------
 817 */
 818 static floatx80
 819  normalizeRoundAndPackFloatx80(
 820      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 821  )
 822 {
 823     int8 shiftCount;
 824
 825     if ( zSig0 == 0 ) {
 826         zSig0 = zSig1;
 827         zSig1 = 0;
 828         zExp -= 64;
 829     }
 830     shiftCount = countLeadingZeros64( zSig0 );
 831     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 832     zExp -= shiftCount;
 833     return
 834         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 835
 836 }
 837
 838 #endif
 839
 840 #ifdef FLOAT128
 841
 842 /*
 843 -------------------------------------------------------------------------------
 844 Returns the least-significant 64 fraction bits of the quadruple-precision
 845 floating-point value `a'.
 846 -------------------------------------------------------------------------------
 847 */
 848 INLINE bits64 extractFloat128Frac1( float128 a )
 849 {
 850
 851     return a.low;
 852
 853 }
 854
 855 /*
 856 -------------------------------------------------------------------------------
 857 Returns the most-significant 48 fraction bits of the quadruple-precision
 858 floating-point value `a'.
 859 -------------------------------------------------------------------------------
 860 */
 861 INLINE bits64 extractFloat128Frac0( float128 a )
 862 {
 863
 864     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 865
 866 }
 867
 868 /*
 869 -------------------------------------------------------------------------------
 870 Returns the exponent bits of the quadruple-precision floating-point value
 871 `a'.
 872 -------------------------------------------------------------------------------
 873 */
 874 INLINE int32 extractFloat128Exp( float128 a )
 875 {
 876
 877     return ( a.high>>48 ) & 0x7FFF;
 878
 879 }
 880
 881 /*
 882 -------------------------------------------------------------------------------
 883 Returns the sign bit of the quadruple-precision floating-point value `a'.
 884 -------------------------------------------------------------------------------
 885 */
 886 INLINE flag extractFloat128Sign( float128 a )
 887 {
 888
 889     return a.high>>63;
 890
 891 }
 892
 893 /*
 894 -------------------------------------------------------------------------------
 895 Normalizes the subnormal quadruple-precision floating-point value
 896 represented by the denormalized significand formed by the concatenation of
 897 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 898 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 899 significand are stored at the location pointed to by `zSig0Ptr', and the
 900 least significant 64 bits of the normalized significand are stored at the
 901 location pointed to by `zSig1Ptr'.
 902 -------------------------------------------------------------------------------
 903 */
 904 static void
 905  normalizeFloat128Subnormal(
 906      bits64 aSig0,
 907      bits64 aSig1,
 908      int32 *zExpPtr,
 909      bits64 *zSig0Ptr,
 910      bits64 *zSig1Ptr
 911  )
 912 {
 913     int8 shiftCount;
 914
 915     if ( aSig0 == 0 ) {
 916         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 917         if ( shiftCount < 0 ) {
 918             *zSig0Ptr = aSig1>>( - shiftCount );
 919             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 920         }
 921         else {
 922             *zSig0Ptr = aSig1<<shiftCount;
 923             *zSig1Ptr = 0;
 924         }
 925         *zExpPtr = - shiftCount - 63;
 926     }
 927     else {
 928         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 929         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 930         *zExpPtr = 1 - shiftCount;
 931     }
 932
 933 }
 934
 935 /*
 936 -------------------------------------------------------------------------------
 937 Packs the sign `zSign', the exponent `zExp', and the significand formed
 938 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 939 floating-point value, returning the result.  After being shifted into the
 940 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 941 added together to form the most significant 32 bits of the result.  This
 942 means that any integer portion of `zSig0' will be added into the exponent.
 943 Since a properly normalized significand will have an integer portion equal
 944 to 1, the `zExp' input should be 1 less than the desired result exponent
 945 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 946 significand.
 947 -------------------------------------------------------------------------------
 948 */
 949 INLINE float128
 950  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 951 {
 952     float128 z;
 953
 954     z.low = zSig1;
 955     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 956     return z;
 957
 958 }
 959
 960 /*
 961 -------------------------------------------------------------------------------
 962 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 963 and extended significand formed by the concatenation of `zSig0', `zSig1',
 964 and `zSig2', and returns the proper quadruple-precision floating-point value
 965 corresponding to the abstract input.  Ordinarily, the abstract value is
 966 simply rounded and packed into the quadruple-precision format, with the
 967 inexact exception raised if the abstract input cannot be represented
 968 exactly.  However, if the abstract value is too large, the overflow and
 969 inexact exceptions are raised and an infinity or maximal finite value is
 970 returned.  If the abstract value is too small, the input value is rounded to
 971 a subnormal number, and the underflow and inexact exceptions are raised if
 972 the abstract input cannot be represented exactly as a subnormal quadruple-
 973 precision floating-point number.
 974     The input significand must be normalized or smaller.  If the input
 975 significand is not normalized, `zExp' must be 0; in that case, the result
 976 returned is a subnormal number, and it must not require rounding.  In the
 977 usual case that the input significand is normalized, `zExp' must be 1 less
 978 than the ``true'' floating-point exponent.  The handling of underflow and
 979 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 980 -------------------------------------------------------------------------------
 981 */
 982 static float128
 983  roundAndPackFloat128(
 984      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 985 {
 986     int8 roundingMode;
 987     flag roundNearestEven, increment, isTiny;
 988
 989     roundingMode = float_rounding_mode;
 990     roundNearestEven = ( roundingMode == float_round_nearest_even );
 991     increment = ( (sbits64) zSig2 < 0 );
 992     if ( ! roundNearestEven ) {
 993         if ( roundingMode == float_round_to_zero ) {
 994             increment = 0;
 995         }
 996         else {
 997             if ( zSign ) {
 998                 increment = ( roundingMode == float_round_down ) && zSig2;
 999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (bits32) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact );
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             isTiny =
1034                    ( float_detect_tininess == float_tininess_before_rounding )
1035                 || ( zExp < -1 )
1036                 || ! increment
1037                 || lt128(
1038                        zSig0,
1039                        zSig1,
1040                        LIT64( 0x0001FFFFFFFFFFFF ),
1041                        LIT64( 0xFFFFFFFFFFFFFFFF )
1042                    );
1043             shift128ExtraRightJamming(
1044                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1045             zExp = 0;
1046             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1047             if ( roundNearestEven ) {
1048                 increment = ( (sbits64) zSig2 < 0 );
1049             }
1050             else {
1051                 if ( zSign ) {
1052                     increment = ( roundingMode == float_round_down ) && zSig2;
1053                 }
1054                 else {
1055                     increment = ( roundingMode == float_round_up ) && zSig2;
1056                 }
1057             }
1058         }
1059     }
1060     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1061     if ( increment ) {
1062         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1063         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1064     }
1065     else {
1066         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1067     }
1068     return packFloat128( zSign, zExp, zSig0, zSig1 );
1069
1070 }
1071
1072 /*
1073 -------------------------------------------------------------------------------
1074 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 returns the proper quadruple-precision floating-point value corresponding
1077 to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 except that the input significand has fewer bits and does not have to be
1079 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 point exponent.
1081 -------------------------------------------------------------------------------
1082 */
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1086 {
1087     int8 shiftCount;
1088     bits64 zSig2;
1089
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1106
1107 }
1108
1109 #endif
1110
1111 /*
1112 -------------------------------------------------------------------------------
1113 Returns the result of converting the 32-bit two's complement integer `a'
1114 to the single-precision floating-point format.  The conversion is performed
1115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1116 -------------------------------------------------------------------------------
1117 */
1118 float32 int32_to_float32( int32 a )
1119 {
1120     flag zSign;
1121
1122     if ( a == 0 ) return 0;
1123     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1124     zSign = ( a < 0 );
1125     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1126
1127 }
1128
1129 /*
1130 -------------------------------------------------------------------------------
1131 Returns the result of converting the 32-bit two's complement integer `a'
1132 to the double-precision floating-point format.  The conversion is performed
1133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134 -------------------------------------------------------------------------------
1135 */
1136 float64 int32_to_float64( int32 a )
1137 {
1138     flag zSign;
1139     uint32 absA;
1140     int8 shiftCount;
1141     bits64 zSig;
1142
1143     if ( a == 0 ) return 0;
1144     zSign = ( a < 0 );
1145     absA = zSign ? - a : a;
1146     shiftCount = countLeadingZeros32( absA ) + 21;
1147     zSig = absA;
1148     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1149
1150 }
1151
1152 #ifdef FLOATX80
1153
1154 /*
1155 -------------------------------------------------------------------------------
1156 Returns the result of converting the 32-bit two's complement integer `a'
1157 to the extended double-precision floating-point format.  The conversion
1158 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1159 Arithmetic.
1160 -------------------------------------------------------------------------------
1161 */
1162 floatx80 int32_to_floatx80( int32 a )
1163 {
1164     flag zSign;
1165     uint32 absA;
1166     int8 shiftCount;
1167     bits64 zSig;
1168
1169     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1170     zSign = ( a < 0 );
1171     absA = zSign ? - a : a;
1172     shiftCount = countLeadingZeros32( absA ) + 32;
1173     zSig = absA;
1174     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1175
1176 }
1177
1178 #endif
1179
1180 #ifdef FLOAT128
1181
1182 /*
1183 -------------------------------------------------------------------------------
1184 Returns the result of converting the 32-bit two's complement integer `a' to
1185 the quadruple-precision floating-point format.  The conversion is performed
1186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187 -------------------------------------------------------------------------------
1188 */
1189 float128 int32_to_float128( int32 a )
1190 {
1191     flag zSign;
1192     uint32 absA;
1193     int8 shiftCount;
1194     bits64 zSig0;
1195
1196     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1197     zSign = ( a < 0 );
1198     absA = zSign ? - a : a;
1199     shiftCount = countLeadingZeros32( absA ) + 17;
1200     zSig0 = absA;
1201     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202
1203 }
1204
1205 #endif
1206
1207 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1208 /*
1209 -------------------------------------------------------------------------------
1210 Returns the result of converting the 64-bit two's complement integer `a'
1211 to the single-precision floating-point format.  The conversion is performed
1212 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1213 -------------------------------------------------------------------------------
1214 */
1215 float32 int64_to_float32( int64 a )
1216 {
1217     flag zSign;
1218     uint64 absA;
1219     int8 shiftCount;
1220
1221     if ( a == 0 ) return 0;
1222     zSign = ( a < 0 );
1223     absA = zSign ? - a : a;
1224     shiftCount = countLeadingZeros64( absA ) - 40;
1225     if ( 0 <= shiftCount ) {
1226         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1227     }
1228     else {
1229         shiftCount += 7;
1230         if ( shiftCount < 0 ) {
1231             shift64RightJamming( absA, - shiftCount, &absA );
1232         }
1233         else {
1234             absA <<= shiftCount;
1235         }
1236         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1237     }
1238
1239 }
1240
1241 /*
1242 -------------------------------------------------------------------------------
1243 Returns the result of converting the 64-bit two's complement integer `a'
1244 to the double-precision floating-point format.  The conversion is performed
1245 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1246 -------------------------------------------------------------------------------
1247 */
1248 float64 int64_to_float64( int64 a )
1249 {
1250     flag zSign;
1251
1252     if ( a == 0 ) return 0;
1253     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1254         return packFloat64( 1, 0x43E, 0 );
1255     }
1256     zSign = ( a < 0 );
1257     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1258
1259 }
1260
1261 #ifdef FLOATX80
1262
1263 /*
1264 -------------------------------------------------------------------------------
1265 Returns the result of converting the 64-bit two's complement integer `a'
1266 to the extended double-precision floating-point format.  The conversion
1267 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1268 Arithmetic.
1269 -------------------------------------------------------------------------------
1270 */
1271 floatx80 int64_to_floatx80( int64 a )
1272 {
1273     flag zSign;
1274     uint64 absA;
1275     int8 shiftCount;
1276
1277     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1278     zSign = ( a < 0 );
1279     absA = zSign ? - a : a;
1280     shiftCount = countLeadingZeros64( absA );
1281     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1282
1283 }
1284
1285 #endif
1286
1287 #endif /* !SOFTFLOAT_FOR_GCC */
1288
1289 #ifdef FLOAT128
1290
1291 /*
1292 -------------------------------------------------------------------------------
1293 Returns the result of converting the 64-bit two's complement integer `a' to
1294 the quadruple-precision floating-point format.  The conversion is performed
1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 -------------------------------------------------------------------------------
1297 */
1298 float128 int64_to_float128( int64 a )
1299 {
1300     flag zSign;
1301     uint64 absA;
1302     int8 shiftCount;
1303     int32 zExp;
1304     bits64 zSig0, zSig1;
1305
1306     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1307     zSign = ( a < 0 );
1308     absA = zSign ? - a : a;
1309     shiftCount = countLeadingZeros64( absA ) + 49;
1310     zExp = 0x406E - shiftCount;
1311     if ( 64 <= shiftCount ) {
1312         zSig1 = 0;
1313         zSig0 = absA;
1314         shiftCount -= 64;
1315     }
1316     else {
1317         zSig1 = absA;
1318         zSig0 = 0;
1319     }
1320     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1321     return packFloat128( zSign, zExp, zSig0, zSig1 );
1322
1323 }
1324
1325 #endif
1326
1327 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1328 /*
1329 -------------------------------------------------------------------------------
1330 Returns the result of converting the single-precision floating-point value
1331 `a' to the 32-bit two's complement integer format.  The conversion is
1332 performed according to the IEC/IEEE Standard for Binary Floating-Point
1333 Arithmetic---which means in particular that the conversion is rounded
1334 according to the current rounding mode.  If `a' is a NaN, the largest
1335 positive integer is returned.  Otherwise, if the conversion overflows, the
1336 largest integer with the same sign as `a' is returned.
1337 -------------------------------------------------------------------------------
1338 */
1339 int32 float32_to_int32( float32 a )
1340 {
1341     flag aSign;
1342     int16 aExp, shiftCount;
1343     bits32 aSig;
1344     bits64 aSig64;
1345
1346     aSig = extractFloat32Frac( a );
1347     aExp = extractFloat32Exp( a );
1348     aSign = extractFloat32Sign( a );
1349     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1350     if ( aExp ) aSig |= 0x00800000;
1351     shiftCount = 0xAF - aExp;
1352     aSig64 = aSig;
1353     aSig64 <<= 32;
1354     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1355     return roundAndPackInt32( aSign, aSig64 );
1356
1357 }
1358 #endif /* !SOFTFLOAT_FOR_GCC */
1359
1360 /*
1361 -------------------------------------------------------------------------------
1362 Returns the result of converting the single-precision floating-point value
1363 `a' to the 32-bit two's complement integer format.  The conversion is
1364 performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 Arithmetic, except that the conversion is always rounded toward zero.
1366 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1367 the conversion overflows, the largest integer with the same sign as `a' is
1368 returned.
1369 -------------------------------------------------------------------------------
1370 */
1371 int32 float32_to_int32_round_to_zero( float32 a )
1372 {
1373     flag aSign;
1374     int16 aExp, shiftCount;
1375     bits32 aSig;
1376     int32 z;
1377
1378     aSig = extractFloat32Frac( a );
1379     aExp = extractFloat32Exp( a );
1380     aSign = extractFloat32Sign( a );
1381     shiftCount = aExp - 0x9E;
1382     if ( 0 <= shiftCount ) {
1383         if ( a != 0xCF000000 ) {
1384             float_raise( float_flag_invalid );
1385             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1386         }
1387         return (sbits32) 0x80000000;
1388     }
1389     else if ( aExp <= 0x7E ) {
1390         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1391         return 0;
1392     }
1393     aSig = ( aSig | 0x00800000 )<<8;
1394     z = aSig>>( - shiftCount );
1395     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1396         float_exception_flags |= float_flag_inexact;
1397     }
1398     if ( aSign ) z = - z;
1399     return z;
1400
1401 }
1402
1403 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1404 /*
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the single-precision floating-point value
1407 `a' to the 64-bit two's complement integer format.  The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic---which means in particular that the conversion is rounded
1410 according to the current rounding mode.  If `a' is a NaN, the largest
1411 positive integer is returned.  Otherwise, if the conversion overflows, the
1412 largest integer with the same sign as `a' is returned.
1413 -------------------------------------------------------------------------------
1414 */
1415 int64 float32_to_int64( float32 a )
1416 {
1417     flag aSign;
1418     int16 aExp, shiftCount;
1419     bits32 aSig;
1420     bits64 aSig64, aSigExtra;
1421
1422     aSig = extractFloat32Frac( a );
1423     aExp = extractFloat32Exp( a );
1424     aSign = extractFloat32Sign( a );
1425     shiftCount = 0xBE - aExp;
1426     if ( shiftCount < 0 ) {
1427         float_raise( float_flag_invalid );
1428         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429             return LIT64( 0x7FFFFFFFFFFFFFFF );
1430         }
1431         return (sbits64) LIT64( 0x8000000000000000 );
1432     }
1433     if ( aExp ) aSig |= 0x00800000;
1434     aSig64 = aSig;
1435     aSig64 <<= 40;
1436     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1437     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1438
1439 }
1440
1441 /*
1442 -------------------------------------------------------------------------------
1443 Returns the result of converting the single-precision floating-point value
1444 `a' to the 64-bit two's complement integer format.  The conversion is
1445 performed according to the IEC/IEEE Standard for Binary Floating-Point
1446 Arithmetic, except that the conversion is always rounded toward zero.  If
1447 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1448 conversion overflows, the largest integer with the same sign as `a' is
1449 returned.
1450 -------------------------------------------------------------------------------
1451 */
1452 int64 float32_to_int64_round_to_zero( float32 a )
1453 {
1454     flag aSign;
1455     int16 aExp, shiftCount;
1456     bits32 aSig;
1457     bits64 aSig64;
1458     int64 z;
1459
1460     aSig = extractFloat32Frac( a );
1461     aExp = extractFloat32Exp( a );
1462     aSign = extractFloat32Sign( a );
1463     shiftCount = aExp - 0xBE;
1464     if ( 0 <= shiftCount ) {
1465         if ( a != 0xDF000000 ) {
1466             float_raise( float_flag_invalid );
1467             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1468                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1469             }
1470         }
1471         return (sbits64) LIT64( 0x8000000000000000 );
1472     }
1473     else if ( aExp <= 0x7E ) {
1474         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1475         return 0;
1476     }
1477     aSig64 = aSig | 0x00800000;
1478     aSig64 <<= 40;
1479     z = aSig64>>( - shiftCount );
1480     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1481         float_exception_flags |= float_flag_inexact;
1482     }
1483     if ( aSign ) z = - z;
1484     return z;
1485
1486 }
1487 #endif /* !SOFTFLOAT_FOR_GCC */
1488
1489 /*
1490 -------------------------------------------------------------------------------
1491 Returns the result of converting the single-precision floating-point value
1492 `a' to the double-precision floating-point format.  The conversion is
1493 performed according to the IEC/IEEE Standard for Binary Floating-Point
1494 Arithmetic.
1495 -------------------------------------------------------------------------------
1496 */
1497 float64 float32_to_float64( float32 a )
1498 {
1499     flag aSign;
1500     int16 aExp;
1501     bits32 aSig;
1502
1503     aSig = extractFloat32Frac( a );
1504     aExp = extractFloat32Exp( a );
1505     aSign = extractFloat32Sign( a );
1506     if ( aExp == 0xFF ) {
1507         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1508         return packFloat64( aSign, 0x7FF, 0 );
1509     }
1510     if ( aExp == 0 ) {
1511         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1512         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1513         --aExp;
1514     }
1515     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1516
1517 }
1518
1519 #ifdef FLOATX80
1520
1521 /*
1522 -------------------------------------------------------------------------------
1523 Returns the result of converting the single-precision floating-point value
1524 `a' to the extended double-precision floating-point format.  The conversion
1525 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1526 Arithmetic.
1527 -------------------------------------------------------------------------------
1528 */
1529 floatx80 float32_to_floatx80( float32 a )
1530 {
1531     flag aSign;
1532     int16 aExp;
1533     bits32 aSig;
1534
1535     aSig = extractFloat32Frac( a );
1536     aExp = extractFloat32Exp( a );
1537     aSign = extractFloat32Sign( a );
1538     if ( aExp == 0xFF ) {
1539         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1540         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1541     }
1542     if ( aExp == 0 ) {
1543         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1544         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1545     }
1546     aSig |= 0x00800000;
1547     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1548
1549 }
1550
1551 #endif
1552
1553 #ifdef FLOAT128
1554
1555 /*
1556 -------------------------------------------------------------------------------
1557 Returns the result of converting the single-precision floating-point value
1558 `a' to the double-precision floating-point format.  The conversion is
1559 performed according to the IEC/IEEE Standard for Binary Floating-Point
1560 Arithmetic.
1561 -------------------------------------------------------------------------------
1562 */
1563 float128 float32_to_float128( float32 a )
1564 {
1565     flag aSign;
1566     int16 aExp;
1567     bits32 aSig;
1568
1569     aSig = extractFloat32Frac( a );
1570     aExp = extractFloat32Exp( a );
1571     aSign = extractFloat32Sign( a );
1572     if ( aExp == 0xFF ) {
1573         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1574         return packFloat128( aSign, 0x7FFF, 0, 0 );
1575     }
1576     if ( aExp == 0 ) {
1577         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1578         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1579         --aExp;
1580     }
1581     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1582
1583 }
1584
1585 #endif
1586
1587 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1588 /*
1589 -------------------------------------------------------------------------------
1590 Rounds the single-precision floating-point value `a' to an integer, and
1591 returns the result as a single-precision floating-point value.  The
1592 operation is performed according to the IEC/IEEE Standard for Binary
1593 Floating-Point Arithmetic.
1594 -------------------------------------------------------------------------------
1595 */
1596 float32 float32_round_to_int( float32 a )
1597 {
1598     flag aSign;
1599     int16 aExp;
1600     bits32 lastBitMask, roundBitsMask;
1601     int8 roundingMode;
1602     float32 z;
1603
1604     aExp = extractFloat32Exp( a );
1605     if ( 0x96 <= aExp ) {
1606         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1607             return propagateFloat32NaN( a, a );
1608         }
1609         return a;
1610     }
1611     if ( aExp <= 0x7E ) {
1612         if ( (bits32) ( a<<1 ) == 0 ) return a;
1613         float_exception_flags |= float_flag_inexact;
1614         aSign = extractFloat32Sign( a );
1615         switch ( float_rounding_mode ) {
1616          case float_round_nearest_even:
1617             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1618                 return packFloat32( aSign, 0x7F, 0 );
1619             }
1620             break;
1621          case float_round_to_zero:
1622             break;
1623          case float_round_down:
1624             return aSign ? 0xBF800000 : 0;
1625          case float_round_up:
1626             return aSign ? 0x80000000 : 0x3F800000;
1627         }
1628         return packFloat32( aSign, 0, 0 );
1629     }
1630     lastBitMask = 1;
1631     lastBitMask <<= 0x96 - aExp;
1632     roundBitsMask = lastBitMask - 1;
1633     z = a;
1634     roundingMode = float_rounding_mode;
1635     if ( roundingMode == float_round_nearest_even ) {
1636         z += lastBitMask>>1;
1637         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1638     }
1639     else if ( roundingMode != float_round_to_zero ) {
1640         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1641             z += roundBitsMask;
1642         }
1643     }
1644     z &= ~ roundBitsMask;
1645     if ( z != a ) float_exception_flags |= float_flag_inexact;
1646     return z;
1647
1648 }
1649 #endif /* !SOFTFLOAT_FOR_GCC */
1650
1651 /*
1652 -------------------------------------------------------------------------------
1653 Returns the result of adding the absolute values of the single-precision
1654 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1655 before being returned.  `zSign' is ignored if the result is a NaN.
1656 The addition is performed according to the IEC/IEEE Standard for Binary
1657 Floating-Point Arithmetic.
1658 -------------------------------------------------------------------------------
1659 */
1660 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1661 {
1662     int16 aExp, bExp, zExp;
1663     bits32 aSig, bSig, zSig;
1664     int16 expDiff;
1665
1666     aSig = extractFloat32Frac( a );
1667     aExp = extractFloat32Exp( a );
1668     bSig = extractFloat32Frac( b );
1669     bExp = extractFloat32Exp( b );
1670     expDiff = aExp - bExp;
1671     aSig <<= 6;
1672     bSig <<= 6;
1673     if ( 0 < expDiff ) {
1674         if ( aExp == 0xFF ) {
1675             if ( aSig ) return propagateFloat32NaN( a, b );
1676             return a;
1677         }
1678         if ( bExp == 0 ) {
1679             --expDiff;
1680         }
1681         else {
1682             bSig |= 0x20000000;
1683         }
1684         shift32RightJamming( bSig, expDiff, &bSig );
1685         zExp = aExp;
1686     }
1687     else if ( expDiff < 0 ) {
1688         if ( bExp == 0xFF ) {
1689             if ( bSig ) return propagateFloat32NaN( a, b );
1690             return packFloat32( zSign, 0xFF, 0 );
1691         }
1692         if ( aExp == 0 ) {
1693             ++expDiff;
1694         }
1695         else {
1696             aSig |= 0x20000000;
1697         }
1698         shift32RightJamming( aSig, - expDiff, &aSig );
1699         zExp = bExp;
1700     }
1701     else {
1702         if ( aExp == 0xFF ) {
1703             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1704             return a;
1705         }
1706         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1707         zSig = 0x40000000 + aSig + bSig;
1708         zExp = aExp;
1709         goto roundAndPack;
1710     }
1711     aSig |= 0x20000000;
1712     zSig = ( aSig + bSig )<<1;
1713     --zExp;
1714     if ( (sbits32) zSig < 0 ) {
1715         zSig = aSig + bSig;
1716         ++zExp;
1717     }
1718  roundAndPack:
1719     return roundAndPackFloat32( zSign, zExp, zSig );
1720
1721 }
1722
1723 /*
1724 -------------------------------------------------------------------------------
1725 Returns the result of subtracting the absolute values of the single-
1726 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1727 difference is negated before being returned.  `zSign' is ignored if the
1728 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1729 Standard for Binary Floating-Point Arithmetic.
1730 -------------------------------------------------------------------------------
1731 */
1732 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1733 {
1734     int16 aExp, bExp, zExp;
1735     bits32 aSig, bSig, zSig;
1736     int16 expDiff;
1737
1738     aSig = extractFloat32Frac( a );
1739     aExp = extractFloat32Exp( a );
1740     bSig = extractFloat32Frac( b );
1741     bExp = extractFloat32Exp( b );
1742     expDiff = aExp - bExp;
1743     aSig <<= 7;
1744     bSig <<= 7;
1745     if ( 0 < expDiff ) goto aExpBigger;
1746     if ( expDiff < 0 ) goto bExpBigger;
1747     if ( aExp == 0xFF ) {
1748         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1749         float_raise( float_flag_invalid );
1750         return float32_default_nan;
1751     }
1752     if ( aExp == 0 ) {
1753         aExp = 1;
1754         bExp = 1;
1755     }
1756     if ( bSig < aSig ) goto aBigger;
1757     if ( aSig < bSig ) goto bBigger;
1758     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1759  bExpBigger:
1760     if ( bExp == 0xFF ) {
1761         if ( bSig ) return propagateFloat32NaN( a, b );
1762         return packFloat32( zSign ^ 1, 0xFF, 0 );
1763     }
1764     if ( aExp == 0 ) {
1765         ++expDiff;
1766     }
1767     else {
1768         aSig |= 0x40000000;
1769     }
1770     shift32RightJamming( aSig, - expDiff, &aSig );
1771     bSig |= 0x40000000;
1772  bBigger:
1773     zSig = bSig - aSig;
1774     zExp = bExp;
1775     zSign ^= 1;
1776     goto normalizeRoundAndPack;
1777  aExpBigger:
1778     if ( aExp == 0xFF ) {
1779         if ( aSig ) return propagateFloat32NaN( a, b );
1780         return a;
1781     }
1782     if ( bExp == 0 ) {
1783         --expDiff;
1784     }
1785     else {
1786         bSig |= 0x40000000;
1787     }
1788     shift32RightJamming( bSig, expDiff, &bSig );
1789     aSig |= 0x40000000;
1790  aBigger:
1791     zSig = aSig - bSig;
1792     zExp = aExp;
1793  normalizeRoundAndPack:
1794     --zExp;
1795     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1796
1797 }
1798
1799 /*
1800 -------------------------------------------------------------------------------
1801 Returns the result of adding the single-precision floating-point values `a'
1802 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1803 Binary Floating-Point Arithmetic.
1804 -------------------------------------------------------------------------------
1805 */
1806 float32 float32_add( float32 a, float32 b )
1807 {
1808     flag aSign, bSign;
1809
1810     aSign = extractFloat32Sign( a );
1811     bSign = extractFloat32Sign( b );
1812     if ( aSign == bSign ) {
1813         return addFloat32Sigs( a, b, aSign );
1814     }
1815     else {
1816         return subFloat32Sigs( a, b, aSign );
1817     }
1818
1819 }
1820
1821 /*
1822 -------------------------------------------------------------------------------
1823 Returns the result of subtracting the single-precision floating-point values
1824 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1825 for Binary Floating-Point Arithmetic.
1826 -------------------------------------------------------------------------------
1827 */
1828 float32 float32_sub( float32 a, float32 b )
1829 {
1830     flag aSign, bSign;
1831
1832     aSign = extractFloat32Sign( a );
1833     bSign = extractFloat32Sign( b );
1834     if ( aSign == bSign ) {
1835         return subFloat32Sigs( a, b, aSign );
1836     }
1837     else {
1838         return addFloat32Sigs( a, b, aSign );
1839     }
1840
1841 }
1842
1843 /*
1844 -------------------------------------------------------------------------------
1845 Returns the result of multiplying the single-precision floating-point values
1846 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1847 for Binary Floating-Point Arithmetic.
1848 -------------------------------------------------------------------------------
1849 */
1850 float32 float32_mul( float32 a, float32 b )
1851 {
1852     flag aSign, bSign, zSign;
1853     int16 aExp, bExp, zExp;
1854     bits32 aSig, bSig;
1855     bits64 zSig64;
1856     bits32 zSig;
1857
1858     aSig = extractFloat32Frac( a );
1859     aExp = extractFloat32Exp( a );
1860     aSign = extractFloat32Sign( a );
1861     bSig = extractFloat32Frac( b );
1862     bExp = extractFloat32Exp( b );
1863     bSign = extractFloat32Sign( b );
1864     zSign = aSign ^ bSign;
1865     if ( aExp == 0xFF ) {
1866         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1867             return propagateFloat32NaN( a, b );
1868         }
1869         if ( ( bExp | bSig ) == 0 ) {
1870             float_raise( float_flag_invalid );
1871             return float32_default_nan;
1872         }
1873         return packFloat32( zSign, 0xFF, 0 );
1874     }
1875     if ( bExp == 0xFF ) {
1876         if ( bSig ) return propagateFloat32NaN( a, b );
1877         if ( ( aExp | aSig ) == 0 ) {
1878             float_raise( float_flag_invalid );
1879             return float32_default_nan;
1880         }
1881         return packFloat32( zSign, 0xFF, 0 );
1882     }
1883     if ( aExp == 0 ) {
1884         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1885         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1886     }
1887     if ( bExp == 0 ) {
1888         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1889         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1890     }
1891     zExp = aExp + bExp - 0x7F;
1892     aSig = ( aSig | 0x00800000 )<<7;
1893     bSig = ( bSig | 0x00800000 )<<8;
1894     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1895     zSig = zSig64;
1896     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1897         zSig <<= 1;
1898         --zExp;
1899     }
1900     return roundAndPackFloat32( zSign, zExp, zSig );
1901
1902 }
1903
1904 /*
1905 -------------------------------------------------------------------------------
1906 Returns the result of dividing the single-precision floating-point value `a'
1907 by the corresponding value `b'.  The operation is performed according to the
1908 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 -------------------------------------------------------------------------------
1910 */
1911 float32 float32_div( float32 a, float32 b )
1912 {
1913     flag aSign, bSign, zSign;
1914     int16 aExp, bExp, zExp;
1915     bits32 aSig, bSig, zSig;
1916
1917     aSig = extractFloat32Frac( a );
1918     aExp = extractFloat32Exp( a );
1919     aSign = extractFloat32Sign( a );
1920     bSig = extractFloat32Frac( b );
1921     bExp = extractFloat32Exp( b );
1922     bSign = extractFloat32Sign( b );
1923     zSign = aSign ^ bSign;
1924     if ( aExp == 0xFF ) {
1925         if ( aSig ) return propagateFloat32NaN( a, b );
1926         if ( bExp == 0xFF ) {
1927             if ( bSig ) return propagateFloat32NaN( a, b );
1928             float_raise( float_flag_invalid );
1929             return float32_default_nan;
1930         }
1931         return packFloat32( zSign, 0xFF, 0 );
1932     }
1933     if ( bExp == 0xFF ) {
1934         if ( bSig ) return propagateFloat32NaN( a, b );
1935         return packFloat32( zSign, 0, 0 );
1936     }
1937     if ( bExp == 0 ) {
1938         if ( bSig == 0 ) {
1939             if ( ( aExp | aSig ) == 0 ) {
1940                 float_raise( float_flag_invalid );
1941                 return float32_default_nan;
1942             }
1943             float_raise( float_flag_divbyzero );
1944             return packFloat32( zSign, 0xFF, 0 );
1945         }
1946         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1947     }
1948     if ( aExp == 0 ) {
1949         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1950         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1951     }
1952     zExp = aExp - bExp + 0x7D;
1953     aSig = ( aSig | 0x00800000 )<<7;
1954     bSig = ( bSig | 0x00800000 )<<8;
1955     if ( bSig <= ( aSig + aSig ) ) {
1956         aSig >>= 1;
1957         ++zExp;
1958     }
1959     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1960     if ( ( zSig & 0x3F ) == 0 ) {
1961         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1962     }
1963     return roundAndPackFloat32( zSign, zExp, zSig );
1964
1965 }
1966
1967 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1968 /*
1969 -------------------------------------------------------------------------------
1970 Returns the remainder of the single-precision floating-point value `a'
1971 with respect to the corresponding value `b'.  The operation is performed
1972 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1973 -------------------------------------------------------------------------------
1974 */
1975 float32 float32_rem( float32 a, float32 b )
1976 {
1977     flag aSign, bSign, zSign;
1978     int16 aExp, bExp, expDiff;
1979     bits32 aSig, bSig;
1980     bits32 q;
1981     bits64 aSig64, bSig64, q64;
1982     bits32 alternateASig;
1983     sbits32 sigMean;
1984
1985     aSig = extractFloat32Frac( a );
1986     aExp = extractFloat32Exp( a );
1987     aSign = extractFloat32Sign( a );
1988     bSig = extractFloat32Frac( b );
1989     bExp = extractFloat32Exp( b );
1990     bSign = extractFloat32Sign( b );
1991     if ( aExp == 0xFF ) {
1992         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1993             return propagateFloat32NaN( a, b );
1994         }
1995         float_raise( float_flag_invalid );
1996         return float32_default_nan;
1997     }
1998     if ( bExp == 0xFF ) {
1999         if ( bSig ) return propagateFloat32NaN( a, b );
2000         return a;
2001     }
2002     if ( bExp == 0 ) {
2003         if ( bSig == 0 ) {
2004             float_raise( float_flag_invalid );
2005             return float32_default_nan;
2006         }
2007         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2008     }
2009     if ( aExp == 0 ) {
2010         if ( aSig == 0 ) return a;
2011         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2012     }
2013     expDiff = aExp - bExp;
2014     aSig |= 0x00800000;
2015     bSig |= 0x00800000;
2016     if ( expDiff < 32 ) {
2017         aSig <<= 8;
2018         bSig <<= 8;
2019         if ( expDiff < 0 ) {
2020             if ( expDiff < -1 ) return a;
2021             aSig >>= 1;
2022         }
2023         q = ( bSig <= aSig );
2024         if ( q ) aSig -= bSig;
2025         if ( 0 < expDiff ) {
2026             q = ( ( (bits64) aSig )<<32 ) / bSig;
2027             q >>= 32 - expDiff;
2028             bSig >>= 2;
2029             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2030         }
2031         else {
2032             aSig >>= 2;
2033             bSig >>= 2;
2034         }
2035     }
2036     else {
2037         if ( bSig <= aSig ) aSig -= bSig;
2038         aSig64 = ( (bits64) aSig )<<40;
2039         bSig64 = ( (bits64) bSig )<<40;
2040         expDiff -= 64;
2041         while ( 0 < expDiff ) {
2042             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2043             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2044             aSig64 = - ( ( bSig * q64 )<<38 );
2045             expDiff -= 62;
2046         }
2047         expDiff += 64;
2048         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2049         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2050         q = q64>>( 64 - expDiff );
2051         bSig <<= 6;
2052         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2053     }
2054     do {
2055         alternateASig = aSig;
2056         ++q;
2057         aSig -= bSig;
2058     } while ( 0 <= (sbits32) aSig );
2059     sigMean = aSig + alternateASig;
2060     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2061         aSig = alternateASig;
2062     }
2063     zSign = ( (sbits32) aSig < 0 );
2064     if ( zSign ) aSig = - aSig;
2065     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2066
2067 }
2068 #endif /* !SOFTFLOAT_FOR_GCC */
2069
2070 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2071 /*
2072 -------------------------------------------------------------------------------
2073 Returns the square root of the single-precision floating-point value `a'.
2074 The operation is performed according to the IEC/IEEE Standard for Binary
2075 Floating-Point Arithmetic.
2076 -------------------------------------------------------------------------------
2077 */
2078 float32 float32_sqrt( float32 a )
2079 {
2080     flag aSign;
2081     int16 aExp, zExp;
2082     bits32 aSig, zSig;
2083     bits64 rem, term;
2084
2085     aSig = extractFloat32Frac( a );
2086     aExp = extractFloat32Exp( a );
2087     aSign = extractFloat32Sign( a );
2088     if ( aExp == 0xFF ) {
2089         if ( aSig ) return propagateFloat32NaN( a, 0 );
2090         if ( ! aSign ) return a;
2091         float_raise( float_flag_invalid );
2092         return float32_default_nan;
2093     }
2094     if ( aSign ) {
2095         if ( ( aExp | aSig ) == 0 ) return a;
2096         float_raise( float_flag_invalid );
2097         return float32_default_nan;
2098     }
2099     if ( aExp == 0 ) {
2100         if ( aSig == 0 ) return 0;
2101         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2102     }
2103     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2104     aSig = ( aSig | 0x00800000 )<<8;
2105     zSig = estimateSqrt32( aExp, aSig ) + 2;
2106     if ( ( zSig & 0x7F ) <= 5 ) {
2107         if ( zSig < 2 ) {
2108             zSig = 0x7FFFFFFF;
2109             goto roundAndPack;
2110         }
2111         aSig >>= aExp & 1;
2112         term = ( (bits64) zSig ) * zSig;
2113         rem = ( ( (bits64) aSig )<<32 ) - term;
2114         while ( (sbits64) rem < 0 ) {
2115             --zSig;
2116             rem += ( ( (bits64) zSig )<<1 ) | 1;
2117         }
2118         zSig |= ( rem != 0 );
2119     }
2120     shift32RightJamming( zSig, 1, &zSig );
2121  roundAndPack:
2122     return roundAndPackFloat32( 0, zExp, zSig );
2123
2124 }
2125 #endif /* !SOFTFLOAT_FOR_GCC */
2126
2127 /*
2128 -------------------------------------------------------------------------------
2129 Returns 1 if the single-precision floating-point value `a' is equal to
2130 the corresponding value `b', and 0 otherwise.  The comparison is performed
2131 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2132 -------------------------------------------------------------------------------
2133 */
2134 flag float32_eq( float32 a, float32 b )
2135 {
2136
2137     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2138          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2139        ) {
2140         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2141             float_raise( float_flag_invalid );
2142         }
2143         return 0;
2144     }
2145     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2146
2147 }
2148
2149 /*
2150 -------------------------------------------------------------------------------
2151 Returns 1 if the single-precision floating-point value `a' is less than
2152 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2153 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2154 Arithmetic.
2155 -------------------------------------------------------------------------------
2156 */
2157 flag float32_le( float32 a, float32 b )
2158 {
2159     flag aSign, bSign;
2160
2161     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2162          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2163        ) {
2164         float_raise( float_flag_invalid );
2165         return 0;
2166     }
2167     aSign = extractFloat32Sign( a );
2168     bSign = extractFloat32Sign( b );
2169     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2170     return ( a == b ) || ( aSign ^ ( a < b ) );
2171
2172 }
2173
2174 /*
2175 -------------------------------------------------------------------------------
2176 Returns 1 if the single-precision floating-point value `a' is less than
2177 the corresponding value `b', and 0 otherwise.  The comparison is performed
2178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2179 -------------------------------------------------------------------------------
2180 */
2181 flag float32_lt( float32 a, float32 b )
2182 {
2183     flag aSign, bSign;
2184
2185     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2186          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2187        ) {
2188         float_raise( float_flag_invalid );
2189         return 0;
2190     }
2191     aSign = extractFloat32Sign( a );
2192     bSign = extractFloat32Sign( b );
2193     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2194     return ( a != b ) && ( aSign ^ ( a < b ) );
2195
2196 }
2197
2198 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2199 /*
2200 -------------------------------------------------------------------------------
2201 Returns 1 if the single-precision floating-point value `a' is equal to
2202 the corresponding value `b', and 0 otherwise.  The invalid exception is
2203 raised if either operand is a NaN.  Otherwise, the comparison is performed
2204 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2205 -------------------------------------------------------------------------------
2206 */
2207 flag float32_eq_signaling( float32 a, float32 b )
2208 {
2209
2210     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2211          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2212        ) {
2213         float_raise( float_flag_invalid );
2214         return 0;
2215     }
2216     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2217
2218 }
2219
2220 /*
2221 -------------------------------------------------------------------------------
2222 Returns 1 if the single-precision floating-point value `a' is less than or
2223 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2224 cause an exception.  Otherwise, the comparison is performed according to the
2225 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2226 -------------------------------------------------------------------------------
2227 */
2228 flag float32_le_quiet( float32 a, float32 b )
2229 {
2230     flag aSign, bSign;
2231
2232     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2233          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2234        ) {
2235         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2236             float_raise( float_flag_invalid );
2237         }
2238         return 0;
2239     }
2240     aSign = extractFloat32Sign( a );
2241     bSign = extractFloat32Sign( b );
2242     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2243     return ( a == b ) || ( aSign ^ ( a < b ) );
2244
2245 }
2246
2247 /*
2248 -------------------------------------------------------------------------------
2249 Returns 1 if the single-precision floating-point value `a' is less than
2250 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2251 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2252 Standard for Binary Floating-Point Arithmetic.
2253 -------------------------------------------------------------------------------
2254 */
2255 flag float32_lt_quiet( float32 a, float32 b )
2256 {
2257     flag aSign, bSign;
2258
2259     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2260          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2261        ) {
2262         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2263             float_raise( float_flag_invalid );
2264         }
2265         return 0;
2266     }
2267     aSign = extractFloat32Sign( a );
2268     bSign = extractFloat32Sign( b );
2269     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2270     return ( a != b ) && ( aSign ^ ( a < b ) );
2271
2272 }
2273 #endif /* !SOFTFLOAT_FOR_GCC */
2274
2275 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2276 /*
2277 -------------------------------------------------------------------------------
2278 Returns the result of converting the double-precision floating-point value
2279 `a' to the 32-bit two's complement integer format.  The conversion is
2280 performed according to the IEC/IEEE Standard for Binary Floating-Point
2281 Arithmetic---which means in particular that the conversion is rounded
2282 according to the current rounding mode.  If `a' is a NaN, the largest
2283 positive integer is returned.  Otherwise, if the conversion overflows, the
2284 largest integer with the same sign as `a' is returned.
2285 -------------------------------------------------------------------------------
2286 */
2287 int32 float64_to_int32( float64 a )
2288 {
2289     flag aSign;
2290     int16 aExp, shiftCount;
2291     bits64 aSig;
2292
2293     aSig = extractFloat64Frac( a );
2294     aExp = extractFloat64Exp( a );
2295     aSign = extractFloat64Sign( a );
2296     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2297     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2298     shiftCount = 0x42C - aExp;
2299     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2300     return roundAndPackInt32( aSign, aSig );
2301
2302 }
2303 #endif /* !SOFTFLOAT_FOR_GCC */
2304
2305 /*
2306 -------------------------------------------------------------------------------
2307 Returns the result of converting the double-precision floating-point value
2308 `a' to the 32-bit two's complement integer format.  The conversion is
2309 performed according to the IEC/IEEE Standard for Binary Floating-Point
2310 Arithmetic, except that the conversion is always rounded toward zero.
2311 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2312 the conversion overflows, the largest integer with the same sign as `a' is
2313 returned.
2314 -------------------------------------------------------------------------------
2315 */
2316 int32 float64_to_int32_round_to_zero( float64 a )
2317 {
2318     flag aSign;
2319     int16 aExp, shiftCount;
2320     bits64 aSig, savedASig;
2321     int32 z;
2322
2323     aSig = extractFloat64Frac( a );
2324     aExp = extractFloat64Exp( a );
2325     aSign = extractFloat64Sign( a );
2326     if ( 0x41E < aExp ) {
2327         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2328         goto invalid;
2329     }
2330     else if ( aExp < 0x3FF ) {
2331         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2332         return 0;
2333     }
2334     aSig |= LIT64( 0x0010000000000000 );
2335     shiftCount = 0x433 - aExp;
2336     savedASig = aSig;
2337     aSig >>= shiftCount;
2338     z = aSig;
2339     if ( aSign ) z = - z;
2340     if ( ( z < 0 ) ^ aSign ) {
2341  invalid:
2342         float_raise( float_flag_invalid );
2343         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2344     }
2345     if ( ( aSig<<shiftCount ) != savedASig ) {
2346         float_exception_flags |= float_flag_inexact;
2347     }
2348     return z;
2349
2350 }
2351
2352 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2353 /*
2354 -------------------------------------------------------------------------------
2355 Returns the result of converting the double-precision floating-point value
2356 `a' to the 64-bit two's complement integer format.  The conversion is
2357 performed according to the IEC/IEEE Standard for Binary Floating-Point
2358 Arithmetic---which means in particular that the conversion is rounded
2359 according to the current rounding mode.  If `a' is a NaN, the largest
2360 positive integer is returned.  Otherwise, if the conversion overflows, the
2361 largest integer with the same sign as `a' is returned.
2362 -------------------------------------------------------------------------------
2363 */
2364 int64 float64_to_int64( float64 a )
2365 {
2366     flag aSign;
2367     int16 aExp, shiftCount;
2368     bits64 aSig, aSigExtra;
2369
2370     aSig = extractFloat64Frac( a );
2371     aExp = extractFloat64Exp( a );
2372     aSign = extractFloat64Sign( a );
2373     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2374     shiftCount = 0x433 - aExp;
2375     if ( shiftCount <= 0 ) {
2376         if ( 0x43E < aExp ) {
2377             float_raise( float_flag_invalid );
2378             if (    ! aSign
2379                  || (    ( aExp == 0x7FF )
2380                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2381                ) {
2382                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2383             }
2384             return (sbits64) LIT64( 0x8000000000000000 );
2385         }
2386         aSigExtra = 0;
2387         aSig <<= - shiftCount;
2388     }
2389     else {
2390         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2391     }
2392     return roundAndPackInt64( aSign, aSig, aSigExtra );
2393
2394 }
2395
2396 /*
2397 -------------------------------------------------------------------------------
2398 Returns the result of converting the double-precision floating-point value
2399 `a' to the 64-bit two's complement integer format.  The conversion is
2400 performed according to the IEC/IEEE Standard for Binary Floating-Point
2401 Arithmetic, except that the conversion is always rounded toward zero.
2402 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2403 the conversion overflows, the largest integer with the same sign as `a' is
2404 returned.
2405 -------------------------------------------------------------------------------
2406 */
2407 int64 float64_to_int64_round_to_zero( float64 a )
2408 {
2409     flag aSign;
2410     int16 aExp, shiftCount;
2411     bits64 aSig;
2412     int64 z;
2413
2414     aSig = extractFloat64Frac( a );
2415     aExp = extractFloat64Exp( a );
2416     aSign = extractFloat64Sign( a );
2417     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418     shiftCount = aExp - 0x433;
2419     if ( 0 <= shiftCount ) {
2420         if ( 0x43E <= aExp ) {
2421             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2422                 float_raise( float_flag_invalid );
2423                 if (    ! aSign
2424                      || (    ( aExp == 0x7FF )
2425                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2426                    ) {
2427                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2428                 }
2429             }
2430             return (sbits64) LIT64( 0x8000000000000000 );
2431         }
2432         z = aSig<<shiftCount;
2433     }
2434     else {
2435         if ( aExp < 0x3FE ) {
2436             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2437             return 0;
2438         }
2439         z = aSig>>( - shiftCount );
2440         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2441             float_exception_flags |= float_flag_inexact;
2442         }
2443     }
2444     if ( aSign ) z = - z;
2445     return z;
2446
2447 }
2448 #endif /* !SOFTFLOAT_FOR_GCC */
2449
2450 /*
2451 -------------------------------------------------------------------------------
2452 Returns the result of converting the double-precision floating-point value
2453 `a' to the single-precision floating-point format.  The conversion is
2454 performed according to the IEC/IEEE Standard for Binary Floating-Point
2455 Arithmetic.
2456 -------------------------------------------------------------------------------
2457 */
2458 float32 float64_to_float32( float64 a )
2459 {
2460     flag aSign;
2461     int16 aExp;
2462     bits64 aSig;
2463     bits32 zSig;
2464
2465     aSig = extractFloat64Frac( a );
2466     aExp = extractFloat64Exp( a );
2467     aSign = extractFloat64Sign( a );
2468     if ( aExp == 0x7FF ) {
2469         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2470         return packFloat32( aSign, 0xFF, 0 );
2471     }
2472     shift64RightJamming( aSig, 22, &aSig );
2473     zSig = aSig;
2474     if ( aExp || zSig ) {
2475         zSig |= 0x40000000;
2476         aExp -= 0x381;
2477     }
2478     return roundAndPackFloat32( aSign, aExp, zSig );
2479
2480 }
2481
2482 #ifdef FLOATX80
2483
2484 /*
2485 -------------------------------------------------------------------------------
2486 Returns the result of converting the double-precision floating-point value
2487 `a' to the extended double-precision floating-point format.  The conversion
2488 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2489 Arithmetic.
2490 -------------------------------------------------------------------------------
2491 */
2492 floatx80 float64_to_floatx80( float64 a )
2493 {
2494     flag aSign;
2495     int16 aExp;
2496     bits64 aSig;
2497
2498     aSig = extractFloat64Frac( a );
2499     aExp = extractFloat64Exp( a );
2500     aSign = extractFloat64Sign( a );
2501     if ( aExp == 0x7FF ) {
2502         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2503         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2504     }
2505     if ( aExp == 0 ) {
2506         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2507         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2508     }
2509     return
2510         packFloatx80(
2511             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2512
2513 }
2514
2515 #endif
2516
2517 #ifdef FLOAT128
2518
2519 /*
2520 -------------------------------------------------------------------------------
2521 Returns the result of converting the double-precision floating-point value
2522 `a' to the quadruple-precision floating-point format.  The conversion is
2523 performed according to the IEC/IEEE Standard for Binary Floating-Point
2524 Arithmetic.
2525 -------------------------------------------------------------------------------
2526 */
2527 float128 float64_to_float128( float64 a )
2528 {
2529     flag aSign;
2530     int16 aExp;
2531     bits64 aSig, zSig0, zSig1;
2532
2533     aSig = extractFloat64Frac( a );
2534     aExp = extractFloat64Exp( a );
2535     aSign = extractFloat64Sign( a );
2536     if ( aExp == 0x7FF ) {
2537         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2538         return packFloat128( aSign, 0x7FFF, 0, 0 );
2539     }
2540     if ( aExp == 0 ) {
2541         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2542         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2543         --aExp;
2544     }
2545     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2546     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2547
2548 }
2549
2550 #endif
2551
2552 #ifndef SOFTFLOAT_FOR_GCC
2553 /*
2554 -------------------------------------------------------------------------------
2555 Rounds the double-precision floating-point value `a' to an integer, and
2556 returns the result as a double-precision floating-point value.  The
2557 operation is performed according to the IEC/IEEE Standard for Binary
2558 Floating-Point Arithmetic.
2559 -------------------------------------------------------------------------------
2560 */
2561 float64 float64_round_to_int( float64 a )
2562 {
2563     flag aSign;
2564     int16 aExp;
2565     bits64 lastBitMask, roundBitsMask;
2566     int8 roundingMode;
2567     float64 z;
2568
2569     aExp = extractFloat64Exp( a );
2570     if ( 0x433 <= aExp ) {
2571         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2572             return propagateFloat64NaN( a, a );
2573         }
2574         return a;
2575     }
2576     if ( aExp < 0x3FF ) {
2577         if ( (bits64) ( a<<1 ) == 0 ) return a;
2578         float_exception_flags |= float_flag_inexact;
2579         aSign = extractFloat64Sign( a );
2580         switch ( float_rounding_mode ) {
2581          case float_round_nearest_even:
2582             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2583                 return packFloat64( aSign, 0x3FF, 0 );
2584             }
2585             break;
2586          case float_round_to_zero:
2587             break;
2588          case float_round_down:
2589             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2590          case float_round_up:
2591             return
2592             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2593         }
2594         return packFloat64( aSign, 0, 0 );
2595     }
2596     lastBitMask = 1;
2597     lastBitMask <<= 0x433 - aExp;
2598     roundBitsMask = lastBitMask - 1;
2599     z = a;
2600     roundingMode = float_rounding_mode;
2601     if ( roundingMode == float_round_nearest_even ) {
2602         z += lastBitMask>>1;
2603         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2604     }
2605     else if ( roundingMode != float_round_to_zero ) {
2606         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2607             z += roundBitsMask;
2608         }
2609     }
2610     z &= ~ roundBitsMask;
2611     if ( z != a ) float_exception_flags |= float_flag_inexact;
2612     return z;
2613
2614 }
2615 #endif
2616
2617 /*
2618 -------------------------------------------------------------------------------
2619 Returns the result of adding the absolute values of the double-precision
2620 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2621 before being returned.  `zSign' is ignored if the result is a NaN.
2622 The addition is performed according to the IEC/IEEE Standard for Binary
2623 Floating-Point Arithmetic.
2624 -------------------------------------------------------------------------------
2625 */
2626 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2627 {
2628     int16 aExp, bExp, zExp;
2629     bits64 aSig, bSig, zSig;
2630     int16 expDiff;
2631
2632     aSig = extractFloat64Frac( a );
2633     aExp = extractFloat64Exp( a );
2634     bSig = extractFloat64Frac( b );
2635     bExp = extractFloat64Exp( b );
2636     expDiff = aExp - bExp;
2637     aSig <<= 9;
2638     bSig <<= 9;
2639     if ( 0 < expDiff ) {
2640         if ( aExp == 0x7FF ) {
2641             if ( aSig ) return propagateFloat64NaN( a, b );
2642             return a;
2643         }
2644         if ( bExp == 0 ) {
2645             --expDiff;
2646         }
2647         else {
2648             bSig |= LIT64( 0x2000000000000000 );
2649         }
2650         shift64RightJamming( bSig, expDiff, &bSig );
2651         zExp = aExp;
2652     }
2653     else if ( expDiff < 0 ) {
2654         if ( bExp == 0x7FF ) {
2655             if ( bSig ) return propagateFloat64NaN( a, b );
2656             return packFloat64( zSign, 0x7FF, 0 );
2657         }
2658         if ( aExp == 0 ) {
2659             ++expDiff;
2660         }
2661         else {
2662             aSig |= LIT64( 0x2000000000000000 );
2663         }
2664         shift64RightJamming( aSig, - expDiff, &aSig );
2665         zExp = bExp;
2666     }
2667     else {
2668         if ( aExp == 0x7FF ) {
2669             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2670             return a;
2671         }
2672         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2673         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2674         zExp = aExp;
2675         goto roundAndPack;
2676     }
2677     aSig |= LIT64( 0x2000000000000000 );
2678     zSig = ( aSig + bSig )<<1;
2679     --zExp;
2680     if ( (sbits64) zSig < 0 ) {
2681         zSig = aSig + bSig;
2682         ++zExp;
2683     }
2684  roundAndPack:
2685     return roundAndPackFloat64( zSign, zExp, zSig );
2686
2687 }
2688
2689 /*
2690 -------------------------------------------------------------------------------
2691 Returns the result of subtracting the absolute values of the double-
2692 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2693 difference is negated before being returned.  `zSign' is ignored if the
2694 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2695 Standard for Binary Floating-Point Arithmetic.
2696 -------------------------------------------------------------------------------
2697 */
2698 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2699 {
2700     int16 aExp, bExp, zExp;
2701     bits64 aSig, bSig, zSig;
2702     int16 expDiff;
2703
2704     aSig = extractFloat64Frac( a );
2705     aExp = extractFloat64Exp( a );
2706     bSig = extractFloat64Frac( b );
2707     bExp = extractFloat64Exp( b );
2708     expDiff = aExp - bExp;
2709     aSig <<= 10;
2710     bSig <<= 10;
2711     if ( 0 < expDiff ) goto aExpBigger;
2712     if ( expDiff < 0 ) goto bExpBigger;
2713     if ( aExp == 0x7FF ) {
2714         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2715         float_raise( float_flag_invalid );
2716         return float64_default_nan;
2717     }
2718     if ( aExp == 0 ) {
2719         aExp = 1;
2720         bExp = 1;
2721     }
2722     if ( bSig < aSig ) goto aBigger;
2723     if ( aSig < bSig ) goto bBigger;
2724     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2725  bExpBigger:
2726     if ( bExp == 0x7FF ) {
2727         if ( bSig ) return propagateFloat64NaN( a, b );
2728         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2729     }
2730     if ( aExp == 0 ) {
2731         ++expDiff;
2732     }
2733     else {
2734         aSig |= LIT64( 0x4000000000000000 );
2735     }
2736     shift64RightJamming( aSig, - expDiff, &aSig );
2737     bSig |= LIT64( 0x4000000000000000 );
2738  bBigger:
2739     zSig = bSig - aSig;
2740     zExp = bExp;
2741     zSign ^= 1;
2742     goto normalizeRoundAndPack;
2743  aExpBigger:
2744     if ( aExp == 0x7FF ) {
2745         if ( aSig ) return propagateFloat64NaN( a, b );
2746         return a;
2747     }
2748     if ( bExp == 0 ) {
2749         --expDiff;
2750     }
2751     else {
2752         bSig |= LIT64( 0x4000000000000000 );
2753     }
2754     shift64RightJamming( bSig, expDiff, &bSig );
2755     aSig |= LIT64( 0x4000000000000000 );
2756  aBigger:
2757     zSig = aSig - bSig;
2758     zExp = aExp;
2759  normalizeRoundAndPack:
2760     --zExp;
2761     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2762
2763 }
2764
2765 /*
2766 -------------------------------------------------------------------------------
2767 Returns the result of adding the double-precision floating-point values `a'
2768 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2769 Binary Floating-Point Arithmetic.
2770 -------------------------------------------------------------------------------
2771 */
2772 float64 float64_add( float64 a, float64 b )
2773 {
2774     flag aSign, bSign;
2775
2776     aSign = extractFloat64Sign( a );
2777     bSign = extractFloat64Sign( b );
2778     if ( aSign == bSign ) {
2779         return addFloat64Sigs( a, b, aSign );
2780     }
2781     else {
2782         return subFloat64Sigs( a, b, aSign );
2783     }
2784
2785 }
2786
2787 /*
2788 -------------------------------------------------------------------------------
2789 Returns the result of subtracting the double-precision floating-point values
2790 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2791 for Binary Floating-Point Arithmetic.
2792 -------------------------------------------------------------------------------
2793 */
2794 float64 float64_sub( float64 a, float64 b )
2795 {
2796     flag aSign, bSign;
2797
2798     aSign = extractFloat64Sign( a );
2799     bSign = extractFloat64Sign( b );
2800     if ( aSign == bSign ) {
2801         return subFloat64Sigs( a, b, aSign );
2802     }
2803     else {
2804         return addFloat64Sigs( a, b, aSign );
2805     }
2806
2807 }
2808
2809 /*
2810 -------------------------------------------------------------------------------
2811 Returns the result of multiplying the double-precision floating-point values
2812 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2813 for Binary Floating-Point Arithmetic.
2814 -------------------------------------------------------------------------------
2815 */
2816 float64 float64_mul( float64 a, float64 b )
2817 {
2818     flag aSign, bSign, zSign;
2819     int16 aExp, bExp, zExp;
2820     bits64 aSig, bSig, zSig0, zSig1;
2821
2822     aSig = extractFloat64Frac( a );
2823     aExp = extractFloat64Exp( a );
2824     aSign = extractFloat64Sign( a );
2825     bSig = extractFloat64Frac( b );
2826     bExp = extractFloat64Exp( b );
2827     bSign = extractFloat64Sign( b );
2828     zSign = aSign ^ bSign;
2829     if ( aExp == 0x7FF ) {
2830         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2831             return propagateFloat64NaN( a, b );
2832         }
2833         if ( ( bExp | bSig ) == 0 ) {
2834             float_raise( float_flag_invalid );
2835             return float64_default_nan;
2836         }
2837         return packFloat64( zSign, 0x7FF, 0 );
2838     }
2839     if ( bExp == 0x7FF ) {
2840         if ( bSig ) return propagateFloat64NaN( a, b );
2841         if ( ( aExp | aSig ) == 0 ) {
2842             float_raise( float_flag_invalid );
2843             return float64_default_nan;
2844         }
2845         return packFloat64( zSign, 0x7FF, 0 );
2846     }
2847     if ( aExp == 0 ) {
2848         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2849         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2850     }
2851     if ( bExp == 0 ) {
2852         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2853         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2854     }
2855     zExp = aExp + bExp - 0x3FF;
2856     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2857     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2858     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2859     zSig0 |= ( zSig1 != 0 );
2860     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2861         zSig0 <<= 1;
2862         --zExp;
2863     }
2864     return roundAndPackFloat64( zSign, zExp, zSig0 );
2865
2866 }
2867
2868 /*
2869 -------------------------------------------------------------------------------
2870 Returns the result of dividing the double-precision floating-point value `a'
2871 by the corresponding value `b'.  The operation is performed according to
2872 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2873 -------------------------------------------------------------------------------
2874 */
2875 float64 float64_div( float64 a, float64 b )
2876 {
2877     flag aSign, bSign, zSign;
2878     int16 aExp, bExp, zExp;
2879     bits64 aSig, bSig, zSig;
2880     bits64 rem0, rem1;
2881     bits64 term0, term1;
2882
2883     aSig = extractFloat64Frac( a );
2884     aExp = extractFloat64Exp( a );
2885     aSign = extractFloat64Sign( a );
2886     bSig = extractFloat64Frac( b );
2887     bExp = extractFloat64Exp( b );
2888     bSign = extractFloat64Sign( b );
2889     zSign = aSign ^ bSign;
2890     if ( aExp == 0x7FF ) {
2891         if ( aSig ) return propagateFloat64NaN( a, b );
2892         if ( bExp == 0x7FF ) {
2893             if ( bSig ) return propagateFloat64NaN( a, b );
2894             float_raise( float_flag_invalid );
2895             return float64_default_nan;
2896         }
2897         return packFloat64( zSign, 0x7FF, 0 );
2898     }
2899     if ( bExp == 0x7FF ) {
2900         if ( bSig ) return propagateFloat64NaN( a, b );
2901         return packFloat64( zSign, 0, 0 );
2902     }
2903     if ( bExp == 0 ) {
2904         if ( bSig == 0 ) {
2905             if ( ( aExp | aSig ) == 0 ) {
2906                 float_raise( float_flag_invalid );
2907                 return float64_default_nan;
2908             }
2909             float_raise( float_flag_divbyzero );
2910             return packFloat64( zSign, 0x7FF, 0 );
2911         }
2912         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2913     }
2914     if ( aExp == 0 ) {
2915         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2916         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2917     }
2918     zExp = aExp - bExp + 0x3FD;
2919     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2920     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2921     if ( bSig <= ( aSig + aSig ) ) {
2922         aSig >>= 1;
2923         ++zExp;
2924     }
2925     zSig = estimateDiv128To64( aSig, 0, bSig );
2926     if ( ( zSig & 0x1FF ) <= 2 ) {
2927         mul64To128( bSig, zSig, &term0, &term1 );
2928         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2929         while ( (sbits64) rem0 < 0 ) {
2930             --zSig;
2931             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2932         }
2933         zSig |= ( rem1 != 0 );
2934     }
2935     return roundAndPackFloat64( zSign, zExp, zSig );
2936
2937 }
2938
2939 #ifndef SOFTFLOAT_FOR_GCC
2940 /*
2941 -------------------------------------------------------------------------------
2942 Returns the remainder of the double-precision floating-point value `a'
2943 with respect to the corresponding value `b'.  The operation is performed
2944 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2945 -------------------------------------------------------------------------------
2946 */
2947 float64 float64_rem( float64 a, float64 b )
2948 {
2949     flag aSign, bSign, zSign;
2950     int16 aExp, bExp, expDiff;
2951     bits64 aSig, bSig;
2952     bits64 q, alternateASig;
2953     sbits64 sigMean;
2954
2955     aSig = extractFloat64Frac( a );
2956     aExp = extractFloat64Exp( a );
2957     aSign = extractFloat64Sign( a );
2958     bSig = extractFloat64Frac( b );
2959     bExp = extractFloat64Exp( b );
2960     bSign = extractFloat64Sign( b );
2961     if ( aExp == 0x7FF ) {
2962         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2963             return propagateFloat64NaN( a, b );
2964         }
2965         float_raise( float_flag_invalid );
2966         return float64_default_nan;
2967     }
2968     if ( bExp == 0x7FF ) {
2969         if ( bSig ) return propagateFloat64NaN( a, b );
2970         return a;
2971     }
2972     if ( bExp == 0 ) {
2973         if ( bSig == 0 ) {
2974             float_raise( float_flag_invalid );
2975             return float64_default_nan;
2976         }
2977         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2978     }
2979     if ( aExp == 0 ) {
2980         if ( aSig == 0 ) return a;
2981         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2982     }
2983     expDiff = aExp - bExp;
2984     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2985     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2986     if ( expDiff < 0 ) {
2987         if ( expDiff < -1 ) return a;
2988         aSig >>= 1;
2989     }
2990     q = ( bSig <= aSig );
2991     if ( q ) aSig -= bSig;
2992     expDiff -= 64;
2993     while ( 0 < expDiff ) {
2994         q = estimateDiv128To64( aSig, 0, bSig );
2995         q = ( 2 < q ) ? q - 2 : 0;
2996         aSig = - ( ( bSig>>2 ) * q );
2997         expDiff -= 62;
2998     }
2999     expDiff += 64;
3000     if ( 0 < expDiff ) {
3001         q = estimateDiv128To64( aSig, 0, bSig );
3002         q = ( 2 < q ) ? q - 2 : 0;
3003         q >>= 64 - expDiff;
3004         bSig >>= 2;
3005         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3006     }
3007     else {
3008         aSig >>= 2;
3009         bSig >>= 2;
3010     }
3011     do {
3012         alternateASig = aSig;
3013         ++q;
3014         aSig -= bSig;
3015     } while ( 0 <= (sbits64) aSig );
3016     sigMean = aSig + alternateASig;
3017     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3018         aSig = alternateASig;
3019     }
3020     zSign = ( (sbits64) aSig < 0 );
3021     if ( zSign ) aSig = - aSig;
3022     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3023
3024 }
3025
3026 /*
3027 -------------------------------------------------------------------------------
3028 Returns the square root of the double-precision floating-point value `a'.
3029 The operation is performed according to the IEC/IEEE Standard for Binary
3030 Floating-Point Arithmetic.
3031 -------------------------------------------------------------------------------
3032 */
3033 float64 float64_sqrt( float64 a )
3034 {
3035     flag aSign;
3036     int16 aExp, zExp;
3037     bits64 aSig, zSig, doubleZSig;
3038     bits64 rem0, rem1, term0, term1;
3039
3040     aSig = extractFloat64Frac( a );
3041     aExp = extractFloat64Exp( a );
3042     aSign = extractFloat64Sign( a );
3043     if ( aExp == 0x7FF ) {
3044         if ( aSig ) return propagateFloat64NaN( a, a );
3045         if ( ! aSign ) return a;
3046         float_raise( float_flag_invalid );
3047         return float64_default_nan;
3048     }
3049     if ( aSign ) {
3050         if ( ( aExp | aSig ) == 0 ) return a;
3051         float_raise( float_flag_invalid );
3052         return float64_default_nan;
3053     }
3054     if ( aExp == 0 ) {
3055         if ( aSig == 0 ) return 0;
3056         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3057     }
3058     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3059     aSig |= LIT64( 0x0010000000000000 );
3060     zSig = estimateSqrt32( aExp, aSig>>21 );
3061     aSig <<= 9 - ( aExp & 1 );
3062     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3063     if ( ( zSig & 0x1FF ) <= 5 ) {
3064         doubleZSig = zSig<<1;
3065         mul64To128( zSig, zSig, &term0, &term1 );
3066         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3067         while ( (sbits64) rem0 < 0 ) {
3068             --zSig;
3069             doubleZSig -= 2;
3070             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3071         }
3072         zSig |= ( ( rem0 | rem1 ) != 0 );
3073     }
3074     return roundAndPackFloat64( 0, zExp, zSig );
3075
3076 }
3077 #endif
3078
3079 /*
3080 -------------------------------------------------------------------------------
3081 Returns 1 if the double-precision floating-point value `a' is equal to the
3082 corresponding value `b', and 0 otherwise.  The comparison is performed
3083 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3084 -------------------------------------------------------------------------------
3085 */
3086 flag float64_eq( float64 a, float64 b )
3087 {
3088
3089     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3090          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3091        ) {
3092         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3093             float_raise( float_flag_invalid );
3094         }
3095         return 0;
3096     }
3097     return ( a == b ) ||
3098         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3099
3100 }
3101
3102 /*
3103 -------------------------------------------------------------------------------
3104 Returns 1 if the double-precision floating-point value `a' is less than or
3105 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3106 performed according to the IEC/IEEE Standard for Binary Floating-Point
3107 Arithmetic.
3108 -------------------------------------------------------------------------------
3109 */
3110 flag float64_le( float64 a, float64 b )
3111 {
3112     flag aSign, bSign;
3113
3114     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3115          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3116        ) {
3117         float_raise( float_flag_invalid );
3118         return 0;
3119     }
3120     aSign = extractFloat64Sign( a );
3121     bSign = extractFloat64Sign( b );
3122     if ( aSign != bSign )
3123         return aSign ||
3124             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3125               0 );
3126     return ( a == b ) ||
3127         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3128
3129 }
3130
3131 /*
3132 -------------------------------------------------------------------------------
3133 Returns 1 if the double-precision floating-point value `a' is less than
3134 the corresponding value `b', and 0 otherwise.  The comparison is performed
3135 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3136 -------------------------------------------------------------------------------
3137 */
3138 flag float64_lt( float64 a, float64 b )
3139 {
3140     flag aSign, bSign;
3141
3142     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3143          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3144        ) {
3145         float_raise( float_flag_invalid );
3146         return 0;
3147     }
3148     aSign = extractFloat64Sign( a );
3149     bSign = extractFloat64Sign( b );
3150     if ( aSign != bSign )
3151         return aSign &&
3152             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3153               0 );
3154     return ( a != b ) &&
3155         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3156
3157 }
3158
3159 #ifndef SOFTFLOAT_FOR_GCC
3160 /*
3161 -------------------------------------------------------------------------------
3162 Returns 1 if the double-precision floating-point value `a' is equal to the
3163 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3164 if either operand is a NaN.  Otherwise, the comparison is performed
3165 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3166 -------------------------------------------------------------------------------
3167 */
3168 flag float64_eq_signaling( float64 a, float64 b )
3169 {
3170
3171     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3172          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3173        ) {
3174         float_raise( float_flag_invalid );
3175         return 0;
3176     }
3177     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3178
3179 }
3180
3181 /*
3182 -------------------------------------------------------------------------------
3183 Returns 1 if the double-precision floating-point value `a' is less than or
3184 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3185 cause an exception.  Otherwise, the comparison is performed according to the
3186 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3187 -------------------------------------------------------------------------------
3188 */
3189 flag float64_le_quiet( float64 a, float64 b )
3190 {
3191     flag aSign, bSign;
3192
3193     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3194          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3195        ) {
3196         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3197             float_raise( float_flag_invalid );
3198         }
3199         return 0;
3200     }
3201     aSign = extractFloat64Sign( a );
3202     bSign = extractFloat64Sign( b );
3203     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3204     return ( a == b ) || ( aSign ^ ( a < b ) );
3205
3206 }
3207
3208 /*
3209 -------------------------------------------------------------------------------
3210 Returns 1 if the double-precision floating-point value `a' is less than
3211 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3212 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3213 Standard for Binary Floating-Point Arithmetic.
3214 -------------------------------------------------------------------------------
3215 */
3216 flag float64_lt_quiet( float64 a, float64 b )
3217 {
3218     flag aSign, bSign;
3219
3220     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3221          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3222        ) {
3223         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3224             float_raise( float_flag_invalid );
3225         }
3226         return 0;
3227     }
3228     aSign = extractFloat64Sign( a );
3229     bSign = extractFloat64Sign( b );
3230     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3231     return ( a != b ) && ( aSign ^ ( a < b ) );
3232
3233 }
3234 #endif
3235
3236 #ifdef FLOATX80
3237
3238 /*
3239 -------------------------------------------------------------------------------
3240 Returns the result of converting the extended double-precision floating-
3241 point value `a' to the 32-bit two's complement integer format.  The
3242 conversion is performed according to the IEC/IEEE Standard for Binary
3243 Floating-Point Arithmetic---which means in particular that the conversion
3244 is rounded according to the current rounding mode.  If `a' is a NaN, the
3245 largest positive integer is returned.  Otherwise, if the conversion
3246 overflows, the largest integer with the same sign as `a' is returned.
3247 -------------------------------------------------------------------------------
3248 */
3249 int32 floatx80_to_int32( floatx80 a )
3250 {
3251     flag aSign;
3252     int32 aExp, shiftCount;
3253     bits64 aSig;
3254
3255     aSig = extractFloatx80Frac( a );
3256     aExp = extractFloatx80Exp( a );
3257     aSign = extractFloatx80Sign( a );
3258     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3259     shiftCount = 0x4037 - aExp;
3260     if ( shiftCount <= 0 ) shiftCount = 1;
3261     shift64RightJamming( aSig, shiftCount, &aSig );
3262     return roundAndPackInt32( aSign, aSig );
3263
3264 }
3265
3266 /*
3267 -------------------------------------------------------------------------------
3268 Returns the result of converting the extended double-precision floating-
3269 point value `a' to the 32-bit two's complement integer format.  The
3270 conversion is performed according to the IEC/IEEE Standard for Binary
3271 Floating-Point Arithmetic, except that the conversion is always rounded
3272 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3273 Otherwise, if the conversion overflows, the largest integer with the same
3274 sign as `a' is returned.
3275 -------------------------------------------------------------------------------
3276 */
3277 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3278 {
3279     flag aSign;
3280     int32 aExp, shiftCount;
3281     bits64 aSig, savedASig;
3282     int32 z;
3283
3284     aSig = extractFloatx80Frac( a );
3285     aExp = extractFloatx80Exp( a );
3286     aSign = extractFloatx80Sign( a );
3287     if ( 0x401E < aExp ) {
3288         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3289         goto invalid;
3290     }
3291     else if ( aExp < 0x3FFF ) {
3292         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3293         return 0;
3294     }
3295     shiftCount = 0x403E - aExp;
3296     savedASig = aSig;
3297     aSig >>= shiftCount;
3298     z = aSig;
3299     if ( aSign ) z = - z;
3300     if ( ( z < 0 ) ^ aSign ) {
3301  invalid:
3302         float_raise( float_flag_invalid );
3303         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3304     }
3305     if ( ( aSig<<shiftCount ) != savedASig ) {
3306         float_exception_flags |= float_flag_inexact;
3307     }
3308     return z;
3309
3310 }
3311
3312 /*
3313 -------------------------------------------------------------------------------
3314 Returns the result of converting the extended double-precision floating-
3315 point value `a' to the 64-bit two's complement integer format.  The
3316 conversion is performed according to the IEC/IEEE Standard for Binary
3317 Floating-Point Arithmetic---which means in particular that the conversion
3318 is rounded according to the current rounding mode.  If `a' is a NaN,
3319 the largest positive integer is returned.  Otherwise, if the conversion
3320 overflows, the largest integer with the same sign as `a' is returned.
3321 -------------------------------------------------------------------------------
3322 */
3323 int64 floatx80_to_int64( floatx80 a )
3324 {
3325     flag aSign;
3326     int32 aExp, shiftCount;
3327     bits64 aSig, aSigExtra;
3328
3329     aSig = extractFloatx80Frac( a );
3330     aExp = extractFloatx80Exp( a );
3331     aSign = extractFloatx80Sign( a );
3332     shiftCount = 0x403E - aExp;
3333     if ( shiftCount <= 0 ) {
3334         if ( shiftCount ) {
3335             float_raise( float_flag_invalid );
3336             if (    ! aSign
3337                  || (    ( aExp == 0x7FFF )
3338                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3339                ) {
3340                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3341             }
3342             return (sbits64) LIT64( 0x8000000000000000 );
3343         }
3344         aSigExtra = 0;
3345     }
3346     else {
3347         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3348     }
3349     return roundAndPackInt64( aSign, aSig, aSigExtra );
3350
3351 }
3352
3353 /*
3354 -------------------------------------------------------------------------------
3355 Returns the result of converting the extended double-precision floating-
3356 point value `a' to the 64-bit two's complement integer format.  The
3357 conversion is performed according to the IEC/IEEE Standard for Binary
3358 Floating-Point Arithmetic, except that the conversion is always rounded
3359 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3360 Otherwise, if the conversion overflows, the largest integer with the same
3361 sign as `a' is returned.
3362 -------------------------------------------------------------------------------
3363 */
3364 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3365 {
3366     flag aSign;
3367     int32 aExp, shiftCount;
3368     bits64 aSig;
3369     int64 z;
3370
3371     aSig = extractFloatx80Frac( a );
3372     aExp = extractFloatx80Exp( a );
3373     aSign = extractFloatx80Sign( a );
3374     shiftCount = aExp - 0x403E;
3375     if ( 0 <= shiftCount ) {
3376         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3377         if ( ( a.high != 0xC03E ) || aSig ) {
3378             float_raise( float_flag_invalid );
3379             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3380                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3381             }
3382         }
3383         return (sbits64) LIT64( 0x8000000000000000 );
3384     }
3385     else if ( aExp < 0x3FFF ) {
3386         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3387         return 0;
3388     }
3389     z = aSig>>( - shiftCount );
3390     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3391         float_exception_flags |= float_flag_inexact;
3392     }
3393     if ( aSign ) z = - z;
3394     return z;
3395
3396 }
3397
3398 /*
3399 -------------------------------------------------------------------------------
3400 Returns the result of converting the extended double-precision floating-
3401 point value `a' to the single-precision floating-point format.  The
3402 conversion is performed according to the IEC/IEEE Standard for Binary
3403 Floating-Point Arithmetic.
3404 -------------------------------------------------------------------------------
3405 */
3406 float32 floatx80_to_float32( floatx80 a )
3407 {
3408     flag aSign;
3409     int32 aExp;
3410     bits64 aSig;
3411
3412     aSig = extractFloatx80Frac( a );
3413     aExp = extractFloatx80Exp( a );
3414     aSign = extractFloatx80Sign( a );
3415     if ( aExp == 0x7FFF ) {
3416         if ( (bits64) ( aSig<<1 ) ) {
3417             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3418         }
3419         return packFloat32( aSign, 0xFF, 0 );
3420     }
3421     shift64RightJamming( aSig, 33, &aSig );
3422     if ( aExp || aSig ) aExp -= 0x3F81;
3423     return roundAndPackFloat32( aSign, aExp, aSig );
3424
3425 }
3426
3427 /*
3428 -------------------------------------------------------------------------------
3429 Returns the result of converting the extended double-precision floating-
3430 point value `a' to the double-precision floating-point format.  The
3431 conversion is performed according to the IEC/IEEE Standard for Binary
3432 Floating-Point Arithmetic.
3433 -------------------------------------------------------------------------------
3434 */
3435 float64 floatx80_to_float64( floatx80 a )
3436 {
3437     flag aSign;
3438     int32 aExp;
3439     bits64 aSig, zSig;
3440
3441     aSig = extractFloatx80Frac( a );
3442     aExp = extractFloatx80Exp( a );
3443     aSign = extractFloatx80Sign( a );
3444     if ( aExp == 0x7FFF ) {
3445         if ( (bits64) ( aSig<<1 ) ) {
3446             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3447         }
3448         return packFloat64( aSign, 0x7FF, 0 );
3449     }
3450     shift64RightJamming( aSig, 1, &zSig );
3451     if ( aExp || aSig ) aExp -= 0x3C01;
3452     return roundAndPackFloat64( aSign, aExp, zSig );
3453
3454 }
3455
3456 #ifdef FLOAT128
3457
3458 /*
3459 -------------------------------------------------------------------------------
3460 Returns the result of converting the extended double-precision floating-
3461 point value `a' to the quadruple-precision floating-point format.  The
3462 conversion is performed according to the IEC/IEEE Standard for Binary
3463 Floating-Point Arithmetic.
3464 -------------------------------------------------------------------------------
3465 */
3466 float128 floatx80_to_float128( floatx80 a )
3467 {
3468     flag aSign;
3469     int16 aExp;
3470     bits64 aSig, zSig0, zSig1;
3471
3472     aSig = extractFloatx80Frac( a );
3473     aExp = extractFloatx80Exp( a );
3474     aSign = extractFloatx80Sign( a );
3475     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3476         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3477     }
3478     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3479     return packFloat128( aSign, aExp, zSig0, zSig1 );
3480
3481 }
3482
3483 #endif
3484
3485 /*
3486 -------------------------------------------------------------------------------
3487 Rounds the extended double-precision floating-point value `a' to an integer,
3488 and returns the result as an extended quadruple-precision floating-point
3489 value.  The operation is performed according to the IEC/IEEE Standard for
3490 Binary Floating-Point Arithmetic.
3491 -------------------------------------------------------------------------------
3492 */
3493 floatx80 floatx80_round_to_int( floatx80 a )
3494 {
3495     flag aSign;
3496     int32 aExp;
3497     bits64 lastBitMask, roundBitsMask;
3498     int8 roundingMode;
3499     floatx80 z;
3500
3501     aExp = extractFloatx80Exp( a );
3502     if ( 0x403E <= aExp ) {
3503         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3504             return propagateFloatx80NaN( a, a );
3505         }
3506         return a;
3507     }
3508     if ( aExp < 0x3FFF ) {
3509         if (    ( aExp == 0 )
3510              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3511             return a;
3512         }
3513         float_exception_flags |= float_flag_inexact;
3514         aSign = extractFloatx80Sign( a );
3515         switch ( float_rounding_mode ) {
3516          case float_round_nearest_even:
3517             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3518                ) {
3519                 return
3520                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3521             }
3522             break;
3523          case float_round_to_zero:
3524             break;
3525          case float_round_down:
3526             return
3527                   aSign ?
3528                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3529                 : packFloatx80( 0, 0, 0 );
3530          case float_round_up:
3531             return
3532                   aSign ? packFloatx80( 1, 0, 0 )
3533                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3534         }
3535         return packFloatx80( aSign, 0, 0 );
3536     }
3537     lastBitMask = 1;
3538     lastBitMask <<= 0x403E - aExp;
3539     roundBitsMask = lastBitMask - 1;
3540     z = a;
3541     roundingMode = float_rounding_mode;
3542     if ( roundingMode == float_round_nearest_even ) {
3543         z.low += lastBitMask>>1;
3544         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3545     }
3546     else if ( roundingMode != float_round_to_zero ) {
3547         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3548             z.low += roundBitsMask;
3549         }
3550     }
3551     z.low &= ~ roundBitsMask;
3552     if ( z.low == 0 ) {
3553         ++z.high;
3554         z.low = LIT64( 0x8000000000000000 );
3555     }
3556     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3557     return z;
3558
3559 }
3560
3561 /*
3562 -------------------------------------------------------------------------------
3563 Returns the result of adding the absolute values of the extended double-
3564 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3565 negated before being returned.  `zSign' is ignored if the result is a NaN.
3566 The addition is performed according to the IEC/IEEE Standard for Binary
3567 Floating-Point Arithmetic.
3568 -------------------------------------------------------------------------------
3569 */
3570 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3571 {
3572     int32 aExp, bExp, zExp;
3573     bits64 aSig, bSig, zSig0, zSig1;
3574     int32 expDiff;
3575
3576     aSig = extractFloatx80Frac( a );
3577     aExp = extractFloatx80Exp( a );
3578     bSig = extractFloatx80Frac( b );
3579     bExp = extractFloatx80Exp( b );
3580     expDiff = aExp - bExp;
3581     if ( 0 < expDiff ) {
3582         if ( aExp == 0x7FFF ) {
3583             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3584             return a;
3585         }
3586         if ( bExp == 0 ) --expDiff;
3587         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3588         zExp = aExp;
3589     }
3590     else if ( expDiff < 0 ) {
3591         if ( bExp == 0x7FFF ) {
3592             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3593             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3594         }
3595         if ( aExp == 0 ) ++expDiff;
3596         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3597         zExp = bExp;
3598     }
3599     else {
3600         if ( aExp == 0x7FFF ) {
3601             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3602                 return propagateFloatx80NaN( a, b );
3603             }
3604             return a;
3605         }
3606         zSig1 = 0;
3607         zSig0 = aSig + bSig;
3608         if ( aExp == 0 ) {
3609             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3610             goto roundAndPack;
3611         }
3612         zExp = aExp;
3613         goto shiftRight1;
3614     }
3615     zSig0 = aSig + bSig;
3616     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3617  shiftRight1:
3618     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3619     zSig0 |= LIT64( 0x8000000000000000 );
3620     ++zExp;
3621  roundAndPack:
3622     return
3623         roundAndPackFloatx80(
3624             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3625
3626 }
3627
3628 /*
3629 -------------------------------------------------------------------------------
3630 Returns the result of subtracting the absolute values of the extended
3631 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3632 difference is negated before being returned.  `zSign' is ignored if the
3633 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3634 Standard for Binary Floating-Point Arithmetic.
3635 -------------------------------------------------------------------------------
3636 */
3637 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3638 {
3639     int32 aExp, bExp, zExp;
3640     bits64 aSig, bSig, zSig0, zSig1;
3641     int32 expDiff;
3642     floatx80 z;
3643
3644     aSig = extractFloatx80Frac( a );
3645     aExp = extractFloatx80Exp( a );
3646     bSig = extractFloatx80Frac( b );
3647     bExp = extractFloatx80Exp( b );
3648     expDiff = aExp - bExp;
3649     if ( 0 < expDiff ) goto aExpBigger;
3650     if ( expDiff < 0 ) goto bExpBigger;
3651     if ( aExp == 0x7FFF ) {
3652         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3653             return propagateFloatx80NaN( a, b );
3654         }
3655         float_raise( float_flag_invalid );
3656         z.low = floatx80_default_nan_low;
3657         z.high = floatx80_default_nan_high;
3658         return z;
3659     }
3660     if ( aExp == 0 ) {
3661         aExp = 1;
3662         bExp = 1;
3663     }
3664     zSig1 = 0;
3665     if ( bSig < aSig ) goto aBigger;
3666     if ( aSig < bSig ) goto bBigger;
3667     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3668  bExpBigger:
3669     if ( bExp == 0x7FFF ) {
3670         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3671         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672     }
3673     if ( aExp == 0 ) ++expDiff;
3674     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3675  bBigger:
3676     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3677     zExp = bExp;
3678     zSign ^= 1;
3679     goto normalizeRoundAndPack;
3680  aExpBigger:
3681     if ( aExp == 0x7FFF ) {
3682         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3683         return a;
3684     }
3685     if ( bExp == 0 ) --expDiff;
3686     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3687  aBigger:
3688     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3689     zExp = aExp;
3690  normalizeRoundAndPack:
3691     return
3692         normalizeRoundAndPackFloatx80(
3693             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3694
3695 }
3696
3697 /*
3698 -------------------------------------------------------------------------------
3699 Returns the result of adding the extended double-precision floating-point
3700 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3701 Standard for Binary Floating-Point Arithmetic.
3702 -------------------------------------------------------------------------------
3703 */
3704 floatx80 floatx80_add( floatx80 a, floatx80 b )
3705 {
3706     flag aSign, bSign;
3707
3708     aSign = extractFloatx80Sign( a );
3709     bSign = extractFloatx80Sign( b );
3710     if ( aSign == bSign ) {
3711         return addFloatx80Sigs( a, b, aSign );
3712     }
3713     else {
3714         return subFloatx80Sigs( a, b, aSign );
3715     }
3716
3717 }
3718
3719 /*
3720 -------------------------------------------------------------------------------
3721 Returns the result of subtracting the extended double-precision floating-
3722 point values `a' and `b'.  The operation is performed according to the
3723 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3724 -------------------------------------------------------------------------------
3725 */
3726 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3727 {
3728     flag aSign, bSign;
3729
3730     aSign = extractFloatx80Sign( a );
3731     bSign = extractFloatx80Sign( b );
3732     if ( aSign == bSign ) {
3733         return subFloatx80Sigs( a, b, aSign );
3734     }
3735     else {
3736         return addFloatx80Sigs( a, b, aSign );
3737     }
3738
3739 }
3740
3741 /*
3742 -------------------------------------------------------------------------------
3743 Returns the result of multiplying the extended double-precision floating-
3744 point values `a' and `b'.  The operation is performed according to the
3745 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3746 -------------------------------------------------------------------------------
3747 */
3748 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3749 {
3750     flag aSign, bSign, zSign;
3751     int32 aExp, bExp, zExp;
3752     bits64 aSig, bSig, zSig0, zSig1;
3753     floatx80 z;
3754
3755     aSig = extractFloatx80Frac( a );
3756     aExp = extractFloatx80Exp( a );
3757     aSign = extractFloatx80Sign( a );
3758     bSig = extractFloatx80Frac( b );
3759     bExp = extractFloatx80Exp( b );
3760     bSign = extractFloatx80Sign( b );
3761     zSign = aSign ^ bSign;
3762     if ( aExp == 0x7FFF ) {
3763         if (    (bits64) ( aSig<<1 )
3764              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3765             return propagateFloatx80NaN( a, b );
3766         }
3767         if ( ( bExp | bSig ) == 0 ) goto invalid;
3768         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3769     }
3770     if ( bExp == 0x7FFF ) {
3771         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3772         if ( ( aExp | aSig ) == 0 ) {
3773  invalid:
3774             float_raise( float_flag_invalid );
3775             z.low = floatx80_default_nan_low;
3776             z.high = floatx80_default_nan_high;
3777             return z;
3778         }
3779         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3780     }
3781     if ( aExp == 0 ) {
3782         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3783         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3784     }
3785     if ( bExp == 0 ) {
3786         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3787         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3788     }
3789     zExp = aExp + bExp - 0x3FFE;
3790     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3791     if ( 0 < (sbits64) zSig0 ) {
3792         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3793         --zExp;
3794     }
3795     return
3796         roundAndPackFloatx80(
3797             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3798
3799 }
3800
3801 /*
3802 -------------------------------------------------------------------------------
3803 Returns the result of dividing the extended double-precision floating-point
3804 value `a' by the corresponding value `b'.  The operation is performed
3805 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3806 -------------------------------------------------------------------------------
3807 */
3808 floatx80 floatx80_div( floatx80 a, floatx80 b )
3809 {
3810     flag aSign, bSign, zSign;
3811     int32 aExp, bExp, zExp;
3812     bits64 aSig, bSig, zSig0, zSig1;
3813     bits64 rem0, rem1, rem2, term0, term1, term2;
3814     floatx80 z;
3815
3816     aSig = extractFloatx80Frac( a );
3817     aExp = extractFloatx80Exp( a );
3818     aSign = extractFloatx80Sign( a );
3819     bSig = extractFloatx80Frac( b );
3820     bExp = extractFloatx80Exp( b );
3821     bSign = extractFloatx80Sign( b );
3822     zSign = aSign ^ bSign;
3823     if ( aExp == 0x7FFF ) {
3824         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3825         if ( bExp == 0x7FFF ) {
3826             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3827             goto invalid;
3828         }
3829         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3830     }
3831     if ( bExp == 0x7FFF ) {
3832         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3833         return packFloatx80( zSign, 0, 0 );
3834     }
3835     if ( bExp == 0 ) {
3836         if ( bSig == 0 ) {
3837             if ( ( aExp | aSig ) == 0 ) {
3838  invalid:
3839                 float_raise( float_flag_invalid );
3840                 z.low = floatx80_default_nan_low;
3841                 z.high = floatx80_default_nan_high;
3842                 return z;
3843             }
3844             float_raise( float_flag_divbyzero );
3845             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3846         }
3847         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3848     }
3849     if ( aExp == 0 ) {
3850         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3851         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3852     }
3853     zExp = aExp - bExp + 0x3FFE;
3854     rem1 = 0;
3855     if ( bSig <= aSig ) {
3856         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3857         ++zExp;
3858     }
3859     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3860     mul64To128( bSig, zSig0, &term0, &term1 );
3861     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3862     while ( (sbits64) rem0 < 0 ) {
3863         --zSig0;
3864         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3865     }
3866     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3867     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3868         mul64To128( bSig, zSig1, &term1, &term2 );
3869         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3870         while ( (sbits64) rem1 < 0 ) {
3871             --zSig1;
3872             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3873         }
3874         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3875     }
3876     return
3877         roundAndPackFloatx80(
3878             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3879
3880 }
3881
3882 /*
3883 -------------------------------------------------------------------------------
3884 Returns the remainder of the extended double-precision floating-point value
3885 `a' with respect to the corresponding value `b'.  The operation is performed
3886 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3887 -------------------------------------------------------------------------------
3888 */
3889 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3890 {
3891     flag aSign, bSign, zSign;
3892     int32 aExp, bExp, expDiff;
3893     bits64 aSig0, aSig1, bSig;
3894     bits64 q, term0, term1, alternateASig0, alternateASig1;
3895     floatx80 z;
3896
3897     aSig0 = extractFloatx80Frac( a );
3898     aExp = extractFloatx80Exp( a );
3899     aSign = extractFloatx80Sign( a );
3900     bSig = extractFloatx80Frac( b );
3901     bExp = extractFloatx80Exp( b );
3902     bSign = extractFloatx80Sign( b );
3903     if ( aExp == 0x7FFF ) {
3904         if (    (bits64) ( aSig0<<1 )
3905              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3906             return propagateFloatx80NaN( a, b );
3907         }
3908         goto invalid;
3909     }
3910     if ( bExp == 0x7FFF ) {
3911         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3912         return a;
3913     }
3914     if ( bExp == 0 ) {
3915         if ( bSig == 0 ) {
3916  invalid:
3917             float_raise( float_flag_invalid );
3918             z.low = floatx80_default_nan_low;
3919             z.high = floatx80_default_nan_high;
3920             return z;
3921         }
3922         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3923     }
3924     if ( aExp == 0 ) {
3925         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3926         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3927     }
3928     bSig |= LIT64( 0x8000000000000000 );
3929     zSign = aSign;
3930     expDiff = aExp - bExp;
3931     aSig1 = 0;
3932     if ( expDiff < 0 ) {
3933         if ( expDiff < -1 ) return a;
3934         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3935         expDiff = 0;
3936     }
3937     q = ( bSig <= aSig0 );
3938     if ( q ) aSig0 -= bSig;
3939     expDiff -= 64;
3940     while ( 0 < expDiff ) {
3941         q = estimateDiv128To64( aSig0, aSig1, bSig );
3942         q = ( 2 < q ) ? q - 2 : 0;
3943         mul64To128( bSig, q, &term0, &term1 );
3944         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3945         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3946         expDiff -= 62;
3947     }
3948     expDiff += 64;
3949     if ( 0 < expDiff ) {
3950         q = estimateDiv128To64( aSig0, aSig1, bSig );
3951         q = ( 2 < q ) ? q - 2 : 0;
3952         q >>= 64 - expDiff;
3953         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3954         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3955         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3956         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3957             ++q;
3958             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3959         }
3960     }
3961     else {
3962         term1 = 0;
3963         term0 = bSig;
3964     }
3965     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3966     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3967          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3968               && ( q & 1 ) )
3969        ) {
3970         aSig0 = alternateASig0;
3971         aSig1 = alternateASig1;
3972         zSign = ! zSign;
3973     }
3974     return
3975         normalizeRoundAndPackFloatx80(
3976             80, zSign, bExp + expDiff, aSig0, aSig1 );
3977
3978 }
3979
3980 /*
3981 -------------------------------------------------------------------------------
3982 Returns the square root of the extended double-precision floating-point
3983 value `a'.  The operation is performed according to the IEC/IEEE Standard
3984 for Binary Floating-Point Arithmetic.
3985 -------------------------------------------------------------------------------
3986 */
3987 floatx80 floatx80_sqrt( floatx80 a )
3988 {
3989     flag aSign;
3990     int32 aExp, zExp;
3991     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3992     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3993     floatx80 z;
3994
3995     aSig0 = extractFloatx80Frac( a );
3996     aExp = extractFloatx80Exp( a );
3997     aSign = extractFloatx80Sign( a );
3998     if ( aExp == 0x7FFF ) {
3999         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4000         if ( ! aSign ) return a;
4001         goto invalid;
4002     }
4003     if ( aSign ) {
4004         if ( ( aExp | aSig0 ) == 0 ) return a;
4005  invalid:
4006         float_raise( float_flag_invalid );
4007         z.low = floatx80_default_nan_low;
4008         z.high = floatx80_default_nan_high;
4009         return z;
4010     }
4011     if ( aExp == 0 ) {
4012         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4013         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4014     }
4015     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4016     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4017     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4018     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4019     doubleZSig0 = zSig0<<1;
4020     mul64To128( zSig0, zSig0, &term0, &term1 );
4021     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4022     while ( (sbits64) rem0 < 0 ) {
4023         --zSig0;
4024         doubleZSig0 -= 2;
4025         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4026     }
4027     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4028     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4029         if ( zSig1 == 0 ) zSig1 = 1;
4030         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4031         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4032         mul64To128( zSig1, zSig1, &term2, &term3 );
4033         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4034         while ( (sbits64) rem1 < 0 ) {
4035             --zSig1;
4036             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4037             term3 |= 1;
4038             term2 |= doubleZSig0;
4039             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4040         }
4041         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4042     }
4043     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4044     zSig0 |= doubleZSig0;
4045     return
4046         roundAndPackFloatx80(
4047             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4048
4049 }
4050
4051 /*
4052 -------------------------------------------------------------------------------
4053 Returns 1 if the extended double-precision floating-point value `a' is
4054 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4055 performed according to the IEC/IEEE Standard for Binary Floating-Point
4056 Arithmetic.
4057 -------------------------------------------------------------------------------
4058 */
4059 flag floatx80_eq( floatx80 a, floatx80 b )
4060 {
4061
4062     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4063               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4064          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4065               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4066        ) {
4067         if (    floatx80_is_signaling_nan( a )
4068              || floatx80_is_signaling_nan( b ) ) {
4069             float_raise( float_flag_invalid );
4070         }
4071         return 0;
4072     }
4073     return
4074            ( a.low == b.low )
4075         && (    ( a.high == b.high )
4076              || (    ( a.low == 0 )
4077                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4078            );
4079
4080 }
4081
4082 /*
4083 -------------------------------------------------------------------------------
4084 Returns 1 if the extended double-precision floating-point value `a' is
4085 less than or equal to the corresponding value `b', and 0 otherwise.  The
4086 comparison is performed according to the IEC/IEEE Standard for Binary
4087 Floating-Point Arithmetic.
4088 -------------------------------------------------------------------------------
4089 */
4090 flag floatx80_le( floatx80 a, floatx80 b )
4091 {
4092     flag aSign, bSign;
4093
4094     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4095               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4096          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4097               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4098        ) {
4099         float_raise( float_flag_invalid );
4100         return 0;
4101     }
4102     aSign = extractFloatx80Sign( a );
4103     bSign = extractFloatx80Sign( b );
4104     if ( aSign != bSign ) {
4105         return
4106                aSign
4107             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4108                  == 0 );
4109     }
4110     return
4111           aSign ? le128( b.high, b.low, a.high, a.low )
4112         : le128( a.high, a.low, b.high, b.low );
4113
4114 }
4115
4116 /*
4117 -------------------------------------------------------------------------------
4118 Returns 1 if the extended double-precision floating-point value `a' is
4119 less than the corresponding value `b', and 0 otherwise.  The comparison
4120 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4121 Arithmetic.
4122 -------------------------------------------------------------------------------
4123 */
4124 flag floatx80_lt( floatx80 a, floatx80 b )
4125 {
4126     flag aSign, bSign;
4127
4128     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4129               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4130          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4131               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4132        ) {
4133         float_raise( float_flag_invalid );
4134         return 0;
4135     }
4136     aSign = extractFloatx80Sign( a );
4137     bSign = extractFloatx80Sign( b );
4138     if ( aSign != bSign ) {
4139         return
4140                aSign
4141             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4142                  != 0 );
4143     }
4144     return
4145           aSign ? lt128( b.high, b.low, a.high, a.low )
4146         : lt128( a.high, a.low, b.high, b.low );
4147
4148 }
4149
4150 /*
4151 -------------------------------------------------------------------------------
4152 Returns 1 if the extended double-precision floating-point value `a' is equal
4153 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4154 raised if either operand is a NaN.  Otherwise, the comparison is performed
4155 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4156 -------------------------------------------------------------------------------
4157 */
4158 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4159 {
4160
4161     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4162               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4163          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4164               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4165        ) {
4166         float_raise( float_flag_invalid );
4167         return 0;
4168     }
4169     return
4170            ( a.low == b.low )
4171         && (    ( a.high == b.high )
4172              || (    ( a.low == 0 )
4173                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4174            );
4175
4176 }
4177
4178 /*
4179 -------------------------------------------------------------------------------
4180 Returns 1 if the extended double-precision floating-point value `a' is less
4181 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4182 do not cause an exception.  Otherwise, the comparison is performed according
4183 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184 -------------------------------------------------------------------------------
4185 */
4186 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4187 {
4188     flag aSign, bSign;
4189
4190     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4191               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4192          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4193               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4194        ) {
4195         if (    floatx80_is_signaling_nan( a )
4196              || floatx80_is_signaling_nan( b ) ) {
4197             float_raise( float_flag_invalid );
4198         }
4199         return 0;
4200     }
4201     aSign = extractFloatx80Sign( a );
4202     bSign = extractFloatx80Sign( b );
4203     if ( aSign != bSign ) {
4204         return
4205                aSign
4206             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4207                  == 0 );
4208     }
4209     return
4210           aSign ? le128( b.high, b.low, a.high, a.low )
4211         : le128( a.high, a.low, b.high, b.low );
4212
4213 }
4214
4215 /*
4216 -------------------------------------------------------------------------------
4217 Returns 1 if the extended double-precision floating-point value `a' is less
4218 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4219 an exception.  Otherwise, the comparison is performed according to the
4220 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4221 -------------------------------------------------------------------------------
4222 */
4223 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4224 {
4225     flag aSign, bSign;
4226
4227     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4228               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4229          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4230               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4231        ) {
4232         if (    floatx80_is_signaling_nan( a )
4233              || floatx80_is_signaling_nan( b ) ) {
4234             float_raise( float_flag_invalid );
4235         }
4236         return 0;
4237     }
4238     aSign = extractFloatx80Sign( a );
4239     bSign = extractFloatx80Sign( b );
4240     if ( aSign != bSign ) {
4241         return
4242                aSign
4243             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4244                  != 0 );
4245     }
4246     return
4247           aSign ? lt128( b.high, b.low, a.high, a.low )
4248         : lt128( a.high, a.low, b.high, b.low );
4249
4250 }
4251
4252 #endif
4253
4254 #ifdef FLOAT128
4255
4256 /*
4257 -------------------------------------------------------------------------------
4258 Returns the result of converting the quadruple-precision floating-point
4259 value `a' to the 32-bit two's complement integer format.  The conversion
4260 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4261 Arithmetic---which means in particular that the conversion is rounded
4262 according to the current rounding mode.  If `a' is a NaN, the largest
4263 positive integer is returned.  Otherwise, if the conversion overflows, the
4264 largest integer with the same sign as `a' is returned.
4265 -------------------------------------------------------------------------------
4266 */
4267 int32 float128_to_int32( float128 a )
4268 {
4269     flag aSign;
4270     int32 aExp, shiftCount;
4271     bits64 aSig0, aSig1;
4272
4273     aSig1 = extractFloat128Frac1( a );
4274     aSig0 = extractFloat128Frac0( a );
4275     aExp = extractFloat128Exp( a );
4276     aSign = extractFloat128Sign( a );
4277     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4278     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4279     aSig0 |= ( aSig1 != 0 );
4280     shiftCount = 0x4028 - aExp;
4281     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4282     return roundAndPackInt32( aSign, aSig0 );
4283
4284 }
4285
4286 /*
4287 -------------------------------------------------------------------------------
4288 Returns the result of converting the quadruple-precision floating-point
4289 value `a' to the 32-bit two's complement integer format.  The conversion
4290 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4291 Arithmetic, except that the conversion is always rounded toward zero.  If
4292 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4293 conversion overflows, the largest integer with the same sign as `a' is
4294 returned.
4295 -------------------------------------------------------------------------------
4296 */
4297 int32 float128_to_int32_round_to_zero( float128 a )
4298 {
4299     flag aSign;
4300     int32 aExp, shiftCount;
4301     bits64 aSig0, aSig1, savedASig;
4302     int32 z;
4303
4304     aSig1 = extractFloat128Frac1( a );
4305     aSig0 = extractFloat128Frac0( a );
4306     aExp = extractFloat128Exp( a );
4307     aSign = extractFloat128Sign( a );
4308     aSig0 |= ( aSig1 != 0 );
4309     if ( 0x401E < aExp ) {
4310         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4311         goto invalid;
4312     }
4313     else if ( aExp < 0x3FFF ) {
4314         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4315         return 0;
4316     }
4317     aSig0 |= LIT64( 0x0001000000000000 );
4318     shiftCount = 0x402F - aExp;
4319     savedASig = aSig0;
4320     aSig0 >>= shiftCount;
4321     z = aSig0;
4322     if ( aSign ) z = - z;
4323     if ( ( z < 0 ) ^ aSign ) {
4324  invalid:
4325         float_raise( float_flag_invalid );
4326         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4327     }
4328     if ( ( aSig0<<shiftCount ) != savedASig ) {
4329         float_exception_flags |= float_flag_inexact;
4330     }
4331     return z;
4332
4333 }
4334
4335 /*
4336 -------------------------------------------------------------------------------
4337 Returns the result of converting the quadruple-precision floating-point
4338 value `a' to the 64-bit two's complement integer format.  The conversion
4339 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4340 Arithmetic---which means in particular that the conversion is rounded
4341 according to the current rounding mode.  If `a' is a NaN, the largest
4342 positive integer is returned.  Otherwise, if the conversion overflows, the
4343 largest integer with the same sign as `a' is returned.
4344 -------------------------------------------------------------------------------
4345 */
4346 int64 float128_to_int64( float128 a )
4347 {
4348     flag aSign;
4349     int32 aExp, shiftCount;
4350     bits64 aSig0, aSig1;
4351
4352     aSig1 = extractFloat128Frac1( a );
4353     aSig0 = extractFloat128Frac0( a );
4354     aExp = extractFloat128Exp( a );
4355     aSign = extractFloat128Sign( a );
4356     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4357     shiftCount = 0x402F - aExp;
4358     if ( shiftCount <= 0 ) {
4359         if ( 0x403E < aExp ) {
4360             float_raise( float_flag_invalid );
4361             if (    ! aSign
4362                  || (    ( aExp == 0x7FFF )
4363                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4364                     )
4365                ) {
4366                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4367             }
4368             return (sbits64) LIT64( 0x8000000000000000 );
4369         }
4370         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4371     }
4372     else {
4373         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4374     }
4375     return roundAndPackInt64( aSign, aSig0, aSig1 );
4376
4377 }
4378
4379 /*
4380 -------------------------------------------------------------------------------
4381 Returns the result of converting the quadruple-precision floating-point
4382 value `a' to the 64-bit two's complement integer format.  The conversion
4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 Arithmetic, except that the conversion is always rounded toward zero.
4385 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4386 the conversion overflows, the largest integer with the same sign as `a' is
4387 returned.
4388 -------------------------------------------------------------------------------
4389 */
4390 int64 float128_to_int64_round_to_zero( float128 a )
4391 {
4392     flag aSign;
4393     int32 aExp, shiftCount;
4394     bits64 aSig0, aSig1;
4395     int64 z;
4396
4397     aSig1 = extractFloat128Frac1( a );
4398     aSig0 = extractFloat128Frac0( a );
4399     aExp = extractFloat128Exp( a );
4400     aSign = extractFloat128Sign( a );
4401     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4402     shiftCount = aExp - 0x402F;
4403     if ( 0 < shiftCount ) {
4404         if ( 0x403E <= aExp ) {
4405             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4406             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4407                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4408                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4409             }
4410             else {
4411                 float_raise( float_flag_invalid );
4412                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4413                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4414                 }
4415             }
4416             return (sbits64) LIT64( 0x8000000000000000 );
4417         }
4418         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4419         if ( (bits64) ( aSig1<<shiftCount ) ) {
4420             float_exception_flags |= float_flag_inexact;
4421         }
4422     }
4423     else {
4424         if ( aExp < 0x3FFF ) {
4425             if ( aExp | aSig0 | aSig1 ) {
4426                 float_exception_flags |= float_flag_inexact;
4427             }
4428             return 0;
4429         }
4430         z = aSig0>>( - shiftCount );
4431         if (    aSig1
4432              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4433             float_exception_flags |= float_flag_inexact;
4434         }
4435     }
4436     if ( aSign ) z = - z;
4437     return z;
4438
4439 }
4440
4441 /*
4442 -------------------------------------------------------------------------------
4443 Returns the result of converting the quadruple-precision floating-point
4444 value `a' to the single-precision floating-point format.  The conversion
4445 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4446 Arithmetic.
4447 -------------------------------------------------------------------------------
4448 */
4449 float32 float128_to_float32( float128 a )
4450 {
4451     flag aSign;
4452     int32 aExp;
4453     bits64 aSig0, aSig1;
4454     bits32 zSig;
4455
4456     aSig1 = extractFloat128Frac1( a );
4457     aSig0 = extractFloat128Frac0( a );
4458     aExp = extractFloat128Exp( a );
4459     aSign = extractFloat128Sign( a );
4460     if ( aExp == 0x7FFF ) {
4461         if ( aSig0 | aSig1 ) {
4462             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4463         }
4464         return packFloat32( aSign, 0xFF, 0 );
4465     }
4466     aSig0 |= ( aSig1 != 0 );
4467     shift64RightJamming( aSig0, 18, &aSig0 );
4468     zSig = aSig0;
4469     if ( aExp || zSig ) {
4470         zSig |= 0x40000000;
4471         aExp -= 0x3F81;
4472     }
4473     return roundAndPackFloat32( aSign, aExp, zSig );
4474
4475 }
4476
4477 /*
4478 -------------------------------------------------------------------------------
4479 Returns the result of converting the quadruple-precision floating-point
4480 value `a' to the double-precision floating-point format.  The conversion
4481 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4482 Arithmetic.
4483 -------------------------------------------------------------------------------
4484 */
4485 float64 float128_to_float64( float128 a )
4486 {
4487     flag aSign;
4488     int32 aExp;
4489     bits64 aSig0, aSig1;
4490
4491     aSig1 = extractFloat128Frac1( a );
4492     aSig0 = extractFloat128Frac0( a );
4493     aExp = extractFloat128Exp( a );
4494     aSign = extractFloat128Sign( a );
4495     if ( aExp == 0x7FFF ) {
4496         if ( aSig0 | aSig1 ) {
4497             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4498         }
4499         return packFloat64( aSign, 0x7FF, 0 );
4500     }
4501     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4502     aSig0 |= ( aSig1 != 0 );
4503     if ( aExp || aSig0 ) {
4504         aSig0 |= LIT64( 0x4000000000000000 );
4505         aExp -= 0x3C01;
4506     }
4507     return roundAndPackFloat64( aSign, aExp, aSig0 );
4508
4509 }
4510
4511 #ifdef FLOATX80
4512
4513 /*
4514 -------------------------------------------------------------------------------
4515 Returns the result of converting the quadruple-precision floating-point
4516 value `a' to the extended double-precision floating-point format.  The
4517 conversion is performed according to the IEC/IEEE Standard for Binary
4518 Floating-Point Arithmetic.
4519 -------------------------------------------------------------------------------
4520 */
4521 floatx80 float128_to_floatx80( float128 a )
4522 {
4523     flag aSign;
4524     int32 aExp;
4525     bits64 aSig0, aSig1;
4526
4527     aSig1 = extractFloat128Frac1( a );
4528     aSig0 = extractFloat128Frac0( a );
4529     aExp = extractFloat128Exp( a );
4530     aSign = extractFloat128Sign( a );
4531     if ( aExp == 0x7FFF ) {
4532         if ( aSig0 | aSig1 ) {
4533             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4534         }
4535         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4536     }
4537     if ( aExp == 0 ) {
4538         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4539         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4540     }
4541     else {
4542         aSig0 |= LIT64( 0x0001000000000000 );
4543     }
4544     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4545     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4546
4547 }
4548
4549 #endif
4550
4551 /*
4552 -------------------------------------------------------------------------------
4553 Rounds the quadruple-precision floating-point value `a' to an integer, and
4554 returns the result as a quadruple-precision floating-point value.  The
4555 operation is performed according to the IEC/IEEE Standard for Binary
4556 Floating-Point Arithmetic.
4557 -------------------------------------------------------------------------------
4558 */
4559 float128 float128_round_to_int( float128 a )
4560 {
4561     flag aSign;
4562     int32 aExp;
4563     bits64 lastBitMask, roundBitsMask;
4564     int8 roundingMode;
4565     float128 z;
4566
4567     aExp = extractFloat128Exp( a );
4568     if ( 0x402F <= aExp ) {
4569         if ( 0x406F <= aExp ) {
4570             if (    ( aExp == 0x7FFF )
4571                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4572                ) {
4573                 return propagateFloat128NaN( a, a );
4574             }
4575             return a;
4576         }
4577         lastBitMask = 1;
4578         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4579         roundBitsMask = lastBitMask - 1;
4580         z = a;
4581         roundingMode = float_rounding_mode;
4582         if ( roundingMode == float_round_nearest_even ) {
4583             if ( lastBitMask ) {
4584                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4585                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4586             }
4587             else {
4588                 if ( (sbits64) z.low < 0 ) {
4589                     ++z.high;
4590                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4591                 }
4592             }
4593         }
4594         else if ( roundingMode != float_round_to_zero ) {
4595             if (   extractFloat128Sign( z )
4596                  ^ ( roundingMode == float_round_up ) ) {
4597                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4598             }
4599         }
4600         z.low &= ~ roundBitsMask;
4601     }
4602     else {
4603         if ( aExp < 0x3FFF ) {
4604             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4605             float_exception_flags |= float_flag_inexact;
4606             aSign = extractFloat128Sign( a );
4607             switch ( float_rounding_mode ) {
4608              case float_round_nearest_even:
4609                 if (    ( aExp == 0x3FFE )
4610                      && (   extractFloat128Frac0( a )
4611                           | extractFloat128Frac1( a ) )
4612                    ) {
4613                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4614                 }
4615                 break;
4616              case float_round_to_zero:
4617                 break;
4618              case float_round_down:
4619                 return
4620                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4621                     : packFloat128( 0, 0, 0, 0 );
4622              case float_round_up:
4623                 return
4624                       aSign ? packFloat128( 1, 0, 0, 0 )
4625                     : packFloat128( 0, 0x3FFF, 0, 0 );
4626             }
4627             return packFloat128( aSign, 0, 0, 0 );
4628         }
4629         lastBitMask = 1;
4630         lastBitMask <<= 0x402F - aExp;
4631         roundBitsMask = lastBitMask - 1;
4632         z.low = 0;
4633         z.high = a.high;
4634         roundingMode = float_rounding_mode;
4635         if ( roundingMode == float_round_nearest_even ) {
4636             z.high += lastBitMask>>1;
4637             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4638                 z.high &= ~ lastBitMask;
4639             }
4640         }
4641         else if ( roundingMode != float_round_to_zero ) {
4642             if (   extractFloat128Sign( z )
4643                  ^ ( roundingMode == float_round_up ) ) {
4644                 z.high |= ( a.low != 0 );
4645                 z.high += roundBitsMask;
4646             }
4647         }
4648         z.high &= ~ roundBitsMask;
4649     }
4650     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4651         float_exception_flags |= float_flag_inexact;
4652     }
4653     return z;
4654
4655 }
4656
4657 /*
4658 -------------------------------------------------------------------------------
4659 Returns the result of adding the absolute values of the quadruple-precision
4660 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4661 before being returned.  `zSign' is ignored if the result is a NaN.
4662 The addition is performed according to the IEC/IEEE Standard for Binary
4663 Floating-Point Arithmetic.
4664 -------------------------------------------------------------------------------
4665 */
4666 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4667 {
4668     int32 aExp, bExp, zExp;
4669     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4670     int32 expDiff;
4671
4672     aSig1 = extractFloat128Frac1( a );
4673     aSig0 = extractFloat128Frac0( a );
4674     aExp = extractFloat128Exp( a );
4675     bSig1 = extractFloat128Frac1( b );
4676     bSig0 = extractFloat128Frac0( b );
4677     bExp = extractFloat128Exp( b );
4678     expDiff = aExp - bExp;
4679     if ( 0 < expDiff ) {
4680         if ( aExp == 0x7FFF ) {
4681             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4682             return a;
4683         }
4684         if ( bExp == 0 ) {
4685             --expDiff;
4686         }
4687         else {
4688             bSig0 |= LIT64( 0x0001000000000000 );
4689         }
4690         shift128ExtraRightJamming(
4691             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4692         zExp = aExp;
4693     }
4694     else if ( expDiff < 0 ) {
4695         if ( bExp == 0x7FFF ) {
4696             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4697             return packFloat128( zSign, 0x7FFF, 0, 0 );
4698         }
4699         if ( aExp == 0 ) {
4700             ++expDiff;
4701         }
4702         else {
4703             aSig0 |= LIT64( 0x0001000000000000 );
4704         }
4705         shift128ExtraRightJamming(
4706             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4707         zExp = bExp;
4708     }
4709     else {
4710         if ( aExp == 0x7FFF ) {
4711             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4712                 return propagateFloat128NaN( a, b );
4713             }
4714             return a;
4715         }
4716         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4717         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4718         zSig2 = 0;
4719         zSig0 |= LIT64( 0x0002000000000000 );
4720         zExp = aExp;
4721         goto shiftRight1;
4722     }
4723     aSig0 |= LIT64( 0x0001000000000000 );
4724     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4725     --zExp;
4726     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4727     ++zExp;
4728  shiftRight1:
4729     shift128ExtraRightJamming(
4730         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4731  roundAndPack:
4732     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4733
4734 }
4735
4736 /*
4737 -------------------------------------------------------------------------------
4738 Returns the result of subtracting the absolute values of the quadruple-
4739 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4740 difference is negated before being returned.  `zSign' is ignored if the
4741 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4742 Standard for Binary Floating-Point Arithmetic.
4743 -------------------------------------------------------------------------------
4744 */
4745 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4746 {
4747     int32 aExp, bExp, zExp;
4748     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4749     int32 expDiff;
4750     float128 z;
4751
4752     aSig1 = extractFloat128Frac1( a );
4753     aSig0 = extractFloat128Frac0( a );
4754     aExp = extractFloat128Exp( a );
4755     bSig1 = extractFloat128Frac1( b );
4756     bSig0 = extractFloat128Frac0( b );
4757     bExp = extractFloat128Exp( b );
4758     expDiff = aExp - bExp;
4759     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4760     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4761     if ( 0 < expDiff ) goto aExpBigger;
4762     if ( expDiff < 0 ) goto bExpBigger;
4763     if ( aExp == 0x7FFF ) {
4764         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4765             return propagateFloat128NaN( a, b );
4766         }
4767         float_raise( float_flag_invalid );
4768         z.low = float128_default_nan_low;
4769         z.high = float128_default_nan_high;
4770         return z;
4771     }
4772     if ( aExp == 0 ) {
4773         aExp = 1;
4774         bExp = 1;
4775     }
4776     if ( bSig0 < aSig0 ) goto aBigger;
4777     if ( aSig0 < bSig0 ) goto bBigger;
4778     if ( bSig1 < aSig1 ) goto aBigger;
4779     if ( aSig1 < bSig1 ) goto bBigger;
4780     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4781  bExpBigger:
4782     if ( bExp == 0x7FFF ) {
4783         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4784         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4785     }
4786     if ( aExp == 0 ) {
4787         ++expDiff;
4788     }
4789     else {
4790         aSig0 |= LIT64( 0x4000000000000000 );
4791     }
4792     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4793     bSig0 |= LIT64( 0x4000000000000000 );
4794  bBigger:
4795     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4796     zExp = bExp;
4797     zSign ^= 1;
4798     goto normalizeRoundAndPack;
4799  aExpBigger:
4800     if ( aExp == 0x7FFF ) {
4801         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4802         return a;
4803     }
4804     if ( bExp == 0 ) {
4805         --expDiff;
4806     }
4807     else {
4808         bSig0 |= LIT64( 0x4000000000000000 );
4809     }
4810     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4811     aSig0 |= LIT64( 0x4000000000000000 );
4812  aBigger:
4813     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814     zExp = aExp;
4815  normalizeRoundAndPack:
4816     --zExp;
4817     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4818
4819 }
4820
4821 /*
4822 -------------------------------------------------------------------------------
4823 Returns the result of adding the quadruple-precision floating-point values
4824 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4825 for Binary Floating-Point Arithmetic.
4826 -------------------------------------------------------------------------------
4827 */
4828 float128 float128_add( float128 a, float128 b )
4829 {
4830     flag aSign, bSign;
4831
4832     aSign = extractFloat128Sign( a );
4833     bSign = extractFloat128Sign( b );
4834     if ( aSign == bSign ) {
4835         return addFloat128Sigs( a, b, aSign );
4836     }
4837     else {
4838         return subFloat128Sigs( a, b, aSign );
4839     }
4840
4841 }
4842
4843 /*
4844 -------------------------------------------------------------------------------
4845 Returns the result of subtracting the quadruple-precision floating-point
4846 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4847 Standard for Binary Floating-Point Arithmetic.
4848 -------------------------------------------------------------------------------
4849 */
4850 float128 float128_sub( float128 a, float128 b )
4851 {
4852     flag aSign, bSign;
4853
4854     aSign = extractFloat128Sign( a );
4855     bSign = extractFloat128Sign( b );
4856     if ( aSign == bSign ) {
4857         return subFloat128Sigs( a, b, aSign );
4858     }
4859     else {
4860         return addFloat128Sigs( a, b, aSign );
4861     }
4862
4863 }
4864
4865 /*
4866 -------------------------------------------------------------------------------
4867 Returns the result of multiplying the quadruple-precision floating-point
4868 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4869 Standard for Binary Floating-Point Arithmetic.
4870 -------------------------------------------------------------------------------
4871 */
4872 float128 float128_mul( float128 a, float128 b )
4873 {
4874     flag aSign, bSign, zSign;
4875     int32 aExp, bExp, zExp;
4876     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4877     float128 z;
4878
4879     aSig1 = extractFloat128Frac1( a );
4880     aSig0 = extractFloat128Frac0( a );
4881     aExp = extractFloat128Exp( a );
4882     aSign = extractFloat128Sign( a );
4883     bSig1 = extractFloat128Frac1( b );
4884     bSig0 = extractFloat128Frac0( b );
4885     bExp = extractFloat128Exp( b );
4886     bSign = extractFloat128Sign( b );
4887     zSign = aSign ^ bSign;
4888     if ( aExp == 0x7FFF ) {
4889         if (    ( aSig0 | aSig1 )
4890              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4891             return propagateFloat128NaN( a, b );
4892         }
4893         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4894         return packFloat128( zSign, 0x7FFF, 0, 0 );
4895     }
4896     if ( bExp == 0x7FFF ) {
4897         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4898         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4899  invalid:
4900             float_raise( float_flag_invalid );
4901             z.low = float128_default_nan_low;
4902             z.high = float128_default_nan_high;
4903             return z;
4904         }
4905         return packFloat128( zSign, 0x7FFF, 0, 0 );
4906     }
4907     if ( aExp == 0 ) {
4908         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4909         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4910     }
4911     if ( bExp == 0 ) {
4912         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4913         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4914     }
4915     zExp = aExp + bExp - 0x4000;
4916     aSig0 |= LIT64( 0x0001000000000000 );
4917     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4918     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4919     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4920     zSig2 |= ( zSig3 != 0 );
4921     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4922         shift128ExtraRightJamming(
4923             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4924         ++zExp;
4925     }
4926     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4927
4928 }
4929
4930 /*
4931 -------------------------------------------------------------------------------
4932 Returns the result of dividing the quadruple-precision floating-point value
4933 `a' by the corresponding value `b'.  The operation is performed according to
4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935 -------------------------------------------------------------------------------
4936 */
4937 float128 float128_div( float128 a, float128 b )
4938 {
4939     flag aSign, bSign, zSign;
4940     int32 aExp, bExp, zExp;
4941     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4942     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4943     float128 z;
4944
4945     aSig1 = extractFloat128Frac1( a );
4946     aSig0 = extractFloat128Frac0( a );
4947     aExp = extractFloat128Exp( a );
4948     aSign = extractFloat128Sign( a );
4949     bSig1 = extractFloat128Frac1( b );
4950     bSig0 = extractFloat128Frac0( b );
4951     bExp = extractFloat128Exp( b );
4952     bSign = extractFloat128Sign( b );
4953     zSign = aSign ^ bSign;
4954     if ( aExp == 0x7FFF ) {
4955         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4956         if ( bExp == 0x7FFF ) {
4957             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4958             goto invalid;
4959         }
4960         return packFloat128( zSign, 0x7FFF, 0, 0 );
4961     }
4962     if ( bExp == 0x7FFF ) {
4963         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4964         return packFloat128( zSign, 0, 0, 0 );
4965     }
4966     if ( bExp == 0 ) {
4967         if ( ( bSig0 | bSig1 ) == 0 ) {
4968             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4969  invalid:
4970                 float_raise( float_flag_invalid );
4971                 z.low = float128_default_nan_low;
4972                 z.high = float128_default_nan_high;
4973                 return z;
4974             }
4975             float_raise( float_flag_divbyzero );
4976             return packFloat128( zSign, 0x7FFF, 0, 0 );
4977         }
4978         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4979     }
4980     if ( aExp == 0 ) {
4981         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4982         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4983     }
4984     zExp = aExp - bExp + 0x3FFD;
4985     shortShift128Left(
4986         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4987     shortShift128Left(
4988         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4989     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4990         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4991         ++zExp;
4992     }
4993     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4994     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4995     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4996     while ( (sbits64) rem0 < 0 ) {
4997         --zSig0;
4998         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4999     }
5000     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5001     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5002         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5003         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5004         while ( (sbits64) rem1 < 0 ) {
5005             --zSig1;
5006             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5007         }
5008         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5009     }
5010     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5011     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5012
5013 }
5014
5015 /*
5016 -------------------------------------------------------------------------------
5017 Returns the remainder of the quadruple-precision floating-point value `a'
5018 with respect to the corresponding value `b'.  The operation is performed
5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 -------------------------------------------------------------------------------
5021 */
5022 float128 float128_rem( float128 a, float128 b )
5023 {
5024     flag aSign, bSign, zSign;
5025     int32 aExp, bExp, expDiff;
5026     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5027     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5028     sbits64 sigMean0;
5029     float128 z;
5030
5031     aSig1 = extractFloat128Frac1( a );
5032     aSig0 = extractFloat128Frac0( a );
5033     aExp = extractFloat128Exp( a );
5034     aSign = extractFloat128Sign( a );
5035     bSig1 = extractFloat128Frac1( b );
5036     bSig0 = extractFloat128Frac0( b );
5037     bExp = extractFloat128Exp( b );
5038     bSign = extractFloat128Sign( b );
5039     if ( aExp == 0x7FFF ) {
5040         if (    ( aSig0 | aSig1 )
5041              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5042             return propagateFloat128NaN( a, b );
5043         }
5044         goto invalid;
5045     }
5046     if ( bExp == 0x7FFF ) {
5047         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5048         return a;
5049     }
5050     if ( bExp == 0 ) {
5051         if ( ( bSig0 | bSig1 ) == 0 ) {
5052  invalid:
5053             float_raise( float_flag_invalid );
5054             z.low = float128_default_nan_low;
5055             z.high = float128_default_nan_high;
5056             return z;
5057         }
5058         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5059     }
5060     if ( aExp == 0 ) {
5061         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5062         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5063     }
5064     expDiff = aExp - bExp;
5065     if ( expDiff < -1 ) return a;
5066     shortShift128Left(
5067         aSig0 | LIT64( 0x0001000000000000 ),
5068         aSig1,
5069         15 - ( expDiff < 0 ),
5070         &aSig0,
5071         &aSig1
5072     );
5073     shortShift128Left(
5074         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5075     q = le128( bSig0, bSig1, aSig0, aSig1 );
5076     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5077     expDiff -= 64;
5078     while ( 0 < expDiff ) {
5079         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5080         q = ( 4 < q ) ? q - 4 : 0;
5081         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5082         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5083         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5084         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5085         expDiff -= 61;
5086     }
5087     if ( -64 < expDiff ) {
5088         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089         q = ( 4 < q ) ? q - 4 : 0;
5090         q >>= - expDiff;
5091         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5092         expDiff += 52;
5093         if ( expDiff < 0 ) {
5094             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5095         }
5096         else {
5097             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5098         }
5099         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5100         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5101     }
5102     else {
5103         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5104         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5105     }
5106     do {
5107         alternateASig0 = aSig0;
5108         alternateASig1 = aSig1;
5109         ++q;
5110         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5111     } while ( 0 <= (sbits64) aSig0 );
5112     add128(
5113         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5114     if (    ( sigMean0 < 0 )
5115          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5116         aSig0 = alternateASig0;
5117         aSig1 = alternateASig1;
5118     }
5119     zSign = ( (sbits64) aSig0 < 0 );
5120     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5121     return
5122         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5123
5124 }
5125
5126 /*
5127 -------------------------------------------------------------------------------
5128 Returns the square root of the quadruple-precision floating-point value `a'.
5129 The operation is performed according to the IEC/IEEE Standard for Binary
5130 Floating-Point Arithmetic.
5131 -------------------------------------------------------------------------------
5132 */
5133 float128 float128_sqrt( float128 a )
5134 {
5135     flag aSign;
5136     int32 aExp, zExp;
5137     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5138     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5139     float128 z;
5140
5141     aSig1 = extractFloat128Frac1( a );
5142     aSig0 = extractFloat128Frac0( a );
5143     aExp = extractFloat128Exp( a );
5144     aSign = extractFloat128Sign( a );
5145     if ( aExp == 0x7FFF ) {
5146         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5147         if ( ! aSign ) return a;
5148         goto invalid;
5149     }
5150     if ( aSign ) {
5151         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5152  invalid:
5153         float_raise( float_flag_invalid );
5154         z.low = float128_default_nan_low;
5155         z.high = float128_default_nan_high;
5156         return z;
5157     }
5158     if ( aExp == 0 ) {
5159         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5160         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5161     }
5162     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5163     aSig0 |= LIT64( 0x0001000000000000 );
5164     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5165     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5166     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5167     doubleZSig0 = zSig0<<1;
5168     mul64To128( zSig0, zSig0, &term0, &term1 );
5169     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5170     while ( (sbits64) rem0 < 0 ) {
5171         --zSig0;
5172         doubleZSig0 -= 2;
5173         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5174     }
5175     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5176     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5177         if ( zSig1 == 0 ) zSig1 = 1;
5178         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5179         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5180         mul64To128( zSig1, zSig1, &term2, &term3 );
5181         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5182         while ( (sbits64) rem1 < 0 ) {
5183             --zSig1;
5184             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5185             term3 |= 1;
5186             term2 |= doubleZSig0;
5187             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5188         }
5189         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5190     }
5191     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5192     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5193
5194 }
5195
5196 /*
5197 -------------------------------------------------------------------------------
5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5199 the corresponding value `b', and 0 otherwise.  The comparison is performed
5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5201 -------------------------------------------------------------------------------
5202 */
5203 flag float128_eq( float128 a, float128 b )
5204 {
5205
5206     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5207               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5208          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5209               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5210        ) {
5211         if (    float128_is_signaling_nan( a )
5212              || float128_is_signaling_nan( b ) ) {
5213             float_raise( float_flag_invalid );
5214         }
5215         return 0;
5216     }
5217     return
5218            ( a.low == b.low )
5219         && (    ( a.high == b.high )
5220              || (    ( a.low == 0 )
5221                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5222            );
5223
5224 }
5225
5226 /*
5227 -------------------------------------------------------------------------------
5228 Returns 1 if the quadruple-precision floating-point value `a' is less than
5229 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5231 Arithmetic.
5232 -------------------------------------------------------------------------------
5233 */
5234 flag float128_le( float128 a, float128 b )
5235 {
5236     flag aSign, bSign;
5237
5238     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5239               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5240          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5241               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5242        ) {
5243         float_raise( float_flag_invalid );
5244         return 0;
5245     }
5246     aSign = extractFloat128Sign( a );
5247     bSign = extractFloat128Sign( b );
5248     if ( aSign != bSign ) {
5249         return
5250                aSign
5251             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5252                  == 0 );
5253     }
5254     return
5255           aSign ? le128( b.high, b.low, a.high, a.low )
5256         : le128( a.high, a.low, b.high, b.low );
5257
5258 }
5259
5260 /*
5261 -------------------------------------------------------------------------------
5262 Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 the corresponding value `b', and 0 otherwise.  The comparison is performed
5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 -------------------------------------------------------------------------------
5266 */
5267 flag float128_lt( float128 a, float128 b )
5268 {
5269     flag aSign, bSign;
5270
5271     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5272               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5273          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5274               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5275        ) {
5276         float_raise( float_flag_invalid );
5277         return 0;
5278     }
5279     aSign = extractFloat128Sign( a );
5280     bSign = extractFloat128Sign( b );
5281     if ( aSign != bSign ) {
5282         return
5283                aSign
5284             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5285                  != 0 );
5286     }
5287     return
5288           aSign ? lt128( b.high, b.low, a.high, a.low )
5289         : lt128( a.high, a.low, b.high, b.low );
5290
5291 }
5292
5293 /*
5294 -------------------------------------------------------------------------------
5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 the corresponding value `b', and 0 otherwise.  The invalid exception is
5297 raised if either operand is a NaN.  Otherwise, the comparison is performed
5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5299 -------------------------------------------------------------------------------
5300 */
5301 flag float128_eq_signaling( float128 a, float128 b )
5302 {
5303
5304     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5305               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5306          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5307               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5308        ) {
5309         float_raise( float_flag_invalid );
5310         return 0;
5311     }
5312     return
5313            ( a.low == b.low )
5314         && (    ( a.high == b.high )
5315              || (    ( a.low == 0 )
5316                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5317            );
5318
5319 }
5320
5321 /*
5322 -------------------------------------------------------------------------------
5323 Returns 1 if the quadruple-precision floating-point value `a' is less than
5324 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5325 cause an exception.  Otherwise, the comparison is performed according to the
5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 -------------------------------------------------------------------------------
5328 */
5329 flag float128_le_quiet( float128 a, float128 b )
5330 {
5331     flag aSign, bSign;
5332
5333     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5334               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5336               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5337        ) {
5338         if (    float128_is_signaling_nan( a )
5339              || float128_is_signaling_nan( b ) ) {
5340             float_raise( float_flag_invalid );
5341         }
5342         return 0;
5343     }
5344     aSign = extractFloat128Sign( a );
5345     bSign = extractFloat128Sign( b );
5346     if ( aSign != bSign ) {
5347         return
5348                aSign
5349             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5350                  == 0 );
5351     }
5352     return
5353           aSign ? le128( b.high, b.low, a.high, a.low )
5354         : le128( a.high, a.low, b.high, b.low );
5355
5356 }
5357
5358 /*
5359 -------------------------------------------------------------------------------
5360 Returns 1 if the quadruple-precision floating-point value `a' is less than
5361 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5362 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5363 Standard for Binary Floating-Point Arithmetic.
5364 -------------------------------------------------------------------------------
5365 */
5366 flag float128_lt_quiet( float128 a, float128 b )
5367 {
5368     flag aSign, bSign;
5369
5370     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5371               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5373               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5374        ) {
5375         if (    float128_is_signaling_nan( a )
5376              || float128_is_signaling_nan( b ) ) {
5377             float_raise( float_flag_invalid );
5378         }
5379         return 0;
5380     }
5381     aSign = extractFloat128Sign( a );
5382     bSign = extractFloat128Sign( b );
5383     if ( aSign != bSign ) {
5384         return
5385                aSign
5386             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387                  != 0 );
5388     }
5389     return
5390           aSign ? lt128( b.high, b.low, a.high, a.low )
5391         : lt128( a.high, a.low, b.high, b.low );
5392
5393 }
5394
5395 #endif
5396
5397
5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5399
5400 /*
5401  * These two routines are not part of the original softfloat distribution.
5402  *
5403  * They are based on the corresponding conversions to integer but return
5404  * unsigned numbers instead since these functions are required by GCC.
5405  *
5406  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
5407  *
5408  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5409  */
5410
5411 /*
5412 -------------------------------------------------------------------------------
5413 Returns the result of converting the double-precision floating-point value
5414 `a' to the 32-bit unsigned integer format.  The conversion is
5415 performed according to the IEC/IEEE Standard for Binary Floating-point
5416 Arithmetic, except that the conversion is always rounded toward zero.  If
5417 `a' is a NaN, the largest positive integer is returned.  If the conversion
5418 overflows, the largest integer positive is returned.
5419 -------------------------------------------------------------------------------
5420 */
5421 uint32 float64_to_uint32_round_to_zero( float64 a )
5422 {
5423     flag aSign;
5424     int16 aExp, shiftCount;
5425     bits64 aSig, savedASig;
5426     uint32 z;
5427
5428     aSig = extractFloat64Frac( a );
5429     aExp = extractFloat64Exp( a );
5430     aSign = extractFloat64Sign( a );
5431
5432     if (aSign) {
5433         float_raise( float_flag_invalid );
5434         return(0);
5435     }
5436
5437     if ( 0x41E < aExp ) {
5438         float_raise( float_flag_invalid );
5439         return 0xffffffff;
5440     }
5441     else if ( aExp < 0x3FF ) {
5442         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5443         return 0;
5444     }
5445     aSig |= LIT64( 0x0010000000000000 );
5446     shiftCount = 0x433 - aExp;
5447     savedASig = aSig;
5448     aSig >>= shiftCount;
5449     z = aSig;
5450     if ( ( aSig<<shiftCount ) != savedASig ) {
5451         float_exception_flags |= float_flag_inexact;
5452     }
5453     return z;
5454
5455 }
5456
5457 /*
5458 -------------------------------------------------------------------------------
5459 Returns the result of converting the single-precision floating-point value
5460 `a' to the 32-bit unsigned integer format.  The conversion is
5461 performed according to the IEC/IEEE Standard for Binary Floating-point
5462 Arithmetic, except that the conversion is always rounded toward zero.  If
5463 `a' is a NaN, the largest positive integer is returned.  If the conversion
5464 overflows, the largest positive integer is returned.
5465 -------------------------------------------------------------------------------
5466 */
5467 uint32 float32_to_uint32_round_to_zero( float32 a )
5468 {
5469     flag aSign;
5470     int16 aExp, shiftCount;
5471     bits32 aSig;
5472     uint32 z;
5473
5474     aSig = extractFloat32Frac( a );
5475     aExp = extractFloat32Exp( a );
5476     aSign = extractFloat32Sign( a );
5477     shiftCount = aExp - 0x9E;
5478
5479     if (aSign) {
5480         float_raise( float_flag_invalid );
5481         return(0);
5482     }
5483     if ( 0 < shiftCount ) {
5484         float_raise( float_flag_invalid );
5485         return 0xFFFFFFFF;
5486     }
5487     else if ( aExp <= 0x7E ) {
5488         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5489         return 0;
5490     }
5491     aSig = ( aSig | 0x800000 )<<8;
5492     z = aSig>>( - shiftCount );
5493     if ( aSig<<( shiftCount & 31 ) ) {
5494         float_exception_flags |= float_flag_inexact;
5495     }
5496     return z;
5497
5498 }
5499
5500 #endif