lib/libc/softfloat/bits64/softfloat.c

   1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
   2
   3 /*
   4  * This version hacked for use with gcc -msoft-float by bjh21.
   5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
   6  *  itself).
   7  */
   8
   9 /*
  10  * Things you may want to define:
  11  *
  12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
  13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
  14  *   properly renamed.
  15  */
  16
  17 /*
  18 ===============================================================================
  19
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 #include <sys/cdefs.h>
  48 __FBSDID("$FreeBSD$");
  49
  50 #ifdef SOFTFLOAT_FOR_GCC
  51 #include "softfloat-for-gcc.h"
  52 #endif
  53
  54 #include "milieu.h"
  55 #include "softfloat.h"
  56
  57 /*
  58  * Conversions between floats as stored in memory and floats as
  59  * SoftFloat uses them
  60  */
  61 #ifndef FLOAT64_DEMANGLE
  62 #define FLOAT64_DEMANGLE(a)     (a)
  63 #endif
  64 #ifndef FLOAT64_MANGLE
  65 #define FLOAT64_MANGLE(a)       (a)
  66 #endif
  67
  68 /*
  69 -------------------------------------------------------------------------------
  70 Floating-point rounding mode, extended double-precision rounding precision,
  71 and exception flags.
  72 -------------------------------------------------------------------------------
  73 */
  74 int float_rounding_mode = float_round_nearest_even;
  75 int float_exception_flags = 0;
  76 #ifdef FLOATX80
  77 int8 floatx80_rounding_precision = 80;
  78 #endif
  79
  80 /*
  81 -------------------------------------------------------------------------------
  82 Primitive arithmetic functions, including multi-word arithmetic, and
  83 division and square root approximations.  (Can be specialized to target if
  84 desired.)
  85 -------------------------------------------------------------------------------
  86 */
  87 #include "softfloat-macros"
  88
  89 /*
  90 -------------------------------------------------------------------------------
  91 Functions and definitions to determine:  (1) whether tininess for underflow
  92 is detected before or after rounding by default, (2) what (if anything)
  93 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  94 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  95 are propagated from function inputs to output.  These details are target-
  96 specific.
  97 -------------------------------------------------------------------------------
  98 */
  99 #include "softfloat-specialize"
 100
 101 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
 102 /*
 103 -------------------------------------------------------------------------------
 104 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 105 and 7, and returns the properly rounded 32-bit integer corresponding to the
 106 input.  If `zSign' is 1, the input is negated before being converted to an
 107 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 108 is simply rounded to an integer, with the inexact exception raised if the
 109 input cannot be represented exactly as an integer.  However, if the fixed-
 110 point input is too large, the invalid exception is raised and the largest
 111 positive or negative integer is returned.
 112 -------------------------------------------------------------------------------
 113 */
 114 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
 115 {
 116     int8 roundingMode;
 117     flag roundNearestEven;
 118     int8 roundIncrement, roundBits;
 119     int32 z;
 120
 121     roundingMode = float_rounding_mode;
 122     roundNearestEven = ( roundingMode == float_round_nearest_even );
 123     roundIncrement = 0x40;
 124     if ( ! roundNearestEven ) {
 125         if ( roundingMode == float_round_to_zero ) {
 126             roundIncrement = 0;
 127         }
 128         else {
 129             roundIncrement = 0x7F;
 130             if ( zSign ) {
 131                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 132             }
 133             else {
 134                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 135             }
 136         }
 137     }
 138     roundBits = absZ & 0x7F;
 139     absZ = ( absZ + roundIncrement )>>7;
 140     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 141     z = absZ;
 142     if ( zSign ) z = - z;
 143     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 144         float_raise( float_flag_invalid );
 145         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 146     }
 147     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 148     return z;
 149
 150 }
 151
 152 /*
 153 -------------------------------------------------------------------------------
 154 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 155 `absZ1', with binary point between bits 63 and 64 (between the input words),
 156 and returns the properly rounded 64-bit integer corresponding to the input.
 157 If `zSign' is 1, the input is negated before being converted to an integer.
 158 Ordinarily, the fixed-point input is simply rounded to an integer, with
 159 the inexact exception raised if the input cannot be represented exactly as
 160 an integer.  However, if the fixed-point input is too large, the invalid
 161 exception is raised and the largest positive or negative integer is
 162 returned.
 163 -------------------------------------------------------------------------------
 164 */
 165 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 166 {
 167     int8 roundingMode;
 168     flag roundNearestEven, increment;
 169     int64 z;
 170
 171     roundingMode = float_rounding_mode;
 172     roundNearestEven = ( roundingMode == float_round_nearest_even );
 173     increment = ( (sbits64) absZ1 < 0 );
 174     if ( ! roundNearestEven ) {
 175         if ( roundingMode == float_round_to_zero ) {
 176             increment = 0;
 177         }
 178         else {
 179             if ( zSign ) {
 180                 increment = ( roundingMode == float_round_down ) && absZ1;
 181             }
 182             else {
 183                 increment = ( roundingMode == float_round_up ) && absZ1;
 184             }
 185         }
 186     }
 187     if ( increment ) {
 188         ++absZ0;
 189         if ( absZ0 == 0 ) goto overflow;
 190         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 191     }
 192     z = absZ0;
 193     if ( zSign ) z = - z;
 194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 195  overflow:
 196         float_raise( float_flag_invalid );
 197         return
 198               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 199             : LIT64( 0x7FFFFFFFFFFFFFFF );
 200     }
 201     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
 202     return z;
 203
 204 }
 205 #endif
 206
 207 /*
 208 -------------------------------------------------------------------------------
 209 Returns the fraction bits of the single-precision floating-point value `a'.
 210 -------------------------------------------------------------------------------
 211 */
 212 INLINE bits32 extractFloat32Frac( float32 a )
 213 {
 214
 215     return a & 0x007FFFFF;
 216
 217 }
 218
 219 /*
 220 -------------------------------------------------------------------------------
 221 Returns the exponent bits of the single-precision floating-point value `a'.
 222 -------------------------------------------------------------------------------
 223 */
 224 INLINE int16 extractFloat32Exp( float32 a )
 225 {
 226
 227     return ( a>>23 ) & 0xFF;
 228
 229 }
 230
 231 /*
 232 -------------------------------------------------------------------------------
 233 Returns the sign bit of the single-precision floating-point value `a'.
 234 -------------------------------------------------------------------------------
 235 */
 236 INLINE flag extractFloat32Sign( float32 a )
 237 {
 238
 239     return a>>31;
 240
 241 }
 242
 243 /*
 244 -------------------------------------------------------------------------------
 245 Normalizes the subnormal single-precision floating-point value represented
 246 by the denormalized significand `aSig'.  The normalized exponent and
 247 significand are stored at the locations pointed to by `zExpPtr' and
 248 `zSigPtr', respectively.
 249 -------------------------------------------------------------------------------
 250 */
 251 static void
 252  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 253 {
 254     int8 shiftCount;
 255
 256     shiftCount = countLeadingZeros32( aSig ) - 8;
 257     *zSigPtr = aSig<<shiftCount;
 258     *zExpPtr = 1 - shiftCount;
 259
 260 }
 261
 262 /*
 263 -------------------------------------------------------------------------------
 264 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 265 single-precision floating-point value, returning the result.  After being
 266 shifted into the proper positions, the three fields are simply added
 267 together to form the result.  This means that any integer portion of `zSig'
 268 will be added into the exponent.  Since a properly normalized significand
 269 will have an integer portion equal to 1, the `zExp' input should be 1 less
 270 than the desired result exponent whenever `zSig' is a complete, normalized
 271 significand.
 272 -------------------------------------------------------------------------------
 273 */
 274 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 275 {
 276
 277     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 278
 279 }
 280
 281 /*
 282 -------------------------------------------------------------------------------
 283 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 284 and significand `zSig', and returns the proper single-precision floating-
 285 point value corresponding to the abstract input.  Ordinarily, the abstract
 286 value is simply rounded and packed into the single-precision format, with
 287 the inexact exception raised if the abstract input cannot be represented
 288 exactly.  However, if the abstract value is too large, the overflow and
 289 inexact exceptions are raised and an infinity or maximal finite value is
 290 returned.  If the abstract value is too small, the input value is rounded to
 291 a subnormal number, and the underflow and inexact exceptions are raised if
 292 the abstract input cannot be represented exactly as a subnormal single-
 293 precision floating-point number.
 294     The input significand `zSig' has its binary point between bits 30
 295 and 29, which is 7 bits to the left of the usual location.  This shifted
 296 significand must be normalized or smaller.  If `zSig' is not normalized,
 297 `zExp' must be 0; in that case, the result returned is a subnormal number,
 298 and it must not require rounding.  In the usual case that `zSig' is
 299 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 300 The handling of underflow and overflow follows the IEC/IEEE Standard for
 301 Binary Floating-Point Arithmetic.
 302 -------------------------------------------------------------------------------
 303 */
 304 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 305 {
 306     int8 roundingMode;
 307     flag roundNearestEven;
 308     int8 roundIncrement, roundBits;
 309     flag isTiny;
 310
 311     roundingMode = float_rounding_mode;
 312     roundNearestEven = ( roundingMode == float_round_nearest_even );
 313     roundIncrement = 0x40;
 314     if ( ! roundNearestEven ) {
 315         if ( roundingMode == float_round_to_zero ) {
 316             roundIncrement = 0;
 317         }
 318         else {
 319             roundIncrement = 0x7F;
 320             if ( zSign ) {
 321                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 322             }
 323             else {
 324                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 325             }
 326         }
 327     }
 328     roundBits = zSig & 0x7F;
 329     if ( 0xFD <= (bits16) zExp ) {
 330         if (    ( 0xFD < zExp )
 331              || (    ( zExp == 0xFD )
 332                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 333            ) {
 334             float_raise( float_flag_overflow | float_flag_inexact );
 335             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 336         }
 337         if ( zExp < 0 ) {
 338             isTiny =
 339                    ( float_detect_tininess == float_tininess_before_rounding )
 340                 || ( zExp < -1 )
 341                 || ( zSig + roundIncrement < 0x80000000 );
 342             shift32RightJamming( zSig, - zExp, &zSig );
 343             zExp = 0;
 344             roundBits = zSig & 0x7F;
 345             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 346         }
 347     }
 348     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 349     zSig = ( zSig + roundIncrement )>>7;
 350     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 351     if ( zSig == 0 ) zExp = 0;
 352     return packFloat32( zSign, zExp, zSig );
 353
 354 }
 355
 356 /*
 357 -------------------------------------------------------------------------------
 358 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 359 and significand `zSig', and returns the proper single-precision floating-
 360 point value corresponding to the abstract input.  This routine is just like
 361 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 362 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 363 floating-point exponent.
 364 -------------------------------------------------------------------------------
 365 */
 366 static float32
 367  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 368 {
 369     int8 shiftCount;
 370
 371     shiftCount = countLeadingZeros32( zSig ) - 1;
 372     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 373
 374 }
 375
 376 /*
 377 -------------------------------------------------------------------------------
 378 Returns the fraction bits of the double-precision floating-point value `a'.
 379 -------------------------------------------------------------------------------
 380 */
 381 INLINE bits64 extractFloat64Frac( float64 a )
 382 {
 383
 384     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
 385
 386 }
 387
 388 /*
 389 -------------------------------------------------------------------------------
 390 Returns the exponent bits of the double-precision floating-point value `a'.
 391 -------------------------------------------------------------------------------
 392 */
 393 INLINE int16 extractFloat64Exp( float64 a )
 394 {
 395
 396     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
 397
 398 }
 399
 400 /*
 401 -------------------------------------------------------------------------------
 402 Returns the sign bit of the double-precision floating-point value `a'.
 403 -------------------------------------------------------------------------------
 404 */
 405 INLINE flag extractFloat64Sign( float64 a )
 406 {
 407
 408     return FLOAT64_DEMANGLE(a)>>63;
 409
 410 }
 411
 412 /*
 413 -------------------------------------------------------------------------------
 414 Normalizes the subnormal double-precision floating-point value represented
 415 by the denormalized significand `aSig'.  The normalized exponent and
 416 significand are stored at the locations pointed to by `zExpPtr' and
 417 `zSigPtr', respectively.
 418 -------------------------------------------------------------------------------
 419 */
 420 static void
 421  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 422 {
 423     int8 shiftCount;
 424
 425     shiftCount = countLeadingZeros64( aSig ) - 11;
 426     *zSigPtr = aSig<<shiftCount;
 427     *zExpPtr = 1 - shiftCount;
 428
 429 }
 430
 431 /*
 432 -------------------------------------------------------------------------------
 433 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 434 double-precision floating-point value, returning the result.  After being
 435 shifted into the proper positions, the three fields are simply added
 436 together to form the result.  This means that any integer portion of `zSig'
 437 will be added into the exponent.  Since a properly normalized significand
 438 will have an integer portion equal to 1, the `zExp' input should be 1 less
 439 than the desired result exponent whenever `zSig' is a complete, normalized
 440 significand.
 441 -------------------------------------------------------------------------------
 442 */
 443 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 444 {
 445
 446     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
 447                            ( ( (bits64) zExp )<<52 ) + zSig );
 448
 449 }
 450
 451 /*
 452 -------------------------------------------------------------------------------
 453 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 454 and significand `zSig', and returns the proper double-precision floating-
 455 point value corresponding to the abstract input.  Ordinarily, the abstract
 456 value is simply rounded and packed into the double-precision format, with
 457 the inexact exception raised if the abstract input cannot be represented
 458 exactly.  However, if the abstract value is too large, the overflow and
 459 inexact exceptions are raised and an infinity or maximal finite value is
 460 returned.  If the abstract value is too small, the input value is rounded to
 461 a subnormal number, and the underflow and inexact exceptions are raised if
 462 the abstract input cannot be represented exactly as a subnormal double-
 463 precision floating-point number.
 464     The input significand `zSig' has its binary point between bits 62
 465 and 61, which is 10 bits to the left of the usual location.  This shifted
 466 significand must be normalized or smaller.  If `zSig' is not normalized,
 467 `zExp' must be 0; in that case, the result returned is a subnormal number,
 468 and it must not require rounding.  In the usual case that `zSig' is
 469 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 470 The handling of underflow and overflow follows the IEC/IEEE Standard for
 471 Binary Floating-Point Arithmetic.
 472 -------------------------------------------------------------------------------
 473 */
 474 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 475 {
 476     int8 roundingMode;
 477     flag roundNearestEven;
 478     int16 roundIncrement, roundBits;
 479     flag isTiny;
 480
 481     roundingMode = float_rounding_mode;
 482     roundNearestEven = ( roundingMode == float_round_nearest_even );
 483     roundIncrement = 0x200;
 484     if ( ! roundNearestEven ) {
 485         if ( roundingMode == float_round_to_zero ) {
 486             roundIncrement = 0;
 487         }
 488         else {
 489             roundIncrement = 0x3FF;
 490             if ( zSign ) {
 491                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 492             }
 493             else {
 494                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 495             }
 496         }
 497     }
 498     roundBits = zSig & 0x3FF;
 499     if ( 0x7FD <= (bits16) zExp ) {
 500         if (    ( 0x7FD < zExp )
 501              || (    ( zExp == 0x7FD )
 502                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 503            ) {
 504             float_raise( float_flag_overflow | float_flag_inexact );
 505             return FLOAT64_MANGLE(
 506                 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
 507                 ( roundIncrement == 0 ));
 508         }
 509         if ( zExp < 0 ) {
 510             isTiny =
 511                    ( float_detect_tininess == float_tininess_before_rounding )
 512                 || ( zExp < -1 )
 513                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 514             shift64RightJamming( zSig, - zExp, &zSig );
 515             zExp = 0;
 516             roundBits = zSig & 0x3FF;
 517             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 518         }
 519     }
 520     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 521     zSig = ( zSig + roundIncrement )>>10;
 522     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 523     if ( zSig == 0 ) zExp = 0;
 524     return packFloat64( zSign, zExp, zSig );
 525
 526 }
 527
 528 /*
 529 -------------------------------------------------------------------------------
 530 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 531 and significand `zSig', and returns the proper double-precision floating-
 532 point value corresponding to the abstract input.  This routine is just like
 533 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 534 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 535 floating-point exponent.
 536 -------------------------------------------------------------------------------
 537 */
 538 static float64
 539  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 540 {
 541     int8 shiftCount;
 542
 543     shiftCount = countLeadingZeros64( zSig ) - 1;
 544     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 545
 546 }
 547
 548 #ifdef FLOATX80
 549
 550 /*
 551 -------------------------------------------------------------------------------
 552 Returns the fraction bits of the extended double-precision floating-point
 553 value `a'.
 554 -------------------------------------------------------------------------------
 555 */
 556 INLINE bits64 extractFloatx80Frac( floatx80 a )
 557 {
 558
 559     return a.low;
 560
 561 }
 562
 563 /*
 564 -------------------------------------------------------------------------------
 565 Returns the exponent bits of the extended double-precision floating-point
 566 value `a'.
 567 -------------------------------------------------------------------------------
 568 */
 569 INLINE int32 extractFloatx80Exp( floatx80 a )
 570 {
 571
 572     return a.high & 0x7FFF;
 573
 574 }
 575
 576 /*
 577 -------------------------------------------------------------------------------
 578 Returns the sign bit of the extended double-precision floating-point value
 579 `a'.
 580 -------------------------------------------------------------------------------
 581 */
 582 INLINE flag extractFloatx80Sign( floatx80 a )
 583 {
 584
 585     return a.high>>15;
 586
 587 }
 588
 589 /*
 590 -------------------------------------------------------------------------------
 591 Normalizes the subnormal extended double-precision floating-point value
 592 represented by the denormalized significand `aSig'.  The normalized exponent
 593 and significand are stored at the locations pointed to by `zExpPtr' and
 594 `zSigPtr', respectively.
 595 -------------------------------------------------------------------------------
 596 */
 597 static void
 598  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 599 {
 600     int8 shiftCount;
 601
 602     shiftCount = countLeadingZeros64( aSig );
 603     *zSigPtr = aSig<<shiftCount;
 604     *zExpPtr = 1 - shiftCount;
 605
 606 }
 607
 608 /*
 609 -------------------------------------------------------------------------------
 610 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 611 extended double-precision floating-point value, returning the result.
 612 -------------------------------------------------------------------------------
 613 */
 614 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 615 {
 616     floatx80 z;
 617
 618     z.low = zSig;
 619     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 620     return z;
 621
 622 }
 623
 624 /*
 625 -------------------------------------------------------------------------------
 626 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 627 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 628 and returns the proper extended double-precision floating-point value
 629 corresponding to the abstract input.  Ordinarily, the abstract value is
 630 rounded and packed into the extended double-precision format, with the
 631 inexact exception raised if the abstract input cannot be represented
 632 exactly.  However, if the abstract value is too large, the overflow and
 633 inexact exceptions are raised and an infinity or maximal finite value is
 634 returned.  If the abstract value is too small, the input value is rounded to
 635 a subnormal number, and the underflow and inexact exceptions are raised if
 636 the abstract input cannot be represented exactly as a subnormal extended
 637 double-precision floating-point number.
 638     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 639 number of bits as single or double precision, respectively.  Otherwise, the
 640 result is rounded to the full precision of the extended double-precision
 641 format.
 642     The input significand must be normalized or smaller.  If the input
 643 significand is not normalized, `zExp' must be 0; in that case, the result
 644 returned is a subnormal number, and it must not require rounding.  The
 645 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 646 Floating-Point Arithmetic.
 647 -------------------------------------------------------------------------------
 648 */
 649 static floatx80
 650  roundAndPackFloatx80(
 651      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 652  )
 653 {
 654     int8 roundingMode;
 655     flag roundNearestEven, increment, isTiny;
 656     int64 roundIncrement, roundMask, roundBits;
 657
 658     roundingMode = float_rounding_mode;
 659     roundNearestEven = ( roundingMode == float_round_nearest_even );
 660     if ( roundingPrecision == 80 ) goto precision80;
 661     if ( roundingPrecision == 64 ) {
 662         roundIncrement = LIT64( 0x0000000000000400 );
 663         roundMask = LIT64( 0x00000000000007FF );
 664     }
 665     else if ( roundingPrecision == 32 ) {
 666         roundIncrement = LIT64( 0x0000008000000000 );
 667         roundMask = LIT64( 0x000000FFFFFFFFFF );
 668     }
 669     else {
 670         goto precision80;
 671     }
 672     zSig0 |= ( zSig1 != 0 );
 673     if ( ! roundNearestEven ) {
 674         if ( roundingMode == float_round_to_zero ) {
 675             roundIncrement = 0;
 676         }
 677         else {
 678             roundIncrement = roundMask;
 679             if ( zSign ) {
 680                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 681             }
 682             else {
 683                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 684             }
 685         }
 686     }
 687     roundBits = zSig0 & roundMask;
 688     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 689         if (    ( 0x7FFE < zExp )
 690              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 691            ) {
 692             goto overflow;
 693         }
 694         if ( zExp <= 0 ) {
 695             isTiny =
 696                    ( float_detect_tininess == float_tininess_before_rounding )
 697                 || ( zExp < 0 )
 698                 || ( zSig0 <= zSig0 + roundIncrement );
 699             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 700             zExp = 0;
 701             roundBits = zSig0 & roundMask;
 702             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 703             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 704             zSig0 += roundIncrement;
 705             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 706             roundIncrement = roundMask + 1;
 707             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 708                 roundMask |= roundIncrement;
 709             }
 710             zSig0 &= ~ roundMask;
 711             return packFloatx80( zSign, zExp, zSig0 );
 712         }
 713     }
 714     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 715     zSig0 += roundIncrement;
 716     if ( zSig0 < roundIncrement ) {
 717         ++zExp;
 718         zSig0 = LIT64( 0x8000000000000000 );
 719     }
 720     roundIncrement = roundMask + 1;
 721     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 722         roundMask |= roundIncrement;
 723     }
 724     zSig0 &= ~ roundMask;
 725     if ( zSig0 == 0 ) zExp = 0;
 726     return packFloatx80( zSign, zExp, zSig0 );
 727  precision80:
 728     increment = ( (sbits64) zSig1 < 0 );
 729     if ( ! roundNearestEven ) {
 730         if ( roundingMode == float_round_to_zero ) {
 731             increment = 0;
 732         }
 733         else {
 734             if ( zSign ) {
 735                 increment = ( roundingMode == float_round_down ) && zSig1;
 736             }
 737             else {
 738                 increment = ( roundingMode == float_round_up ) && zSig1;
 739             }
 740         }
 741     }
 742     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 743         if (    ( 0x7FFE < zExp )
 744              || (    ( zExp == 0x7FFE )
 745                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 746                   && increment
 747                 )
 748            ) {
 749             roundMask = 0;
 750  overflow:
 751             float_raise( float_flag_overflow | float_flag_inexact );
 752             if (    ( roundingMode == float_round_to_zero )
 753                  || ( zSign && ( roundingMode == float_round_up ) )
 754                  || ( ! zSign && ( roundingMode == float_round_down ) )
 755                ) {
 756                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 757             }
 758             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 759         }
 760         if ( zExp <= 0 ) {
 761             isTiny =
 762                    ( float_detect_tininess == float_tininess_before_rounding )
 763                 || ( zExp < 0 )
 764                 || ! increment
 765                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 766             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 767             zExp = 0;
 768             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 769             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 770             if ( roundNearestEven ) {
 771                 increment = ( (sbits64) zSig1 < 0 );
 772             }
 773             else {
 774                 if ( zSign ) {
 775                     increment = ( roundingMode == float_round_down ) && zSig1;
 776                 }
 777                 else {
 778                     increment = ( roundingMode == float_round_up ) && zSig1;
 779                 }
 780             }
 781             if ( increment ) {
 782                 ++zSig0;
 783                 zSig0 &=
 784                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 785                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 786             }
 787             return packFloatx80( zSign, zExp, zSig0 );
 788         }
 789     }
 790     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 791     if ( increment ) {
 792         ++zSig0;
 793         if ( zSig0 == 0 ) {
 794             ++zExp;
 795             zSig0 = LIT64( 0x8000000000000000 );
 796         }
 797         else {
 798             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 799         }
 800     }
 801     else {
 802         if ( zSig0 == 0 ) zExp = 0;
 803     }
 804     return packFloatx80( zSign, zExp, zSig0 );
 805
 806 }
 807
 808 /*
 809 -------------------------------------------------------------------------------
 810 Takes an abstract floating-point value having sign `zSign', exponent
 811 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 812 and returns the proper extended double-precision floating-point value
 813 corresponding to the abstract input.  This routine is just like
 814 `roundAndPackFloatx80' except that the input significand does not have to be
 815 normalized.
 816 -------------------------------------------------------------------------------
 817 */
 818 static floatx80
 819  normalizeRoundAndPackFloatx80(
 820      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 821  )
 822 {
 823     int8 shiftCount;
 824
 825     if ( zSig0 == 0 ) {
 826         zSig0 = zSig1;
 827         zSig1 = 0;
 828         zExp -= 64;
 829     }
 830     shiftCount = countLeadingZeros64( zSig0 );
 831     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 832     zExp -= shiftCount;
 833     return
 834         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 835
 836 }
 837
 838 #endif
 839
 840 #ifdef FLOAT128
 841
 842 /*
 843 -------------------------------------------------------------------------------
 844 Returns the least-significant 64 fraction bits of the quadruple-precision
 845 floating-point value `a'.
 846 -------------------------------------------------------------------------------
 847 */
 848 INLINE bits64 extractFloat128Frac1( float128 a )
 849 {
 850
 851     return a.low;
 852
 853 }
 854
 855 /*
 856 -------------------------------------------------------------------------------
 857 Returns the most-significant 48 fraction bits of the quadruple-precision
 858 floating-point value `a'.
 859 -------------------------------------------------------------------------------
 860 */
 861 INLINE bits64 extractFloat128Frac0( float128 a )
 862 {
 863
 864     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 865
 866 }
 867
 868 /*
 869 -------------------------------------------------------------------------------
 870 Returns the exponent bits of the quadruple-precision floating-point value
 871 `a'.
 872 -------------------------------------------------------------------------------
 873 */
 874 INLINE int32 extractFloat128Exp( float128 a )
 875 {
 876
 877     return ( a.high>>48 ) & 0x7FFF;
 878
 879 }
 880
 881 /*
 882 -------------------------------------------------------------------------------
 883 Returns the sign bit of the quadruple-precision floating-point value `a'.
 884 -------------------------------------------------------------------------------
 885 */
 886 INLINE flag extractFloat128Sign( float128 a )
 887 {
 888
 889     return a.high>>63;
 890
 891 }
 892
 893 /*
 894 -------------------------------------------------------------------------------
 895 Normalizes the subnormal quadruple-precision floating-point value
 896 represented by the denormalized significand formed by the concatenation of
 897 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 898 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 899 significand are stored at the location pointed to by `zSig0Ptr', and the
 900 least significant 64 bits of the normalized significand are stored at the
 901 location pointed to by `zSig1Ptr'.
 902 -------------------------------------------------------------------------------
 903 */
 904 static void
 905  normalizeFloat128Subnormal(
 906      bits64 aSig0,
 907      bits64 aSig1,
 908      int32 *zExpPtr,
 909      bits64 *zSig0Ptr,
 910      bits64 *zSig1Ptr
 911  )
 912 {
 913     int8 shiftCount;
 914
 915     if ( aSig0 == 0 ) {
 916         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 917         if ( shiftCount < 0 ) {
 918             *zSig0Ptr = aSig1>>( - shiftCount );
 919             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 920         }
 921         else {
 922             *zSig0Ptr = aSig1<<shiftCount;
 923             *zSig1Ptr = 0;
 924         }
 925         *zExpPtr = - shiftCount - 63;
 926     }
 927     else {
 928         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 929         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 930         *zExpPtr = 1 - shiftCount;
 931     }
 932
 933 }
 934
 935 /*
 936 -------------------------------------------------------------------------------
 937 Packs the sign `zSign', the exponent `zExp', and the significand formed
 938 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 939 floating-point value, returning the result.  After being shifted into the
 940 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 941 added together to form the most significant 32 bits of the result.  This
 942 means that any integer portion of `zSig0' will be added into the exponent.
 943 Since a properly normalized significand will have an integer portion equal
 944 to 1, the `zExp' input should be 1 less than the desired result exponent
 945 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 946 significand.
 947 -------------------------------------------------------------------------------
 948 */
 949 INLINE float128
 950  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 951 {
 952     float128 z;
 953
 954     z.low = zSig1;
 955     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 956     return z;
 957
 958 }
 959
 960 /*
 961 -------------------------------------------------------------------------------
 962 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 963 and extended significand formed by the concatenation of `zSig0', `zSig1',
 964 and `zSig2', and returns the proper quadruple-precision floating-point value
 965 corresponding to the abstract input.  Ordinarily, the abstract value is
 966 simply rounded and packed into the quadruple-precision format, with the
 967 inexact exception raised if the abstract input cannot be represented
 968 exactly.  However, if the abstract value is too large, the overflow and
 969 inexact exceptions are raised and an infinity or maximal finite value is
 970 returned.  If the abstract value is too small, the input value is rounded to
 971 a subnormal number, and the underflow and inexact exceptions are raised if
 972 the abstract input cannot be represented exactly as a subnormal quadruple-
 973 precision floating-point number.
 974     The input significand must be normalized or smaller.  If the input
 975 significand is not normalized, `zExp' must be 0; in that case, the result
 976 returned is a subnormal number, and it must not require rounding.  In the
 977 usual case that the input significand is normalized, `zExp' must be 1 less
 978 than the ``true'' floating-point exponent.  The handling of underflow and
 979 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 980 -------------------------------------------------------------------------------
 981 */
 982 static float128
 983  roundAndPackFloat128(
 984      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 985 {
 986     int8 roundingMode;
 987     flag roundNearestEven, increment, isTiny;
 988
 989     roundingMode = float_rounding_mode;
 990     roundNearestEven = ( roundingMode == float_round_nearest_even );
 991     increment = ( (sbits64) zSig2 < 0 );
 992     if ( ! roundNearestEven ) {
 993         if ( roundingMode == float_round_to_zero ) {
 994             increment = 0;
 995         }
 996         else {
 997             if ( zSign ) {
 998                 increment = ( roundingMode == float_round_down ) && zSig2;
 999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (bits32) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact );
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             isTiny =
1034                    ( float_detect_tininess == float_tininess_before_rounding )
1035                 || ( zExp < -1 )
1036                 || ! increment
1037                 || lt128(
1038                        zSig0,
1039                        zSig1,
1040                        LIT64( 0x0001FFFFFFFFFFFF ),
1041                        LIT64( 0xFFFFFFFFFFFFFFFF )
1042                    );
1043             shift128ExtraRightJamming(
1044                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1045             zExp = 0;
1046             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1047             if ( roundNearestEven ) {
1048                 increment = ( (sbits64) zSig2 < 0 );
1049             }
1050             else {
1051                 if ( zSign ) {
1052                     increment = ( roundingMode == float_round_down ) && zSig2;
1053                 }
1054                 else {
1055                     increment = ( roundingMode == float_round_up ) && zSig2;
1056                 }
1057             }
1058         }
1059     }
1060     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1061     if ( increment ) {
1062         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1063         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1064     }
1065     else {
1066         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1067     }
1068     return packFloat128( zSign, zExp, zSig0, zSig1 );
1069
1070 }
1071
1072 /*
1073 -------------------------------------------------------------------------------
1074 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 returns the proper quadruple-precision floating-point value corresponding
1077 to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 except that the input significand has fewer bits and does not have to be
1079 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 point exponent.
1081 -------------------------------------------------------------------------------
1082 */
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1086 {
1087     int8 shiftCount;
1088     bits64 zSig2;
1089
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1106
1107 }
1108
1109 #endif
1110
1111 /*
1112 -------------------------------------------------------------------------------
1113 Returns the result of converting the 32-bit two's complement integer `a'
1114 to the single-precision floating-point format.  The conversion is performed
1115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1116 -------------------------------------------------------------------------------
1117 */
1118 float32 int32_to_float32( int32 a )
1119 {
1120     flag zSign;
1121
1122     if ( a == 0 ) return 0;
1123     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1124     zSign = ( a < 0 );
1125     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1126
1127 }
1128
1129 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */
1130 float32 uint32_to_float32( uint32 a )
1131 {
1132     if ( a == 0 ) return 0;
1133     if ( a & (bits32) 0x80000000 )
1134         return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1135     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1136 }
1137 #endif
1138
1139
1140 /*
1141 -------------------------------------------------------------------------------
1142 Returns the result of converting the 32-bit two's complement integer `a'
1143 to the double-precision floating-point format.  The conversion is performed
1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1145 -------------------------------------------------------------------------------
1146 */
1147 float64 int32_to_float64( int32 a )
1148 {
1149     flag zSign;
1150     uint32 absA;
1151     int8 shiftCount;
1152     bits64 zSig;
1153
1154     if ( a == 0 ) return 0;
1155     zSign = ( a < 0 );
1156     absA = zSign ? - a : a;
1157     shiftCount = countLeadingZeros32( absA ) + 21;
1158     zSig = absA;
1159     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1160
1161 }
1162
1163 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */
1164 float64 uint32_to_float64( uint32 a )
1165 {
1166     int8 shiftCount;
1167     bits64 zSig = a;
1168
1169     if ( a == 0 ) return 0;
1170     shiftCount = countLeadingZeros32( a ) + 21;
1171     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1172
1173 }
1174 #endif
1175
1176 #ifdef FLOATX80
1177
1178 /*
1179 -------------------------------------------------------------------------------
1180 Returns the result of converting the 32-bit two's complement integer `a'
1181 to the extended double-precision floating-point format.  The conversion
1182 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1183 Arithmetic.
1184 -------------------------------------------------------------------------------
1185 */
1186 floatx80 int32_to_floatx80( int32 a )
1187 {
1188     flag zSign;
1189     uint32 absA;
1190     int8 shiftCount;
1191     bits64 zSig;
1192
1193     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1194     zSign = ( a < 0 );
1195     absA = zSign ? - a : a;
1196     shiftCount = countLeadingZeros32( absA ) + 32;
1197     zSig = absA;
1198     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1199
1200 }
1201
1202 floatx80 uint32_to_floatx80( uint32 a )
1203 {
1204     int8 shiftCount;
1205     bits64 zSig = a;
1206
1207     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1208     shiftCount = countLeadingZeros32( a ) + 32;
1209     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1210
1211 }
1212
1213 #endif
1214
1215 #ifdef FLOAT128
1216
1217 /*
1218 -------------------------------------------------------------------------------
1219 Returns the result of converting the 32-bit two's complement integer `a' to
1220 the quadruple-precision floating-point format.  The conversion is performed
1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1222 -------------------------------------------------------------------------------
1223 */
1224 float128 int32_to_float128( int32 a )
1225 {
1226     flag zSign;
1227     uint32 absA;
1228     int8 shiftCount;
1229     bits64 zSig0;
1230
1231     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1232     zSign = ( a < 0 );
1233     absA = zSign ? - a : a;
1234     shiftCount = countLeadingZeros32( absA ) + 17;
1235     zSig0 = absA;
1236     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1237
1238 }
1239
1240 float128 uint32_to_float128( uint32 a )
1241 {
1242     int8 shiftCount;
1243     bits64 zSig0 = a;
1244
1245     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1246     shiftCount = countLeadingZeros32( a ) + 17;
1247     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1248
1249 }
1250
1251 #endif
1252
1253 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1254 /*
1255 -------------------------------------------------------------------------------
1256 Returns the result of converting the 64-bit two's complement integer `a'
1257 to the single-precision floating-point format.  The conversion is performed
1258 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 -------------------------------------------------------------------------------
1260 */
1261 float32 int64_to_float32( int64 a )
1262 {
1263     flag zSign;
1264     uint64 absA;
1265     int8 shiftCount;
1266
1267     if ( a == 0 ) return 0;
1268     zSign = ( a < 0 );
1269     absA = zSign ? - a : a;
1270     shiftCount = countLeadingZeros64( absA ) - 40;
1271     if ( 0 <= shiftCount ) {
1272         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1273     }
1274     else {
1275         shiftCount += 7;
1276         if ( shiftCount < 0 ) {
1277             shift64RightJamming( absA, - shiftCount, &absA );
1278         }
1279         else {
1280             absA <<= shiftCount;
1281         }
1282         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1283     }
1284
1285 }
1286
1287 /*
1288 -------------------------------------------------------------------------------
1289 Returns the result of converting the 64-bit two's complement integer `a'
1290 to the double-precision floating-point format.  The conversion is performed
1291 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1292 -------------------------------------------------------------------------------
1293 */
1294 float64 int64_to_float64( int64 a )
1295 {
1296     flag zSign;
1297
1298     if ( a == 0 ) return 0;
1299     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1300         return packFloat64( 1, 0x43E, 0 );
1301     }
1302     zSign = ( a < 0 );
1303     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1304
1305 }
1306
1307 #ifdef FLOATX80
1308
1309 /*
1310 -------------------------------------------------------------------------------
1311 Returns the result of converting the 64-bit two's complement integer `a'
1312 to the extended double-precision floating-point format.  The conversion
1313 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1314 Arithmetic.
1315 -------------------------------------------------------------------------------
1316 */
1317 floatx80 int64_to_floatx80( int64 a )
1318 {
1319     flag zSign;
1320     uint64 absA;
1321     int8 shiftCount;
1322
1323     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA );
1327     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1328
1329 }
1330
1331 #endif
1332
1333 #endif /* !SOFTFLOAT_FOR_GCC */
1334
1335 #ifdef FLOAT128
1336
1337 /*
1338 -------------------------------------------------------------------------------
1339 Returns the result of converting the 64-bit two's complement integer `a' to
1340 the quadruple-precision floating-point format.  The conversion is performed
1341 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1342 -------------------------------------------------------------------------------
1343 */
1344 float128 int64_to_float128( int64 a )
1345 {
1346     flag zSign;
1347     uint64 absA;
1348     int8 shiftCount;
1349     int32 zExp;
1350     bits64 zSig0, zSig1;
1351
1352     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1353     zSign = ( a < 0 );
1354     absA = zSign ? - a : a;
1355     shiftCount = countLeadingZeros64( absA ) + 49;
1356     zExp = 0x406E - shiftCount;
1357     if ( 64 <= shiftCount ) {
1358         zSig1 = 0;
1359         zSig0 = absA;
1360         shiftCount -= 64;
1361     }
1362     else {
1363         zSig1 = absA;
1364         zSig0 = 0;
1365     }
1366     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1367     return packFloat128( zSign, zExp, zSig0, zSig1 );
1368
1369 }
1370
1371 #endif
1372
1373 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1374 /*
1375 -------------------------------------------------------------------------------
1376 Returns the result of converting the single-precision floating-point value
1377 `a' to the 32-bit two's complement integer format.  The conversion is
1378 performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 Arithmetic---which means in particular that the conversion is rounded
1380 according to the current rounding mode.  If `a' is a NaN, the largest
1381 positive integer is returned.  Otherwise, if the conversion overflows, the
1382 largest integer with the same sign as `a' is returned.
1383 -------------------------------------------------------------------------------
1384 */
1385 int32 float32_to_int32( float32 a )
1386 {
1387     flag aSign;
1388     int16 aExp, shiftCount;
1389     bits32 aSig;
1390     bits64 aSig64;
1391
1392     aSig = extractFloat32Frac( a );
1393     aExp = extractFloat32Exp( a );
1394     aSign = extractFloat32Sign( a );
1395     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1396     if ( aExp ) aSig |= 0x00800000;
1397     shiftCount = 0xAF - aExp;
1398     aSig64 = aSig;
1399     aSig64 <<= 32;
1400     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1401     return roundAndPackInt32( aSign, aSig64 );
1402
1403 }
1404 #endif /* !SOFTFLOAT_FOR_GCC */
1405
1406 /*
1407 -------------------------------------------------------------------------------
1408 Returns the result of converting the single-precision floating-point value
1409 `a' to the 32-bit two's complement integer format.  The conversion is
1410 performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 Arithmetic, except that the conversion is always rounded toward zero.
1412 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1413 the conversion overflows, the largest integer with the same sign as `a' is
1414 returned.
1415 -------------------------------------------------------------------------------
1416 */
1417 int32 float32_to_int32_round_to_zero( float32 a )
1418 {
1419     flag aSign;
1420     int16 aExp, shiftCount;
1421     bits32 aSig;
1422     int32 z;
1423
1424     aSig = extractFloat32Frac( a );
1425     aExp = extractFloat32Exp( a );
1426     aSign = extractFloat32Sign( a );
1427     shiftCount = aExp - 0x9E;
1428     if ( 0 <= shiftCount ) {
1429         if ( a != 0xCF000000 ) {
1430             float_raise( float_flag_invalid );
1431             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1432         }
1433         return (sbits32) 0x80000000;
1434     }
1435     else if ( aExp <= 0x7E ) {
1436         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1437         return 0;
1438     }
1439     aSig = ( aSig | 0x00800000 )<<8;
1440     z = aSig>>( - shiftCount );
1441     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1442         float_exception_flags |= float_flag_inexact;
1443     }
1444     if ( aSign ) z = - z;
1445     return z;
1446
1447 }
1448
1449 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1450 /*
1451 -------------------------------------------------------------------------------
1452 Returns the result of converting the single-precision floating-point value
1453 `a' to the 64-bit two's complement integer format.  The conversion is
1454 performed according to the IEC/IEEE Standard for Binary Floating-Point
1455 Arithmetic---which means in particular that the conversion is rounded
1456 according to the current rounding mode.  If `a' is a NaN, the largest
1457 positive integer is returned.  Otherwise, if the conversion overflows, the
1458 largest integer with the same sign as `a' is returned.
1459 -------------------------------------------------------------------------------
1460 */
1461 int64 float32_to_int64( float32 a )
1462 {
1463     flag aSign;
1464     int16 aExp, shiftCount;
1465     bits32 aSig;
1466     bits64 aSig64, aSigExtra;
1467
1468     aSig = extractFloat32Frac( a );
1469     aExp = extractFloat32Exp( a );
1470     aSign = extractFloat32Sign( a );
1471     shiftCount = 0xBE - aExp;
1472     if ( shiftCount < 0 ) {
1473         float_raise( float_flag_invalid );
1474         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1475             return LIT64( 0x7FFFFFFFFFFFFFFF );
1476         }
1477         return (sbits64) LIT64( 0x8000000000000000 );
1478     }
1479     if ( aExp ) aSig |= 0x00800000;
1480     aSig64 = aSig;
1481     aSig64 <<= 40;
1482     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1483     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1484
1485 }
1486
1487 /*
1488 -------------------------------------------------------------------------------
1489 Returns the result of converting the single-precision floating-point value
1490 `a' to the 64-bit two's complement integer format.  The conversion is
1491 performed according to the IEC/IEEE Standard for Binary Floating-Point
1492 Arithmetic, except that the conversion is always rounded toward zero.  If
1493 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1494 conversion overflows, the largest integer with the same sign as `a' is
1495 returned.
1496 -------------------------------------------------------------------------------
1497 */
1498 int64 float32_to_int64_round_to_zero( float32 a )
1499 {
1500     flag aSign;
1501     int16 aExp, shiftCount;
1502     bits32 aSig;
1503     bits64 aSig64;
1504     int64 z;
1505
1506     aSig = extractFloat32Frac( a );
1507     aExp = extractFloat32Exp( a );
1508     aSign = extractFloat32Sign( a );
1509     shiftCount = aExp - 0xBE;
1510     if ( 0 <= shiftCount ) {
1511         if ( a != 0xDF000000 ) {
1512             float_raise( float_flag_invalid );
1513             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1514                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1515             }
1516         }
1517         return (sbits64) LIT64( 0x8000000000000000 );
1518     }
1519     else if ( aExp <= 0x7E ) {
1520         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1521         return 0;
1522     }
1523     aSig64 = aSig | 0x00800000;
1524     aSig64 <<= 40;
1525     z = aSig64>>( - shiftCount );
1526     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1527         float_exception_flags |= float_flag_inexact;
1528     }
1529     if ( aSign ) z = - z;
1530     return z;
1531
1532 }
1533 #endif /* !SOFTFLOAT_FOR_GCC */
1534
1535 /*
1536 -------------------------------------------------------------------------------
1537 Returns the result of converting the single-precision floating-point value
1538 `a' to the double-precision floating-point format.  The conversion is
1539 performed according to the IEC/IEEE Standard for Binary Floating-Point
1540 Arithmetic.
1541 -------------------------------------------------------------------------------
1542 */
1543 float64 float32_to_float64( float32 a )
1544 {
1545     flag aSign;
1546     int16 aExp;
1547     bits32 aSig;
1548
1549     aSig = extractFloat32Frac( a );
1550     aExp = extractFloat32Exp( a );
1551     aSign = extractFloat32Sign( a );
1552     if ( aExp == 0xFF ) {
1553         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1554         return packFloat64( aSign, 0x7FF, 0 );
1555     }
1556     if ( aExp == 0 ) {
1557         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1558         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1559         --aExp;
1560     }
1561     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1562
1563 }
1564
1565 #ifdef FLOATX80
1566
1567 /*
1568 -------------------------------------------------------------------------------
1569 Returns the result of converting the single-precision floating-point value
1570 `a' to the extended double-precision floating-point format.  The conversion
1571 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1572 Arithmetic.
1573 -------------------------------------------------------------------------------
1574 */
1575 floatx80 float32_to_floatx80( float32 a )
1576 {
1577     flag aSign;
1578     int16 aExp;
1579     bits32 aSig;
1580
1581     aSig = extractFloat32Frac( a );
1582     aExp = extractFloat32Exp( a );
1583     aSign = extractFloat32Sign( a );
1584     if ( aExp == 0xFF ) {
1585         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1586         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1587     }
1588     if ( aExp == 0 ) {
1589         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1590         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1591     }
1592     aSig |= 0x00800000;
1593     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1594
1595 }
1596
1597 #endif
1598
1599 #ifdef FLOAT128
1600
1601 /*
1602 -------------------------------------------------------------------------------
1603 Returns the result of converting the single-precision floating-point value
1604 `a' to the double-precision floating-point format.  The conversion is
1605 performed according to the IEC/IEEE Standard for Binary Floating-Point
1606 Arithmetic.
1607 -------------------------------------------------------------------------------
1608 */
1609 float128 float32_to_float128( float32 a )
1610 {
1611     flag aSign;
1612     int16 aExp;
1613     bits32 aSig;
1614
1615     aSig = extractFloat32Frac( a );
1616     aExp = extractFloat32Exp( a );
1617     aSign = extractFloat32Sign( a );
1618     if ( aExp == 0xFF ) {
1619         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1620         return packFloat128( aSign, 0x7FFF, 0, 0 );
1621     }
1622     if ( aExp == 0 ) {
1623         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1624         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1625         --aExp;
1626     }
1627     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1628
1629 }
1630
1631 #endif
1632
1633 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1634 /*
1635 -------------------------------------------------------------------------------
1636 Rounds the single-precision floating-point value `a' to an integer, and
1637 returns the result as a single-precision floating-point value.  The
1638 operation is performed according to the IEC/IEEE Standard for Binary
1639 Floating-Point Arithmetic.
1640 -------------------------------------------------------------------------------
1641 */
1642 float32 float32_round_to_int( float32 a )
1643 {
1644     flag aSign;
1645     int16 aExp;
1646     bits32 lastBitMask, roundBitsMask;
1647     int8 roundingMode;
1648     float32 z;
1649
1650     aExp = extractFloat32Exp( a );
1651     if ( 0x96 <= aExp ) {
1652         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1653             return propagateFloat32NaN( a, a );
1654         }
1655         return a;
1656     }
1657     if ( aExp <= 0x7E ) {
1658         if ( (bits32) ( a<<1 ) == 0 ) return a;
1659         float_exception_flags |= float_flag_inexact;
1660         aSign = extractFloat32Sign( a );
1661         switch ( float_rounding_mode ) {
1662          case float_round_nearest_even:
1663             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1664                 return packFloat32( aSign, 0x7F, 0 );
1665             }
1666             break;
1667          case float_round_to_zero:
1668             break;
1669          case float_round_down:
1670             return aSign ? 0xBF800000 : 0;
1671          case float_round_up:
1672             return aSign ? 0x80000000 : 0x3F800000;
1673         }
1674         return packFloat32( aSign, 0, 0 );
1675     }
1676     lastBitMask = 1;
1677     lastBitMask <<= 0x96 - aExp;
1678     roundBitsMask = lastBitMask - 1;
1679     z = a;
1680     roundingMode = float_rounding_mode;
1681     if ( roundingMode == float_round_nearest_even ) {
1682         z += lastBitMask>>1;
1683         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1684     }
1685     else if ( roundingMode != float_round_to_zero ) {
1686         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1687             z += roundBitsMask;
1688         }
1689     }
1690     z &= ~ roundBitsMask;
1691     if ( z != a ) float_exception_flags |= float_flag_inexact;
1692     return z;
1693
1694 }
1695 #endif /* !SOFTFLOAT_FOR_GCC */
1696
1697 /*
1698 -------------------------------------------------------------------------------
1699 Returns the result of adding the absolute values of the single-precision
1700 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1701 before being returned.  `zSign' is ignored if the result is a NaN.
1702 The addition is performed according to the IEC/IEEE Standard for Binary
1703 Floating-Point Arithmetic.
1704 -------------------------------------------------------------------------------
1705 */
1706 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1707 {
1708     int16 aExp, bExp, zExp;
1709     bits32 aSig, bSig, zSig;
1710     int16 expDiff;
1711
1712     aSig = extractFloat32Frac( a );
1713     aExp = extractFloat32Exp( a );
1714     bSig = extractFloat32Frac( b );
1715     bExp = extractFloat32Exp( b );
1716     expDiff = aExp - bExp;
1717     aSig <<= 6;
1718     bSig <<= 6;
1719     if ( 0 < expDiff ) {
1720         if ( aExp == 0xFF ) {
1721             if ( aSig ) return propagateFloat32NaN( a, b );
1722             return a;
1723         }
1724         if ( bExp == 0 ) {
1725             --expDiff;
1726         }
1727         else {
1728             bSig |= 0x20000000;
1729         }
1730         shift32RightJamming( bSig, expDiff, &bSig );
1731         zExp = aExp;
1732     }
1733     else if ( expDiff < 0 ) {
1734         if ( bExp == 0xFF ) {
1735             if ( bSig ) return propagateFloat32NaN( a, b );
1736             return packFloat32( zSign, 0xFF, 0 );
1737         }
1738         if ( aExp == 0 ) {
1739             ++expDiff;
1740         }
1741         else {
1742             aSig |= 0x20000000;
1743         }
1744         shift32RightJamming( aSig, - expDiff, &aSig );
1745         zExp = bExp;
1746     }
1747     else {
1748         if ( aExp == 0xFF ) {
1749             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1750             return a;
1751         }
1752         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1753         zSig = 0x40000000 + aSig + bSig;
1754         zExp = aExp;
1755         goto roundAndPack;
1756     }
1757     aSig |= 0x20000000;
1758     zSig = ( aSig + bSig )<<1;
1759     --zExp;
1760     if ( (sbits32) zSig < 0 ) {
1761         zSig = aSig + bSig;
1762         ++zExp;
1763     }
1764  roundAndPack:
1765     return roundAndPackFloat32( zSign, zExp, zSig );
1766
1767 }
1768
1769 /*
1770 -------------------------------------------------------------------------------
1771 Returns the result of subtracting the absolute values of the single-
1772 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1773 difference is negated before being returned.  `zSign' is ignored if the
1774 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1775 Standard for Binary Floating-Point Arithmetic.
1776 -------------------------------------------------------------------------------
1777 */
1778 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1779 {
1780     int16 aExp, bExp, zExp;
1781     bits32 aSig, bSig, zSig;
1782     int16 expDiff;
1783
1784     aSig = extractFloat32Frac( a );
1785     aExp = extractFloat32Exp( a );
1786     bSig = extractFloat32Frac( b );
1787     bExp = extractFloat32Exp( b );
1788     expDiff = aExp - bExp;
1789     aSig <<= 7;
1790     bSig <<= 7;
1791     if ( 0 < expDiff ) goto aExpBigger;
1792     if ( expDiff < 0 ) goto bExpBigger;
1793     if ( aExp == 0xFF ) {
1794         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1795         float_raise( float_flag_invalid );
1796         return float32_default_nan;
1797     }
1798     if ( aExp == 0 ) {
1799         aExp = 1;
1800         bExp = 1;
1801     }
1802     if ( bSig < aSig ) goto aBigger;
1803     if ( aSig < bSig ) goto bBigger;
1804     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1805  bExpBigger:
1806     if ( bExp == 0xFF ) {
1807         if ( bSig ) return propagateFloat32NaN( a, b );
1808         return packFloat32( zSign ^ 1, 0xFF, 0 );
1809     }
1810     if ( aExp == 0 ) {
1811         ++expDiff;
1812     }
1813     else {
1814         aSig |= 0x40000000;
1815     }
1816     shift32RightJamming( aSig, - expDiff, &aSig );
1817     bSig |= 0x40000000;
1818  bBigger:
1819     zSig = bSig - aSig;
1820     zExp = bExp;
1821     zSign ^= 1;
1822     goto normalizeRoundAndPack;
1823  aExpBigger:
1824     if ( aExp == 0xFF ) {
1825         if ( aSig ) return propagateFloat32NaN( a, b );
1826         return a;
1827     }
1828     if ( bExp == 0 ) {
1829         --expDiff;
1830     }
1831     else {
1832         bSig |= 0x40000000;
1833     }
1834     shift32RightJamming( bSig, expDiff, &bSig );
1835     aSig |= 0x40000000;
1836  aBigger:
1837     zSig = aSig - bSig;
1838     zExp = aExp;
1839  normalizeRoundAndPack:
1840     --zExp;
1841     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1842
1843 }
1844
1845 /*
1846 -------------------------------------------------------------------------------
1847 Returns the result of adding the single-precision floating-point values `a'
1848 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1849 Binary Floating-Point Arithmetic.
1850 -------------------------------------------------------------------------------
1851 */
1852 float32 float32_add( float32 a, float32 b )
1853 {
1854     flag aSign, bSign;
1855
1856     aSign = extractFloat32Sign( a );
1857     bSign = extractFloat32Sign( b );
1858     if ( aSign == bSign ) {
1859         return addFloat32Sigs( a, b, aSign );
1860     }
1861     else {
1862         return subFloat32Sigs( a, b, aSign );
1863     }
1864
1865 }
1866
1867 /*
1868 -------------------------------------------------------------------------------
1869 Returns the result of subtracting the single-precision floating-point values
1870 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1871 for Binary Floating-Point Arithmetic.
1872 -------------------------------------------------------------------------------
1873 */
1874 float32 float32_sub( float32 a, float32 b )
1875 {
1876     flag aSign, bSign;
1877
1878     aSign = extractFloat32Sign( a );
1879     bSign = extractFloat32Sign( b );
1880     if ( aSign == bSign ) {
1881         return subFloat32Sigs( a, b, aSign );
1882     }
1883     else {
1884         return addFloat32Sigs( a, b, aSign );
1885     }
1886
1887 }
1888
1889 /*
1890 -------------------------------------------------------------------------------
1891 Returns the result of multiplying the single-precision floating-point values
1892 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1893 for Binary Floating-Point Arithmetic.
1894 -------------------------------------------------------------------------------
1895 */
1896 float32 float32_mul( float32 a, float32 b )
1897 {
1898     flag aSign, bSign, zSign;
1899     int16 aExp, bExp, zExp;
1900     bits32 aSig, bSig;
1901     bits64 zSig64;
1902     bits32 zSig;
1903
1904     aSig = extractFloat32Frac( a );
1905     aExp = extractFloat32Exp( a );
1906     aSign = extractFloat32Sign( a );
1907     bSig = extractFloat32Frac( b );
1908     bExp = extractFloat32Exp( b );
1909     bSign = extractFloat32Sign( b );
1910     zSign = aSign ^ bSign;
1911     if ( aExp == 0xFF ) {
1912         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1913             return propagateFloat32NaN( a, b );
1914         }
1915         if ( ( bExp | bSig ) == 0 ) {
1916             float_raise( float_flag_invalid );
1917             return float32_default_nan;
1918         }
1919         return packFloat32( zSign, 0xFF, 0 );
1920     }
1921     if ( bExp == 0xFF ) {
1922         if ( bSig ) return propagateFloat32NaN( a, b );
1923         if ( ( aExp | aSig ) == 0 ) {
1924             float_raise( float_flag_invalid );
1925             return float32_default_nan;
1926         }
1927         return packFloat32( zSign, 0xFF, 0 );
1928     }
1929     if ( aExp == 0 ) {
1930         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1931         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1932     }
1933     if ( bExp == 0 ) {
1934         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1935         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1936     }
1937     zExp = aExp + bExp - 0x7F;
1938     aSig = ( aSig | 0x00800000 )<<7;
1939     bSig = ( bSig | 0x00800000 )<<8;
1940     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1941     zSig = zSig64;
1942     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1943         zSig <<= 1;
1944         --zExp;
1945     }
1946     return roundAndPackFloat32( zSign, zExp, zSig );
1947
1948 }
1949
1950 /*
1951 -------------------------------------------------------------------------------
1952 Returns the result of dividing the single-precision floating-point value `a'
1953 by the corresponding value `b'.  The operation is performed according to the
1954 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1955 -------------------------------------------------------------------------------
1956 */
1957 float32 float32_div( float32 a, float32 b )
1958 {
1959     flag aSign, bSign, zSign;
1960     int16 aExp, bExp, zExp;
1961     bits32 aSig, bSig, zSig;
1962
1963     aSig = extractFloat32Frac( a );
1964     aExp = extractFloat32Exp( a );
1965     aSign = extractFloat32Sign( a );
1966     bSig = extractFloat32Frac( b );
1967     bExp = extractFloat32Exp( b );
1968     bSign = extractFloat32Sign( b );
1969     zSign = aSign ^ bSign;
1970     if ( aExp == 0xFF ) {
1971         if ( aSig ) return propagateFloat32NaN( a, b );
1972         if ( bExp == 0xFF ) {
1973             if ( bSig ) return propagateFloat32NaN( a, b );
1974             float_raise( float_flag_invalid );
1975             return float32_default_nan;
1976         }
1977         return packFloat32( zSign, 0xFF, 0 );
1978     }
1979     if ( bExp == 0xFF ) {
1980         if ( bSig ) return propagateFloat32NaN( a, b );
1981         return packFloat32( zSign, 0, 0 );
1982     }
1983     if ( bExp == 0 ) {
1984         if ( bSig == 0 ) {
1985             if ( ( aExp | aSig ) == 0 ) {
1986                 float_raise( float_flag_invalid );
1987                 return float32_default_nan;
1988             }
1989             float_raise( float_flag_divbyzero );
1990             return packFloat32( zSign, 0xFF, 0 );
1991         }
1992         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1993     }
1994     if ( aExp == 0 ) {
1995         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1996         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1997     }
1998     zExp = aExp - bExp + 0x7D;
1999     aSig = ( aSig | 0x00800000 )<<7;
2000     bSig = ( bSig | 0x00800000 )<<8;
2001     if ( bSig <= ( aSig + aSig ) ) {
2002         aSig >>= 1;
2003         ++zExp;
2004     }
2005     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2006     if ( ( zSig & 0x3F ) == 0 ) {
2007         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2008     }
2009     return roundAndPackFloat32( zSign, zExp, zSig );
2010
2011 }
2012
2013 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2014 /*
2015 -------------------------------------------------------------------------------
2016 Returns the remainder of the single-precision floating-point value `a'
2017 with respect to the corresponding value `b'.  The operation is performed
2018 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019 -------------------------------------------------------------------------------
2020 */
2021 float32 float32_rem( float32 a, float32 b )
2022 {
2023     flag aSign, bSign, zSign;
2024     int16 aExp, bExp, expDiff;
2025     bits32 aSig, bSig;
2026     bits32 q;
2027     bits64 aSig64, bSig64, q64;
2028     bits32 alternateASig;
2029     sbits32 sigMean;
2030
2031     aSig = extractFloat32Frac( a );
2032     aExp = extractFloat32Exp( a );
2033     aSign = extractFloat32Sign( a );
2034     bSig = extractFloat32Frac( b );
2035     bExp = extractFloat32Exp( b );
2036     bSign = extractFloat32Sign( b );
2037     if ( aExp == 0xFF ) {
2038         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2039             return propagateFloat32NaN( a, b );
2040         }
2041         float_raise( float_flag_invalid );
2042         return float32_default_nan;
2043     }
2044     if ( bExp == 0xFF ) {
2045         if ( bSig ) return propagateFloat32NaN( a, b );
2046         return a;
2047     }
2048     if ( bExp == 0 ) {
2049         if ( bSig == 0 ) {
2050             float_raise( float_flag_invalid );
2051             return float32_default_nan;
2052         }
2053         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2054     }
2055     if ( aExp == 0 ) {
2056         if ( aSig == 0 ) return a;
2057         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2058     }
2059     expDiff = aExp - bExp;
2060     aSig |= 0x00800000;
2061     bSig |= 0x00800000;
2062     if ( expDiff < 32 ) {
2063         aSig <<= 8;
2064         bSig <<= 8;
2065         if ( expDiff < 0 ) {
2066             if ( expDiff < -1 ) return a;
2067             aSig >>= 1;
2068         }
2069         q = ( bSig <= aSig );
2070         if ( q ) aSig -= bSig;
2071         if ( 0 < expDiff ) {
2072             q = ( ( (bits64) aSig )<<32 ) / bSig;
2073             q >>= 32 - expDiff;
2074             bSig >>= 2;
2075             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2076         }
2077         else {
2078             aSig >>= 2;
2079             bSig >>= 2;
2080         }
2081     }
2082     else {
2083         if ( bSig <= aSig ) aSig -= bSig;
2084         aSig64 = ( (bits64) aSig )<<40;
2085         bSig64 = ( (bits64) bSig )<<40;
2086         expDiff -= 64;
2087         while ( 0 < expDiff ) {
2088             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2089             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2090             aSig64 = - ( ( bSig * q64 )<<38 );
2091             expDiff -= 62;
2092         }
2093         expDiff += 64;
2094         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2095         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2096         q = q64>>( 64 - expDiff );
2097         bSig <<= 6;
2098         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2099     }
2100     do {
2101         alternateASig = aSig;
2102         ++q;
2103         aSig -= bSig;
2104     } while ( 0 <= (sbits32) aSig );
2105     sigMean = aSig + alternateASig;
2106     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2107         aSig = alternateASig;
2108     }
2109     zSign = ( (sbits32) aSig < 0 );
2110     if ( zSign ) aSig = - aSig;
2111     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2112
2113 }
2114 #endif /* !SOFTFLOAT_FOR_GCC */
2115
2116 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2117 /*
2118 -------------------------------------------------------------------------------
2119 Returns the square root of the single-precision floating-point value `a'.
2120 The operation is performed according to the IEC/IEEE Standard for Binary
2121 Floating-Point Arithmetic.
2122 -------------------------------------------------------------------------------
2123 */
2124 float32 float32_sqrt( float32 a )
2125 {
2126     flag aSign;
2127     int16 aExp, zExp;
2128     bits32 aSig, zSig;
2129     bits64 rem, term;
2130
2131     aSig = extractFloat32Frac( a );
2132     aExp = extractFloat32Exp( a );
2133     aSign = extractFloat32Sign( a );
2134     if ( aExp == 0xFF ) {
2135         if ( aSig ) return propagateFloat32NaN( a, 0 );
2136         if ( ! aSign ) return a;
2137         float_raise( float_flag_invalid );
2138         return float32_default_nan;
2139     }
2140     if ( aSign ) {
2141         if ( ( aExp | aSig ) == 0 ) return a;
2142         float_raise( float_flag_invalid );
2143         return float32_default_nan;
2144     }
2145     if ( aExp == 0 ) {
2146         if ( aSig == 0 ) return 0;
2147         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2148     }
2149     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2150     aSig = ( aSig | 0x00800000 )<<8;
2151     zSig = estimateSqrt32( aExp, aSig ) + 2;
2152     if ( ( zSig & 0x7F ) <= 5 ) {
2153         if ( zSig < 2 ) {
2154             zSig = 0x7FFFFFFF;
2155             goto roundAndPack;
2156         }
2157         aSig >>= aExp & 1;
2158         term = ( (bits64) zSig ) * zSig;
2159         rem = ( ( (bits64) aSig )<<32 ) - term;
2160         while ( (sbits64) rem < 0 ) {
2161             --zSig;
2162             rem += ( ( (bits64) zSig )<<1 ) | 1;
2163         }
2164         zSig |= ( rem != 0 );
2165     }
2166     shift32RightJamming( zSig, 1, &zSig );
2167  roundAndPack:
2168     return roundAndPackFloat32( 0, zExp, zSig );
2169
2170 }
2171 #endif /* !SOFTFLOAT_FOR_GCC */
2172
2173 /*
2174 -------------------------------------------------------------------------------
2175 Returns 1 if the single-precision floating-point value `a' is equal to
2176 the corresponding value `b', and 0 otherwise.  The comparison is performed
2177 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2178 -------------------------------------------------------------------------------
2179 */
2180 flag float32_eq( float32 a, float32 b )
2181 {
2182
2183     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2184          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2185        ) {
2186         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2187             float_raise( float_flag_invalid );
2188         }
2189         return 0;
2190     }
2191     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2192
2193 }
2194
2195 /*
2196 -------------------------------------------------------------------------------
2197 Returns 1 if the single-precision floating-point value `a' is less than
2198 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2199 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2200 Arithmetic.
2201 -------------------------------------------------------------------------------
2202 */
2203 flag float32_le( float32 a, float32 b )
2204 {
2205     flag aSign, bSign;
2206
2207     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2208          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2209        ) {
2210         float_raise( float_flag_invalid );
2211         return 0;
2212     }
2213     aSign = extractFloat32Sign( a );
2214     bSign = extractFloat32Sign( b );
2215     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2216     return ( a == b ) || ( aSign ^ ( a < b ) );
2217
2218 }
2219
2220 /*
2221 -------------------------------------------------------------------------------
2222 Returns 1 if the single-precision floating-point value `a' is less than
2223 the corresponding value `b', and 0 otherwise.  The comparison is performed
2224 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2225 -------------------------------------------------------------------------------
2226 */
2227 flag float32_lt( float32 a, float32 b )
2228 {
2229     flag aSign, bSign;
2230
2231     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2232          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2233        ) {
2234         float_raise( float_flag_invalid );
2235         return 0;
2236     }
2237     aSign = extractFloat32Sign( a );
2238     bSign = extractFloat32Sign( b );
2239     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2240     return ( a != b ) && ( aSign ^ ( a < b ) );
2241
2242 }
2243
2244 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2245 /*
2246 -------------------------------------------------------------------------------
2247 Returns 1 if the single-precision floating-point value `a' is equal to
2248 the corresponding value `b', and 0 otherwise.  The invalid exception is
2249 raised if either operand is a NaN.  Otherwise, the comparison is performed
2250 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2251 -------------------------------------------------------------------------------
2252 */
2253 flag float32_eq_signaling( float32 a, float32 b )
2254 {
2255
2256     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2257          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2258        ) {
2259         float_raise( float_flag_invalid );
2260         return 0;
2261     }
2262     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2263
2264 }
2265
2266 /*
2267 -------------------------------------------------------------------------------
2268 Returns 1 if the single-precision floating-point value `a' is less than or
2269 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2270 cause an exception.  Otherwise, the comparison is performed according to the
2271 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2272 -------------------------------------------------------------------------------
2273 */
2274 flag float32_le_quiet( float32 a, float32 b )
2275 {
2276     flag aSign, bSign;
2277
2278     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2279          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2280        ) {
2281         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2282             float_raise( float_flag_invalid );
2283         }
2284         return 0;
2285     }
2286     aSign = extractFloat32Sign( a );
2287     bSign = extractFloat32Sign( b );
2288     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2289     return ( a == b ) || ( aSign ^ ( a < b ) );
2290
2291 }
2292
2293 /*
2294 -------------------------------------------------------------------------------
2295 Returns 1 if the single-precision floating-point value `a' is less than
2296 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2297 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2298 Standard for Binary Floating-Point Arithmetic.
2299 -------------------------------------------------------------------------------
2300 */
2301 flag float32_lt_quiet( float32 a, float32 b )
2302 {
2303     flag aSign, bSign;
2304
2305     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2306          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2307        ) {
2308         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2309             float_raise( float_flag_invalid );
2310         }
2311         return 0;
2312     }
2313     aSign = extractFloat32Sign( a );
2314     bSign = extractFloat32Sign( b );
2315     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2316     return ( a != b ) && ( aSign ^ ( a < b ) );
2317
2318 }
2319 #endif /* !SOFTFLOAT_FOR_GCC */
2320
2321 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2322 /*
2323 -------------------------------------------------------------------------------
2324 Returns the result of converting the double-precision floating-point value
2325 `a' to the 32-bit two's complement integer format.  The conversion is
2326 performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 Arithmetic---which means in particular that the conversion is rounded
2328 according to the current rounding mode.  If `a' is a NaN, the largest
2329 positive integer is returned.  Otherwise, if the conversion overflows, the
2330 largest integer with the same sign as `a' is returned.
2331 -------------------------------------------------------------------------------
2332 */
2333 int32 float64_to_int32( float64 a )
2334 {
2335     flag aSign;
2336     int16 aExp, shiftCount;
2337     bits64 aSig;
2338
2339     aSig = extractFloat64Frac( a );
2340     aExp = extractFloat64Exp( a );
2341     aSign = extractFloat64Sign( a );
2342     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2343     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2344     shiftCount = 0x42C - aExp;
2345     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2346     return roundAndPackInt32( aSign, aSig );
2347
2348 }
2349 #endif /* !SOFTFLOAT_FOR_GCC */
2350
2351 /*
2352 -------------------------------------------------------------------------------
2353 Returns the result of converting the double-precision floating-point value
2354 `a' to the 32-bit two's complement integer format.  The conversion is
2355 performed according to the IEC/IEEE Standard for Binary Floating-Point
2356 Arithmetic, except that the conversion is always rounded toward zero.
2357 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2358 the conversion overflows, the largest integer with the same sign as `a' is
2359 returned.
2360 -------------------------------------------------------------------------------
2361 */
2362 int32 float64_to_int32_round_to_zero( float64 a )
2363 {
2364     flag aSign;
2365     int16 aExp, shiftCount;
2366     bits64 aSig, savedASig;
2367     int32 z;
2368
2369     aSig = extractFloat64Frac( a );
2370     aExp = extractFloat64Exp( a );
2371     aSign = extractFloat64Sign( a );
2372     if ( 0x41E < aExp ) {
2373         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2374         goto invalid;
2375     }
2376     else if ( aExp < 0x3FF ) {
2377         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2378         return 0;
2379     }
2380     aSig |= LIT64( 0x0010000000000000 );
2381     shiftCount = 0x433 - aExp;
2382     savedASig = aSig;
2383     aSig >>= shiftCount;
2384     z = aSig;
2385     if ( aSign ) z = - z;
2386     if ( ( z < 0 ) ^ aSign ) {
2387  invalid:
2388         float_raise( float_flag_invalid );
2389         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2390     }
2391     if ( ( aSig<<shiftCount ) != savedASig ) {
2392         float_exception_flags |= float_flag_inexact;
2393     }
2394     return z;
2395
2396 }
2397
2398 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2399 /*
2400 -------------------------------------------------------------------------------
2401 Returns the result of converting the double-precision floating-point value
2402 `a' to the 64-bit two's complement integer format.  The conversion is
2403 performed according to the IEC/IEEE Standard for Binary Floating-Point
2404 Arithmetic---which means in particular that the conversion is rounded
2405 according to the current rounding mode.  If `a' is a NaN, the largest
2406 positive integer is returned.  Otherwise, if the conversion overflows, the
2407 largest integer with the same sign as `a' is returned.
2408 -------------------------------------------------------------------------------
2409 */
2410 int64 float64_to_int64( float64 a )
2411 {
2412     flag aSign;
2413     int16 aExp, shiftCount;
2414     bits64 aSig, aSigExtra;
2415
2416     aSig = extractFloat64Frac( a );
2417     aExp = extractFloat64Exp( a );
2418     aSign = extractFloat64Sign( a );
2419     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2420     shiftCount = 0x433 - aExp;
2421     if ( shiftCount <= 0 ) {
2422         if ( 0x43E < aExp ) {
2423             float_raise( float_flag_invalid );
2424             if (    ! aSign
2425                  || (    ( aExp == 0x7FF )
2426                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2427                ) {
2428                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2429             }
2430             return (sbits64) LIT64( 0x8000000000000000 );
2431         }
2432         aSigExtra = 0;
2433         aSig <<= - shiftCount;
2434     }
2435     else {
2436         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2437     }
2438     return roundAndPackInt64( aSign, aSig, aSigExtra );
2439
2440 }
2441
2442 /*
2443 -------------------------------------------------------------------------------
2444 Returns the result of converting the double-precision floating-point value
2445 `a' to the 64-bit two's complement integer format.  The conversion is
2446 performed according to the IEC/IEEE Standard for Binary Floating-Point
2447 Arithmetic, except that the conversion is always rounded toward zero.
2448 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2449 the conversion overflows, the largest integer with the same sign as `a' is
2450 returned.
2451 -------------------------------------------------------------------------------
2452 */
2453 int64 float64_to_int64_round_to_zero( float64 a )
2454 {
2455     flag aSign;
2456     int16 aExp, shiftCount;
2457     bits64 aSig;
2458     int64 z;
2459
2460     aSig = extractFloat64Frac( a );
2461     aExp = extractFloat64Exp( a );
2462     aSign = extractFloat64Sign( a );
2463     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2464     shiftCount = aExp - 0x433;
2465     if ( 0 <= shiftCount ) {
2466         if ( 0x43E <= aExp ) {
2467             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2468                 float_raise( float_flag_invalid );
2469                 if (    ! aSign
2470                      || (    ( aExp == 0x7FF )
2471                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2472                    ) {
2473                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2474                 }
2475             }
2476             return (sbits64) LIT64( 0x8000000000000000 );
2477         }
2478         z = aSig<<shiftCount;
2479     }
2480     else {
2481         if ( aExp < 0x3FE ) {
2482             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2483             return 0;
2484         }
2485         z = aSig>>( - shiftCount );
2486         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2487             float_exception_flags |= float_flag_inexact;
2488         }
2489     }
2490     if ( aSign ) z = - z;
2491     return z;
2492
2493 }
2494 #endif /* !SOFTFLOAT_FOR_GCC */
2495
2496 /*
2497 -------------------------------------------------------------------------------
2498 Returns the result of converting the double-precision floating-point value
2499 `a' to the single-precision floating-point format.  The conversion is
2500 performed according to the IEC/IEEE Standard for Binary Floating-Point
2501 Arithmetic.
2502 -------------------------------------------------------------------------------
2503 */
2504 float32 float64_to_float32( float64 a )
2505 {
2506     flag aSign;
2507     int16 aExp;
2508     bits64 aSig;
2509     bits32 zSig;
2510
2511     aSig = extractFloat64Frac( a );
2512     aExp = extractFloat64Exp( a );
2513     aSign = extractFloat64Sign( a );
2514     if ( aExp == 0x7FF ) {
2515         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2516         return packFloat32( aSign, 0xFF, 0 );
2517     }
2518     shift64RightJamming( aSig, 22, &aSig );
2519     zSig = aSig;
2520     if ( aExp || zSig ) {
2521         zSig |= 0x40000000;
2522         aExp -= 0x381;
2523     }
2524     return roundAndPackFloat32( aSign, aExp, zSig );
2525
2526 }
2527
2528 #ifdef FLOATX80
2529
2530 /*
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the double-precision floating-point value
2533 `a' to the extended double-precision floating-point format.  The conversion
2534 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2535 Arithmetic.
2536 -------------------------------------------------------------------------------
2537 */
2538 floatx80 float64_to_floatx80( float64 a )
2539 {
2540     flag aSign;
2541     int16 aExp;
2542     bits64 aSig;
2543
2544     aSig = extractFloat64Frac( a );
2545     aExp = extractFloat64Exp( a );
2546     aSign = extractFloat64Sign( a );
2547     if ( aExp == 0x7FF ) {
2548         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2549         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2550     }
2551     if ( aExp == 0 ) {
2552         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2553         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2554     }
2555     return
2556         packFloatx80(
2557             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2558
2559 }
2560
2561 #endif
2562
2563 #ifdef FLOAT128
2564
2565 /*
2566 -------------------------------------------------------------------------------
2567 Returns the result of converting the double-precision floating-point value
2568 `a' to the quadruple-precision floating-point format.  The conversion is
2569 performed according to the IEC/IEEE Standard for Binary Floating-Point
2570 Arithmetic.
2571 -------------------------------------------------------------------------------
2572 */
2573 float128 float64_to_float128( float64 a )
2574 {
2575     flag aSign;
2576     int16 aExp;
2577     bits64 aSig, zSig0, zSig1;
2578
2579     aSig = extractFloat64Frac( a );
2580     aExp = extractFloat64Exp( a );
2581     aSign = extractFloat64Sign( a );
2582     if ( aExp == 0x7FF ) {
2583         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2584         return packFloat128( aSign, 0x7FFF, 0, 0 );
2585     }
2586     if ( aExp == 0 ) {
2587         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2588         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2589         --aExp;
2590     }
2591     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2592     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2593
2594 }
2595
2596 #endif
2597
2598 #ifndef SOFTFLOAT_FOR_GCC
2599 /*
2600 -------------------------------------------------------------------------------
2601 Rounds the double-precision floating-point value `a' to an integer, and
2602 returns the result as a double-precision floating-point value.  The
2603 operation is performed according to the IEC/IEEE Standard for Binary
2604 Floating-Point Arithmetic.
2605 -------------------------------------------------------------------------------
2606 */
2607 float64 float64_round_to_int( float64 a )
2608 {
2609     flag aSign;
2610     int16 aExp;
2611     bits64 lastBitMask, roundBitsMask;
2612     int8 roundingMode;
2613     float64 z;
2614
2615     aExp = extractFloat64Exp( a );
2616     if ( 0x433 <= aExp ) {
2617         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2618             return propagateFloat64NaN( a, a );
2619         }
2620         return a;
2621     }
2622     if ( aExp < 0x3FF ) {
2623         if ( (bits64) ( a<<1 ) == 0 ) return a;
2624         float_exception_flags |= float_flag_inexact;
2625         aSign = extractFloat64Sign( a );
2626         switch ( float_rounding_mode ) {
2627          case float_round_nearest_even:
2628             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2629                 return packFloat64( aSign, 0x3FF, 0 );
2630             }
2631             break;
2632          case float_round_to_zero:
2633             break;
2634          case float_round_down:
2635             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2636          case float_round_up:
2637             return
2638             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2639         }
2640         return packFloat64( aSign, 0, 0 );
2641     }
2642     lastBitMask = 1;
2643     lastBitMask <<= 0x433 - aExp;
2644     roundBitsMask = lastBitMask - 1;
2645     z = a;
2646     roundingMode = float_rounding_mode;
2647     if ( roundingMode == float_round_nearest_even ) {
2648         z += lastBitMask>>1;
2649         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2650     }
2651     else if ( roundingMode != float_round_to_zero ) {
2652         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2653             z += roundBitsMask;
2654         }
2655     }
2656     z &= ~ roundBitsMask;
2657     if ( z != a ) float_exception_flags |= float_flag_inexact;
2658     return z;
2659
2660 }
2661 #endif
2662
2663 /*
2664 -------------------------------------------------------------------------------
2665 Returns the result of adding the absolute values of the double-precision
2666 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2667 before being returned.  `zSign' is ignored if the result is a NaN.
2668 The addition is performed according to the IEC/IEEE Standard for Binary
2669 Floating-Point Arithmetic.
2670 -------------------------------------------------------------------------------
2671 */
2672 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2673 {
2674     int16 aExp, bExp, zExp;
2675     bits64 aSig, bSig, zSig;
2676     int16 expDiff;
2677
2678     aSig = extractFloat64Frac( a );
2679     aExp = extractFloat64Exp( a );
2680     bSig = extractFloat64Frac( b );
2681     bExp = extractFloat64Exp( b );
2682     expDiff = aExp - bExp;
2683     aSig <<= 9;
2684     bSig <<= 9;
2685     if ( 0 < expDiff ) {
2686         if ( aExp == 0x7FF ) {
2687             if ( aSig ) return propagateFloat64NaN( a, b );
2688             return a;
2689         }
2690         if ( bExp == 0 ) {
2691             --expDiff;
2692         }
2693         else {
2694             bSig |= LIT64( 0x2000000000000000 );
2695         }
2696         shift64RightJamming( bSig, expDiff, &bSig );
2697         zExp = aExp;
2698     }
2699     else if ( expDiff < 0 ) {
2700         if ( bExp == 0x7FF ) {
2701             if ( bSig ) return propagateFloat64NaN( a, b );
2702             return packFloat64( zSign, 0x7FF, 0 );
2703         }
2704         if ( aExp == 0 ) {
2705             ++expDiff;
2706         }
2707         else {
2708             aSig |= LIT64( 0x2000000000000000 );
2709         }
2710         shift64RightJamming( aSig, - expDiff, &aSig );
2711         zExp = bExp;
2712     }
2713     else {
2714         if ( aExp == 0x7FF ) {
2715             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2716             return a;
2717         }
2718         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2719         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2720         zExp = aExp;
2721         goto roundAndPack;
2722     }
2723     aSig |= LIT64( 0x2000000000000000 );
2724     zSig = ( aSig + bSig )<<1;
2725     --zExp;
2726     if ( (sbits64) zSig < 0 ) {
2727         zSig = aSig + bSig;
2728         ++zExp;
2729     }
2730  roundAndPack:
2731     return roundAndPackFloat64( zSign, zExp, zSig );
2732
2733 }
2734
2735 /*
2736 -------------------------------------------------------------------------------
2737 Returns the result of subtracting the absolute values of the double-
2738 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2739 difference is negated before being returned.  `zSign' is ignored if the
2740 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2741 Standard for Binary Floating-Point Arithmetic.
2742 -------------------------------------------------------------------------------
2743 */
2744 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2745 {
2746     int16 aExp, bExp, zExp;
2747     bits64 aSig, bSig, zSig;
2748     int16 expDiff;
2749
2750     aSig = extractFloat64Frac( a );
2751     aExp = extractFloat64Exp( a );
2752     bSig = extractFloat64Frac( b );
2753     bExp = extractFloat64Exp( b );
2754     expDiff = aExp - bExp;
2755     aSig <<= 10;
2756     bSig <<= 10;
2757     if ( 0 < expDiff ) goto aExpBigger;
2758     if ( expDiff < 0 ) goto bExpBigger;
2759     if ( aExp == 0x7FF ) {
2760         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2761         float_raise( float_flag_invalid );
2762         return float64_default_nan;
2763     }
2764     if ( aExp == 0 ) {
2765         aExp = 1;
2766         bExp = 1;
2767     }
2768     if ( bSig < aSig ) goto aBigger;
2769     if ( aSig < bSig ) goto bBigger;
2770     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2771  bExpBigger:
2772     if ( bExp == 0x7FF ) {
2773         if ( bSig ) return propagateFloat64NaN( a, b );
2774         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2775     }
2776     if ( aExp == 0 ) {
2777         ++expDiff;
2778     }
2779     else {
2780         aSig |= LIT64( 0x4000000000000000 );
2781     }
2782     shift64RightJamming( aSig, - expDiff, &aSig );
2783     bSig |= LIT64( 0x4000000000000000 );
2784  bBigger:
2785     zSig = bSig - aSig;
2786     zExp = bExp;
2787     zSign ^= 1;
2788     goto normalizeRoundAndPack;
2789  aExpBigger:
2790     if ( aExp == 0x7FF ) {
2791         if ( aSig ) return propagateFloat64NaN( a, b );
2792         return a;
2793     }
2794     if ( bExp == 0 ) {
2795         --expDiff;
2796     }
2797     else {
2798         bSig |= LIT64( 0x4000000000000000 );
2799     }
2800     shift64RightJamming( bSig, expDiff, &bSig );
2801     aSig |= LIT64( 0x4000000000000000 );
2802  aBigger:
2803     zSig = aSig - bSig;
2804     zExp = aExp;
2805  normalizeRoundAndPack:
2806     --zExp;
2807     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2808
2809 }
2810
2811 /*
2812 -------------------------------------------------------------------------------
2813 Returns the result of adding the double-precision floating-point values `a'
2814 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2815 Binary Floating-Point Arithmetic.
2816 -------------------------------------------------------------------------------
2817 */
2818 float64 float64_add( float64 a, float64 b )
2819 {
2820     flag aSign, bSign;
2821
2822     aSign = extractFloat64Sign( a );
2823     bSign = extractFloat64Sign( b );
2824     if ( aSign == bSign ) {
2825         return addFloat64Sigs( a, b, aSign );
2826     }
2827     else {
2828         return subFloat64Sigs( a, b, aSign );
2829     }
2830
2831 }
2832
2833 /*
2834 -------------------------------------------------------------------------------
2835 Returns the result of subtracting the double-precision floating-point values
2836 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2837 for Binary Floating-Point Arithmetic.
2838 -------------------------------------------------------------------------------
2839 */
2840 float64 float64_sub( float64 a, float64 b )
2841 {
2842     flag aSign, bSign;
2843
2844     aSign = extractFloat64Sign( a );
2845     bSign = extractFloat64Sign( b );
2846     if ( aSign == bSign ) {
2847         return subFloat64Sigs( a, b, aSign );
2848     }
2849     else {
2850         return addFloat64Sigs( a, b, aSign );
2851     }
2852
2853 }
2854
2855 /*
2856 -------------------------------------------------------------------------------
2857 Returns the result of multiplying the double-precision floating-point values
2858 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2859 for Binary Floating-Point Arithmetic.
2860 -------------------------------------------------------------------------------
2861 */
2862 float64 float64_mul( float64 a, float64 b )
2863 {
2864     flag aSign, bSign, zSign;
2865     int16 aExp, bExp, zExp;
2866     bits64 aSig, bSig, zSig0, zSig1;
2867
2868     aSig = extractFloat64Frac( a );
2869     aExp = extractFloat64Exp( a );
2870     aSign = extractFloat64Sign( a );
2871     bSig = extractFloat64Frac( b );
2872     bExp = extractFloat64Exp( b );
2873     bSign = extractFloat64Sign( b );
2874     zSign = aSign ^ bSign;
2875     if ( aExp == 0x7FF ) {
2876         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2877             return propagateFloat64NaN( a, b );
2878         }
2879         if ( ( bExp | bSig ) == 0 ) {
2880             float_raise( float_flag_invalid );
2881             return float64_default_nan;
2882         }
2883         return packFloat64( zSign, 0x7FF, 0 );
2884     }
2885     if ( bExp == 0x7FF ) {
2886         if ( bSig ) return propagateFloat64NaN( a, b );
2887         if ( ( aExp | aSig ) == 0 ) {
2888             float_raise( float_flag_invalid );
2889             return float64_default_nan;
2890         }
2891         return packFloat64( zSign, 0x7FF, 0 );
2892     }
2893     if ( aExp == 0 ) {
2894         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2895         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2896     }
2897     if ( bExp == 0 ) {
2898         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2899         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2900     }
2901     zExp = aExp + bExp - 0x3FF;
2902     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2903     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2904     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2905     zSig0 |= ( zSig1 != 0 );
2906     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2907         zSig0 <<= 1;
2908         --zExp;
2909     }
2910     return roundAndPackFloat64( zSign, zExp, zSig0 );
2911
2912 }
2913
2914 /*
2915 -------------------------------------------------------------------------------
2916 Returns the result of dividing the double-precision floating-point value `a'
2917 by the corresponding value `b'.  The operation is performed according to
2918 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2919 -------------------------------------------------------------------------------
2920 */
2921 float64 float64_div( float64 a, float64 b )
2922 {
2923     flag aSign, bSign, zSign;
2924     int16 aExp, bExp, zExp;
2925     bits64 aSig, bSig, zSig;
2926     bits64 rem0, rem1;
2927     bits64 term0, term1;
2928
2929     aSig = extractFloat64Frac( a );
2930     aExp = extractFloat64Exp( a );
2931     aSign = extractFloat64Sign( a );
2932     bSig = extractFloat64Frac( b );
2933     bExp = extractFloat64Exp( b );
2934     bSign = extractFloat64Sign( b );
2935     zSign = aSign ^ bSign;
2936     if ( aExp == 0x7FF ) {
2937         if ( aSig ) return propagateFloat64NaN( a, b );
2938         if ( bExp == 0x7FF ) {
2939             if ( bSig ) return propagateFloat64NaN( a, b );
2940             float_raise( float_flag_invalid );
2941             return float64_default_nan;
2942         }
2943         return packFloat64( zSign, 0x7FF, 0 );
2944     }
2945     if ( bExp == 0x7FF ) {
2946         if ( bSig ) return propagateFloat64NaN( a, b );
2947         return packFloat64( zSign, 0, 0 );
2948     }
2949     if ( bExp == 0 ) {
2950         if ( bSig == 0 ) {
2951             if ( ( aExp | aSig ) == 0 ) {
2952                 float_raise( float_flag_invalid );
2953                 return float64_default_nan;
2954             }
2955             float_raise( float_flag_divbyzero );
2956             return packFloat64( zSign, 0x7FF, 0 );
2957         }
2958         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2959     }
2960     if ( aExp == 0 ) {
2961         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2962         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2963     }
2964     zExp = aExp - bExp + 0x3FD;
2965     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2966     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2967     if ( bSig <= ( aSig + aSig ) ) {
2968         aSig >>= 1;
2969         ++zExp;
2970     }
2971     zSig = estimateDiv128To64( aSig, 0, bSig );
2972     if ( ( zSig & 0x1FF ) <= 2 ) {
2973         mul64To128( bSig, zSig, &term0, &term1 );
2974         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2975         while ( (sbits64) rem0 < 0 ) {
2976             --zSig;
2977             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2978         }
2979         zSig |= ( rem1 != 0 );
2980     }
2981     return roundAndPackFloat64( zSign, zExp, zSig );
2982
2983 }
2984
2985 #ifndef SOFTFLOAT_FOR_GCC
2986 /*
2987 -------------------------------------------------------------------------------
2988 Returns the remainder of the double-precision floating-point value `a'
2989 with respect to the corresponding value `b'.  The operation is performed
2990 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2991 -------------------------------------------------------------------------------
2992 */
2993 float64 float64_rem( float64 a, float64 b )
2994 {
2995     flag aSign, bSign, zSign;
2996     int16 aExp, bExp, expDiff;
2997     bits64 aSig, bSig;
2998     bits64 q, alternateASig;
2999     sbits64 sigMean;
3000
3001     aSig = extractFloat64Frac( a );
3002     aExp = extractFloat64Exp( a );
3003     aSign = extractFloat64Sign( a );
3004     bSig = extractFloat64Frac( b );
3005     bExp = extractFloat64Exp( b );
3006     bSign = extractFloat64Sign( b );
3007     if ( aExp == 0x7FF ) {
3008         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3009             return propagateFloat64NaN( a, b );
3010         }
3011         float_raise( float_flag_invalid );
3012         return float64_default_nan;
3013     }
3014     if ( bExp == 0x7FF ) {
3015         if ( bSig ) return propagateFloat64NaN( a, b );
3016         return a;
3017     }
3018     if ( bExp == 0 ) {
3019         if ( bSig == 0 ) {
3020             float_raise( float_flag_invalid );
3021             return float64_default_nan;
3022         }
3023         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3024     }
3025     if ( aExp == 0 ) {
3026         if ( aSig == 0 ) return a;
3027         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3028     }
3029     expDiff = aExp - bExp;
3030     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3031     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3032     if ( expDiff < 0 ) {
3033         if ( expDiff < -1 ) return a;
3034         aSig >>= 1;
3035     }
3036     q = ( bSig <= aSig );
3037     if ( q ) aSig -= bSig;
3038     expDiff -= 64;
3039     while ( 0 < expDiff ) {
3040         q = estimateDiv128To64( aSig, 0, bSig );
3041         q = ( 2 < q ) ? q - 2 : 0;
3042         aSig = - ( ( bSig>>2 ) * q );
3043         expDiff -= 62;
3044     }
3045     expDiff += 64;
3046     if ( 0 < expDiff ) {
3047         q = estimateDiv128To64( aSig, 0, bSig );
3048         q = ( 2 < q ) ? q - 2 : 0;
3049         q >>= 64 - expDiff;
3050         bSig >>= 2;
3051         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3052     }
3053     else {
3054         aSig >>= 2;
3055         bSig >>= 2;
3056     }
3057     do {
3058         alternateASig = aSig;
3059         ++q;
3060         aSig -= bSig;
3061     } while ( 0 <= (sbits64) aSig );
3062     sigMean = aSig + alternateASig;
3063     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3064         aSig = alternateASig;
3065     }
3066     zSign = ( (sbits64) aSig < 0 );
3067     if ( zSign ) aSig = - aSig;
3068     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3069
3070 }
3071
3072 /*
3073 -------------------------------------------------------------------------------
3074 Returns the square root of the double-precision floating-point value `a'.
3075 The operation is performed according to the IEC/IEEE Standard for Binary
3076 Floating-Point Arithmetic.
3077 -------------------------------------------------------------------------------
3078 */
3079 float64 float64_sqrt( float64 a )
3080 {
3081     flag aSign;
3082     int16 aExp, zExp;
3083     bits64 aSig, zSig, doubleZSig;
3084     bits64 rem0, rem1, term0, term1;
3085
3086     aSig = extractFloat64Frac( a );
3087     aExp = extractFloat64Exp( a );
3088     aSign = extractFloat64Sign( a );
3089     if ( aExp == 0x7FF ) {
3090         if ( aSig ) return propagateFloat64NaN( a, a );
3091         if ( ! aSign ) return a;
3092         float_raise( float_flag_invalid );
3093         return float64_default_nan;
3094     }
3095     if ( aSign ) {
3096         if ( ( aExp | aSig ) == 0 ) return a;
3097         float_raise( float_flag_invalid );
3098         return float64_default_nan;
3099     }
3100     if ( aExp == 0 ) {
3101         if ( aSig == 0 ) return 0;
3102         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3103     }
3104     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3105     aSig |= LIT64( 0x0010000000000000 );
3106     zSig = estimateSqrt32( aExp, aSig>>21 );
3107     aSig <<= 9 - ( aExp & 1 );
3108     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3109     if ( ( zSig & 0x1FF ) <= 5 ) {
3110         doubleZSig = zSig<<1;
3111         mul64To128( zSig, zSig, &term0, &term1 );
3112         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3113         while ( (sbits64) rem0 < 0 ) {
3114             --zSig;
3115             doubleZSig -= 2;
3116             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3117         }
3118         zSig |= ( ( rem0 | rem1 ) != 0 );
3119     }
3120     return roundAndPackFloat64( 0, zExp, zSig );
3121
3122 }
3123 #endif
3124
3125 /*
3126 -------------------------------------------------------------------------------
3127 Returns 1 if the double-precision floating-point value `a' is equal to the
3128 corresponding value `b', and 0 otherwise.  The comparison is performed
3129 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3130 -------------------------------------------------------------------------------
3131 */
3132 flag float64_eq( float64 a, float64 b )
3133 {
3134
3135     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3136          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3137        ) {
3138         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3139             float_raise( float_flag_invalid );
3140         }
3141         return 0;
3142     }
3143     return ( a == b ) ||
3144         ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3145
3146 }
3147
3148 /*
3149 -------------------------------------------------------------------------------
3150 Returns 1 if the double-precision floating-point value `a' is less than or
3151 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3152 performed according to the IEC/IEEE Standard for Binary Floating-Point
3153 Arithmetic.
3154 -------------------------------------------------------------------------------
3155 */
3156 flag float64_le( float64 a, float64 b )
3157 {
3158     flag aSign, bSign;
3159
3160     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3161          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3162        ) {
3163         float_raise( float_flag_invalid );
3164         return 0;
3165     }
3166     aSign = extractFloat64Sign( a );
3167     bSign = extractFloat64Sign( b );
3168     if ( aSign != bSign )
3169         return aSign ||
3170             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3171               0 );
3172     return ( a == b ) ||
3173         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3174
3175 }
3176
3177 /*
3178 -------------------------------------------------------------------------------
3179 Returns 1 if the double-precision floating-point value `a' is less than
3180 the corresponding value `b', and 0 otherwise.  The comparison is performed
3181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3182 -------------------------------------------------------------------------------
3183 */
3184 flag float64_lt( float64 a, float64 b )
3185 {
3186     flag aSign, bSign;
3187
3188     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3189          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3190        ) {
3191         float_raise( float_flag_invalid );
3192         return 0;
3193     }
3194     aSign = extractFloat64Sign( a );
3195     bSign = extractFloat64Sign( b );
3196     if ( aSign != bSign )
3197         return aSign &&
3198             ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3199               0 );
3200     return ( a != b ) &&
3201         ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3202
3203 }
3204
3205 #ifndef SOFTFLOAT_FOR_GCC
3206 /*
3207 -------------------------------------------------------------------------------
3208 Returns 1 if the double-precision floating-point value `a' is equal to the
3209 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3210 if either operand is a NaN.  Otherwise, the comparison is performed
3211 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3212 -------------------------------------------------------------------------------
3213 */
3214 flag float64_eq_signaling( float64 a, float64 b )
3215 {
3216
3217     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3218          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3219        ) {
3220         float_raise( float_flag_invalid );
3221         return 0;
3222     }
3223     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3224
3225 }
3226
3227 /*
3228 -------------------------------------------------------------------------------
3229 Returns 1 if the double-precision floating-point value `a' is less than or
3230 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3231 cause an exception.  Otherwise, the comparison is performed according to the
3232 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3233 -------------------------------------------------------------------------------
3234 */
3235 flag float64_le_quiet( float64 a, float64 b )
3236 {
3237     flag aSign, bSign;
3238
3239     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3240          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3241        ) {
3242         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3243             float_raise( float_flag_invalid );
3244         }
3245         return 0;
3246     }
3247     aSign = extractFloat64Sign( a );
3248     bSign = extractFloat64Sign( b );
3249     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3250     return ( a == b ) || ( aSign ^ ( a < b ) );
3251
3252 }
3253
3254 /*
3255 -------------------------------------------------------------------------------
3256 Returns 1 if the double-precision floating-point value `a' is less than
3257 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3258 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3259 Standard for Binary Floating-Point Arithmetic.
3260 -------------------------------------------------------------------------------
3261 */
3262 flag float64_lt_quiet( float64 a, float64 b )
3263 {
3264     flag aSign, bSign;
3265
3266     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3267          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3268        ) {
3269         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3270             float_raise( float_flag_invalid );
3271         }
3272         return 0;
3273     }
3274     aSign = extractFloat64Sign( a );
3275     bSign = extractFloat64Sign( b );
3276     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3277     return ( a != b ) && ( aSign ^ ( a < b ) );
3278
3279 }
3280 #endif
3281
3282 #ifdef FLOATX80
3283
3284 /*
3285 -------------------------------------------------------------------------------
3286 Returns the result of converting the extended double-precision floating-
3287 point value `a' to the 32-bit two's complement integer format.  The
3288 conversion is performed according to the IEC/IEEE Standard for Binary
3289 Floating-Point Arithmetic---which means in particular that the conversion
3290 is rounded according to the current rounding mode.  If `a' is a NaN, the
3291 largest positive integer is returned.  Otherwise, if the conversion
3292 overflows, the largest integer with the same sign as `a' is returned.
3293 -------------------------------------------------------------------------------
3294 */
3295 int32 floatx80_to_int32( floatx80 a )
3296 {
3297     flag aSign;
3298     int32 aExp, shiftCount;
3299     bits64 aSig;
3300
3301     aSig = extractFloatx80Frac( a );
3302     aExp = extractFloatx80Exp( a );
3303     aSign = extractFloatx80Sign( a );
3304     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3305     shiftCount = 0x4037 - aExp;
3306     if ( shiftCount <= 0 ) shiftCount = 1;
3307     shift64RightJamming( aSig, shiftCount, &aSig );
3308     return roundAndPackInt32( aSign, aSig );
3309
3310 }
3311
3312 /*
3313 -------------------------------------------------------------------------------
3314 Returns the result of converting the extended double-precision floating-
3315 point value `a' to the 32-bit two's complement integer format.  The
3316 conversion is performed according to the IEC/IEEE Standard for Binary
3317 Floating-Point Arithmetic, except that the conversion is always rounded
3318 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3319 Otherwise, if the conversion overflows, the largest integer with the same
3320 sign as `a' is returned.
3321 -------------------------------------------------------------------------------
3322 */
3323 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3324 {
3325     flag aSign;
3326     int32 aExp, shiftCount;
3327     bits64 aSig, savedASig;
3328     int32 z;
3329
3330     aSig = extractFloatx80Frac( a );
3331     aExp = extractFloatx80Exp( a );
3332     aSign = extractFloatx80Sign( a );
3333     if ( 0x401E < aExp ) {
3334         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3335         goto invalid;
3336     }
3337     else if ( aExp < 0x3FFF ) {
3338         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3339         return 0;
3340     }
3341     shiftCount = 0x403E - aExp;
3342     savedASig = aSig;
3343     aSig >>= shiftCount;
3344     z = aSig;
3345     if ( aSign ) z = - z;
3346     if ( ( z < 0 ) ^ aSign ) {
3347  invalid:
3348         float_raise( float_flag_invalid );
3349         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3350     }
3351     if ( ( aSig<<shiftCount ) != savedASig ) {
3352         float_exception_flags |= float_flag_inexact;
3353     }
3354     return z;
3355
3356 }
3357
3358 /*
3359 -------------------------------------------------------------------------------
3360 Returns the result of converting the extended double-precision floating-
3361 point value `a' to the 64-bit two's complement integer format.  The
3362 conversion is performed according to the IEC/IEEE Standard for Binary
3363 Floating-Point Arithmetic---which means in particular that the conversion
3364 is rounded according to the current rounding mode.  If `a' is a NaN,
3365 the largest positive integer is returned.  Otherwise, if the conversion
3366 overflows, the largest integer with the same sign as `a' is returned.
3367 -------------------------------------------------------------------------------
3368 */
3369 int64 floatx80_to_int64( floatx80 a )
3370 {
3371     flag aSign;
3372     int32 aExp, shiftCount;
3373     bits64 aSig, aSigExtra;
3374
3375     aSig = extractFloatx80Frac( a );
3376     aExp = extractFloatx80Exp( a );
3377     aSign = extractFloatx80Sign( a );
3378     shiftCount = 0x403E - aExp;
3379     if ( shiftCount <= 0 ) {
3380         if ( shiftCount ) {
3381             float_raise( float_flag_invalid );
3382             if (    ! aSign
3383                  || (    ( aExp == 0x7FFF )
3384                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3385                ) {
3386                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3387             }
3388             return (sbits64) LIT64( 0x8000000000000000 );
3389         }
3390         aSigExtra = 0;
3391     }
3392     else {
3393         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3394     }
3395     return roundAndPackInt64( aSign, aSig, aSigExtra );
3396
3397 }
3398
3399 /*
3400 -------------------------------------------------------------------------------
3401 Returns the result of converting the extended double-precision floating-
3402 point value `a' to the 64-bit two's complement integer format.  The
3403 conversion is performed according to the IEC/IEEE Standard for Binary
3404 Floating-Point Arithmetic, except that the conversion is always rounded
3405 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3406 Otherwise, if the conversion overflows, the largest integer with the same
3407 sign as `a' is returned.
3408 -------------------------------------------------------------------------------
3409 */
3410 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3411 {
3412     flag aSign;
3413     int32 aExp, shiftCount;
3414     bits64 aSig;
3415     int64 z;
3416
3417     aSig = extractFloatx80Frac( a );
3418     aExp = extractFloatx80Exp( a );
3419     aSign = extractFloatx80Sign( a );
3420     shiftCount = aExp - 0x403E;
3421     if ( 0 <= shiftCount ) {
3422         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3423         if ( ( a.high != 0xC03E ) || aSig ) {
3424             float_raise( float_flag_invalid );
3425             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3426                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3427             }
3428         }
3429         return (sbits64) LIT64( 0x8000000000000000 );
3430     }
3431     else if ( aExp < 0x3FFF ) {
3432         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3433         return 0;
3434     }
3435     z = aSig>>( - shiftCount );
3436     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3437         float_exception_flags |= float_flag_inexact;
3438     }
3439     if ( aSign ) z = - z;
3440     return z;
3441
3442 }
3443
3444 /*
3445 -------------------------------------------------------------------------------
3446 Returns the result of converting the extended double-precision floating-
3447 point value `a' to the single-precision floating-point format.  The
3448 conversion is performed according to the IEC/IEEE Standard for Binary
3449 Floating-Point Arithmetic.
3450 -------------------------------------------------------------------------------
3451 */
3452 float32 floatx80_to_float32( floatx80 a )
3453 {
3454     flag aSign;
3455     int32 aExp;
3456     bits64 aSig;
3457
3458     aSig = extractFloatx80Frac( a );
3459     aExp = extractFloatx80Exp( a );
3460     aSign = extractFloatx80Sign( a );
3461     if ( aExp == 0x7FFF ) {
3462         if ( (bits64) ( aSig<<1 ) ) {
3463             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3464         }
3465         return packFloat32( aSign, 0xFF, 0 );
3466     }
3467     shift64RightJamming( aSig, 33, &aSig );
3468     if ( aExp || aSig ) aExp -= 0x3F81;
3469     return roundAndPackFloat32( aSign, aExp, aSig );
3470
3471 }
3472
3473 /*
3474 -------------------------------------------------------------------------------
3475 Returns the result of converting the extended double-precision floating-
3476 point value `a' to the double-precision floating-point format.  The
3477 conversion is performed according to the IEC/IEEE Standard for Binary
3478 Floating-Point Arithmetic.
3479 -------------------------------------------------------------------------------
3480 */
3481 float64 floatx80_to_float64( floatx80 a )
3482 {
3483     flag aSign;
3484     int32 aExp;
3485     bits64 aSig, zSig;
3486
3487     aSig = extractFloatx80Frac( a );
3488     aExp = extractFloatx80Exp( a );
3489     aSign = extractFloatx80Sign( a );
3490     if ( aExp == 0x7FFF ) {
3491         if ( (bits64) ( aSig<<1 ) ) {
3492             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3493         }
3494         return packFloat64( aSign, 0x7FF, 0 );
3495     }
3496     shift64RightJamming( aSig, 1, &zSig );
3497     if ( aExp || aSig ) aExp -= 0x3C01;
3498     return roundAndPackFloat64( aSign, aExp, zSig );
3499
3500 }
3501
3502 #ifdef FLOAT128
3503
3504 /*
3505 -------------------------------------------------------------------------------
3506 Returns the result of converting the extended double-precision floating-
3507 point value `a' to the quadruple-precision floating-point format.  The
3508 conversion is performed according to the IEC/IEEE Standard for Binary
3509 Floating-Point Arithmetic.
3510 -------------------------------------------------------------------------------
3511 */
3512 float128 floatx80_to_float128( floatx80 a )
3513 {
3514     flag aSign;
3515     int16 aExp;
3516     bits64 aSig, zSig0, zSig1;
3517
3518     aSig = extractFloatx80Frac( a );
3519     aExp = extractFloatx80Exp( a );
3520     aSign = extractFloatx80Sign( a );
3521     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3522         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3523     }
3524     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3525     return packFloat128( aSign, aExp, zSig0, zSig1 );
3526
3527 }
3528
3529 #endif
3530
3531 /*
3532 -------------------------------------------------------------------------------
3533 Rounds the extended double-precision floating-point value `a' to an integer,
3534 and returns the result as an extended quadruple-precision floating-point
3535 value.  The operation is performed according to the IEC/IEEE Standard for
3536 Binary Floating-Point Arithmetic.
3537 -------------------------------------------------------------------------------
3538 */
3539 floatx80 floatx80_round_to_int( floatx80 a )
3540 {
3541     flag aSign;
3542     int32 aExp;
3543     bits64 lastBitMask, roundBitsMask;
3544     int8 roundingMode;
3545     floatx80 z;
3546
3547     aExp = extractFloatx80Exp( a );
3548     if ( 0x403E <= aExp ) {
3549         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3550             return propagateFloatx80NaN( a, a );
3551         }
3552         return a;
3553     }
3554     if ( aExp < 0x3FFF ) {
3555         if (    ( aExp == 0 )
3556              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3557             return a;
3558         }
3559         float_exception_flags |= float_flag_inexact;
3560         aSign = extractFloatx80Sign( a );
3561         switch ( float_rounding_mode ) {
3562          case float_round_nearest_even:
3563             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3564                ) {
3565                 return
3566                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3567             }
3568             break;
3569          case float_round_to_zero:
3570             break;
3571          case float_round_down:
3572             return
3573                   aSign ?
3574                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3575                 : packFloatx80( 0, 0, 0 );
3576          case float_round_up:
3577             return
3578                   aSign ? packFloatx80( 1, 0, 0 )
3579                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3580         }
3581         return packFloatx80( aSign, 0, 0 );
3582     }
3583     lastBitMask = 1;
3584     lastBitMask <<= 0x403E - aExp;
3585     roundBitsMask = lastBitMask - 1;
3586     z = a;
3587     roundingMode = float_rounding_mode;
3588     if ( roundingMode == float_round_nearest_even ) {
3589         z.low += lastBitMask>>1;
3590         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3591     }
3592     else if ( roundingMode != float_round_to_zero ) {
3593         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3594             z.low += roundBitsMask;
3595         }
3596     }
3597     z.low &= ~ roundBitsMask;
3598     if ( z.low == 0 ) {
3599         ++z.high;
3600         z.low = LIT64( 0x8000000000000000 );
3601     }
3602     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3603     return z;
3604
3605 }
3606
3607 /*
3608 -------------------------------------------------------------------------------
3609 Returns the result of adding the absolute values of the extended double-
3610 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3611 negated before being returned.  `zSign' is ignored if the result is a NaN.
3612 The addition is performed according to the IEC/IEEE Standard for Binary
3613 Floating-Point Arithmetic.
3614 -------------------------------------------------------------------------------
3615 */
3616 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3617 {
3618     int32 aExp, bExp, zExp;
3619     bits64 aSig, bSig, zSig0, zSig1;
3620     int32 expDiff;
3621
3622     aSig = extractFloatx80Frac( a );
3623     aExp = extractFloatx80Exp( a );
3624     bSig = extractFloatx80Frac( b );
3625     bExp = extractFloatx80Exp( b );
3626     expDiff = aExp - bExp;
3627     if ( 0 < expDiff ) {
3628         if ( aExp == 0x7FFF ) {
3629             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3630             return a;
3631         }
3632         if ( bExp == 0 ) --expDiff;
3633         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3634         zExp = aExp;
3635     }
3636     else if ( expDiff < 0 ) {
3637         if ( bExp == 0x7FFF ) {
3638             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3639             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3640         }
3641         if ( aExp == 0 ) ++expDiff;
3642         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3643         zExp = bExp;
3644     }
3645     else {
3646         if ( aExp == 0x7FFF ) {
3647             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3648                 return propagateFloatx80NaN( a, b );
3649             }
3650             return a;
3651         }
3652         zSig1 = 0;
3653         zSig0 = aSig + bSig;
3654         if ( aExp == 0 ) {
3655             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3656             goto roundAndPack;
3657         }
3658         zExp = aExp;
3659         goto shiftRight1;
3660     }
3661     zSig0 = aSig + bSig;
3662     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3663  shiftRight1:
3664     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3665     zSig0 |= LIT64( 0x8000000000000000 );
3666     ++zExp;
3667  roundAndPack:
3668     return
3669         roundAndPackFloatx80(
3670             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3671
3672 }
3673
3674 /*
3675 -------------------------------------------------------------------------------
3676 Returns the result of subtracting the absolute values of the extended
3677 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3678 difference is negated before being returned.  `zSign' is ignored if the
3679 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3680 Standard for Binary Floating-Point Arithmetic.
3681 -------------------------------------------------------------------------------
3682 */
3683 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3684 {
3685     int32 aExp, bExp, zExp;
3686     bits64 aSig, bSig, zSig0, zSig1;
3687     int32 expDiff;
3688     floatx80 z;
3689
3690     aSig = extractFloatx80Frac( a );
3691     aExp = extractFloatx80Exp( a );
3692     bSig = extractFloatx80Frac( b );
3693     bExp = extractFloatx80Exp( b );
3694     expDiff = aExp - bExp;
3695     if ( 0 < expDiff ) goto aExpBigger;
3696     if ( expDiff < 0 ) goto bExpBigger;
3697     if ( aExp == 0x7FFF ) {
3698         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3699             return propagateFloatx80NaN( a, b );
3700         }
3701         float_raise( float_flag_invalid );
3702         z.low = floatx80_default_nan_low;
3703         z.high = floatx80_default_nan_high;
3704         return z;
3705     }
3706     if ( aExp == 0 ) {
3707         aExp = 1;
3708         bExp = 1;
3709     }
3710     zSig1 = 0;
3711     if ( bSig < aSig ) goto aBigger;
3712     if ( aSig < bSig ) goto bBigger;
3713     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3714  bExpBigger:
3715     if ( bExp == 0x7FFF ) {
3716         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3717         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3718     }
3719     if ( aExp == 0 ) ++expDiff;
3720     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3721  bBigger:
3722     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3723     zExp = bExp;
3724     zSign ^= 1;
3725     goto normalizeRoundAndPack;
3726  aExpBigger:
3727     if ( aExp == 0x7FFF ) {
3728         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3729         return a;
3730     }
3731     if ( bExp == 0 ) --expDiff;
3732     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3733  aBigger:
3734     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3735     zExp = aExp;
3736  normalizeRoundAndPack:
3737     return
3738         normalizeRoundAndPackFloatx80(
3739             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3740
3741 }
3742
3743 /*
3744 -------------------------------------------------------------------------------
3745 Returns the result of adding the extended double-precision floating-point
3746 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3747 Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3749 */
3750 floatx80 floatx80_add( floatx80 a, floatx80 b )
3751 {
3752     flag aSign, bSign;
3753
3754     aSign = extractFloatx80Sign( a );
3755     bSign = extractFloatx80Sign( b );
3756     if ( aSign == bSign ) {
3757         return addFloatx80Sigs( a, b, aSign );
3758     }
3759     else {
3760         return subFloatx80Sigs( a, b, aSign );
3761     }
3762
3763 }
3764
3765 /*
3766 -------------------------------------------------------------------------------
3767 Returns the result of subtracting the extended double-precision floating-
3768 point values `a' and `b'.  The operation is performed according to the
3769 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3770 -------------------------------------------------------------------------------
3771 */
3772 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3773 {
3774     flag aSign, bSign;
3775
3776     aSign = extractFloatx80Sign( a );
3777     bSign = extractFloatx80Sign( b );
3778     if ( aSign == bSign ) {
3779         return subFloatx80Sigs( a, b, aSign );
3780     }
3781     else {
3782         return addFloatx80Sigs( a, b, aSign );
3783     }
3784
3785 }
3786
3787 /*
3788 -------------------------------------------------------------------------------
3789 Returns the result of multiplying the extended double-precision floating-
3790 point values `a' and `b'.  The operation is performed according to the
3791 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3792 -------------------------------------------------------------------------------
3793 */
3794 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3795 {
3796     flag aSign, bSign, zSign;
3797     int32 aExp, bExp, zExp;
3798     bits64 aSig, bSig, zSig0, zSig1;
3799     floatx80 z;
3800
3801     aSig = extractFloatx80Frac( a );
3802     aExp = extractFloatx80Exp( a );
3803     aSign = extractFloatx80Sign( a );
3804     bSig = extractFloatx80Frac( b );
3805     bExp = extractFloatx80Exp( b );
3806     bSign = extractFloatx80Sign( b );
3807     zSign = aSign ^ bSign;
3808     if ( aExp == 0x7FFF ) {
3809         if (    (bits64) ( aSig<<1 )
3810              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3811             return propagateFloatx80NaN( a, b );
3812         }
3813         if ( ( bExp | bSig ) == 0 ) goto invalid;
3814         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3815     }
3816     if ( bExp == 0x7FFF ) {
3817         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3818         if ( ( aExp | aSig ) == 0 ) {
3819  invalid:
3820             float_raise( float_flag_invalid );
3821             z.low = floatx80_default_nan_low;
3822             z.high = floatx80_default_nan_high;
3823             return z;
3824         }
3825         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3826     }
3827     if ( aExp == 0 ) {
3828         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3829         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3830     }
3831     if ( bExp == 0 ) {
3832         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3833         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3834     }
3835     zExp = aExp + bExp - 0x3FFE;
3836     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3837     if ( 0 < (sbits64) zSig0 ) {
3838         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3839         --zExp;
3840     }
3841     return
3842         roundAndPackFloatx80(
3843             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3844
3845 }
3846
3847 /*
3848 -------------------------------------------------------------------------------
3849 Returns the result of dividing the extended double-precision floating-point
3850 value `a' by the corresponding value `b'.  The operation is performed
3851 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3852 -------------------------------------------------------------------------------
3853 */
3854 floatx80 floatx80_div( floatx80 a, floatx80 b )
3855 {
3856     flag aSign, bSign, zSign;
3857     int32 aExp, bExp, zExp;
3858     bits64 aSig, bSig, zSig0, zSig1;
3859     bits64 rem0, rem1, rem2, term0, term1, term2;
3860     floatx80 z;
3861
3862     aSig = extractFloatx80Frac( a );
3863     aExp = extractFloatx80Exp( a );
3864     aSign = extractFloatx80Sign( a );
3865     bSig = extractFloatx80Frac( b );
3866     bExp = extractFloatx80Exp( b );
3867     bSign = extractFloatx80Sign( b );
3868     zSign = aSign ^ bSign;
3869     if ( aExp == 0x7FFF ) {
3870         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3871         if ( bExp == 0x7FFF ) {
3872             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3873             goto invalid;
3874         }
3875         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3876     }
3877     if ( bExp == 0x7FFF ) {
3878         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3879         return packFloatx80( zSign, 0, 0 );
3880     }
3881     if ( bExp == 0 ) {
3882         if ( bSig == 0 ) {
3883             if ( ( aExp | aSig ) == 0 ) {
3884  invalid:
3885                 float_raise( float_flag_invalid );
3886                 z.low = floatx80_default_nan_low;
3887                 z.high = floatx80_default_nan_high;
3888                 return z;
3889             }
3890             float_raise( float_flag_divbyzero );
3891             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3892         }
3893         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3894     }
3895     if ( aExp == 0 ) {
3896         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3897         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3898     }
3899     zExp = aExp - bExp + 0x3FFE;
3900     rem1 = 0;
3901     if ( bSig <= aSig ) {
3902         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3903         ++zExp;
3904     }
3905     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3906     mul64To128( bSig, zSig0, &term0, &term1 );
3907     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3908     while ( (sbits64) rem0 < 0 ) {
3909         --zSig0;
3910         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3911     }
3912     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3913     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3914         mul64To128( bSig, zSig1, &term1, &term2 );
3915         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3916         while ( (sbits64) rem1 < 0 ) {
3917             --zSig1;
3918             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3919         }
3920         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3921     }
3922     return
3923         roundAndPackFloatx80(
3924             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3925
3926 }
3927
3928 /*
3929 -------------------------------------------------------------------------------
3930 Returns the remainder of the extended double-precision floating-point value
3931 `a' with respect to the corresponding value `b'.  The operation is performed
3932 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3933 -------------------------------------------------------------------------------
3934 */
3935 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3936 {
3937     flag aSign, bSign, zSign;
3938     int32 aExp, bExp, expDiff;
3939     bits64 aSig0, aSig1, bSig;
3940     bits64 q, term0, term1, alternateASig0, alternateASig1;
3941     floatx80 z;
3942
3943     aSig0 = extractFloatx80Frac( a );
3944     aExp = extractFloatx80Exp( a );
3945     aSign = extractFloatx80Sign( a );
3946     bSig = extractFloatx80Frac( b );
3947     bExp = extractFloatx80Exp( b );
3948     bSign = extractFloatx80Sign( b );
3949     if ( aExp == 0x7FFF ) {
3950         if (    (bits64) ( aSig0<<1 )
3951              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3952             return propagateFloatx80NaN( a, b );
3953         }
3954         goto invalid;
3955     }
3956     if ( bExp == 0x7FFF ) {
3957         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3958         return a;
3959     }
3960     if ( bExp == 0 ) {
3961         if ( bSig == 0 ) {
3962  invalid:
3963             float_raise( float_flag_invalid );
3964             z.low = floatx80_default_nan_low;
3965             z.high = floatx80_default_nan_high;
3966             return z;
3967         }
3968         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3969     }
3970     if ( aExp == 0 ) {
3971         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3972         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3973     }
3974     bSig |= LIT64( 0x8000000000000000 );
3975     zSign = aSign;
3976     expDiff = aExp - bExp;
3977     aSig1 = 0;
3978     if ( expDiff < 0 ) {
3979         if ( expDiff < -1 ) return a;
3980         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3981         expDiff = 0;
3982     }
3983     q = ( bSig <= aSig0 );
3984     if ( q ) aSig0 -= bSig;
3985     expDiff -= 64;
3986     while ( 0 < expDiff ) {
3987         q = estimateDiv128To64( aSig0, aSig1, bSig );
3988         q = ( 2 < q ) ? q - 2 : 0;
3989         mul64To128( bSig, q, &term0, &term1 );
3990         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3991         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3992         expDiff -= 62;
3993     }
3994     expDiff += 64;
3995     if ( 0 < expDiff ) {
3996         q = estimateDiv128To64( aSig0, aSig1, bSig );
3997         q = ( 2 < q ) ? q - 2 : 0;
3998         q >>= 64 - expDiff;
3999         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4000         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4001         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4002         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4003             ++q;
4004             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4005         }
4006     }
4007     else {
4008         term1 = 0;
4009         term0 = bSig;
4010     }
4011     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4012     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4013          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4014               && ( q & 1 ) )
4015        ) {
4016         aSig0 = alternateASig0;
4017         aSig1 = alternateASig1;
4018         zSign = ! zSign;
4019     }
4020     return
4021         normalizeRoundAndPackFloatx80(
4022             80, zSign, bExp + expDiff, aSig0, aSig1 );
4023
4024 }
4025
4026 /*
4027 -------------------------------------------------------------------------------
4028 Returns the square root of the extended double-precision floating-point
4029 value `a'.  The operation is performed according to the IEC/IEEE Standard
4030 for Binary Floating-Point Arithmetic.
4031 -------------------------------------------------------------------------------
4032 */
4033 floatx80 floatx80_sqrt( floatx80 a )
4034 {
4035     flag aSign;
4036     int32 aExp, zExp;
4037     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4038     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4039     floatx80 z;
4040
4041     aSig0 = extractFloatx80Frac( a );
4042     aExp = extractFloatx80Exp( a );
4043     aSign = extractFloatx80Sign( a );
4044     if ( aExp == 0x7FFF ) {
4045         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4046         if ( ! aSign ) return a;
4047         goto invalid;
4048     }
4049     if ( aSign ) {
4050         if ( ( aExp | aSig0 ) == 0 ) return a;
4051  invalid:
4052         float_raise( float_flag_invalid );
4053         z.low = floatx80_default_nan_low;
4054         z.high = floatx80_default_nan_high;
4055         return z;
4056     }
4057     if ( aExp == 0 ) {
4058         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4059         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4060     }
4061     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4062     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4063     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4064     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4065     doubleZSig0 = zSig0<<1;
4066     mul64To128( zSig0, zSig0, &term0, &term1 );
4067     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4068     while ( (sbits64) rem0 < 0 ) {
4069         --zSig0;
4070         doubleZSig0 -= 2;
4071         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4072     }
4073     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4074     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4075         if ( zSig1 == 0 ) zSig1 = 1;
4076         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4077         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4078         mul64To128( zSig1, zSig1, &term2, &term3 );
4079         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4080         while ( (sbits64) rem1 < 0 ) {
4081             --zSig1;
4082             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4083             term3 |= 1;
4084             term2 |= doubleZSig0;
4085             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4086         }
4087         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4088     }
4089     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4090     zSig0 |= doubleZSig0;
4091     return
4092         roundAndPackFloatx80(
4093             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4094
4095 }
4096
4097 /*
4098 -------------------------------------------------------------------------------
4099 Returns 1 if the extended double-precision floating-point value `a' is
4100 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4101 performed according to the IEC/IEEE Standard for Binary Floating-Point
4102 Arithmetic.
4103 -------------------------------------------------------------------------------
4104 */
4105 flag floatx80_eq( floatx80 a, floatx80 b )
4106 {
4107
4108     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4109               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4110          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4111               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4112        ) {
4113         if (    floatx80_is_signaling_nan( a )
4114              || floatx80_is_signaling_nan( b ) ) {
4115             float_raise( float_flag_invalid );
4116         }
4117         return 0;
4118     }
4119     return
4120            ( a.low == b.low )
4121         && (    ( a.high == b.high )
4122              || (    ( a.low == 0 )
4123                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4124            );
4125
4126 }
4127
4128 /*
4129 -------------------------------------------------------------------------------
4130 Returns 1 if the extended double-precision floating-point value `a' is
4131 less than or equal to the corresponding value `b', and 0 otherwise.  The
4132 comparison is performed according to the IEC/IEEE Standard for Binary
4133 Floating-Point Arithmetic.
4134 -------------------------------------------------------------------------------
4135 */
4136 flag floatx80_le( floatx80 a, floatx80 b )
4137 {
4138     flag aSign, bSign;
4139
4140     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4141               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4142          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4143               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4144        ) {
4145         float_raise( float_flag_invalid );
4146         return 0;
4147     }
4148     aSign = extractFloatx80Sign( a );
4149     bSign = extractFloatx80Sign( b );
4150     if ( aSign != bSign ) {
4151         return
4152                aSign
4153             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4154                  == 0 );
4155     }
4156     return
4157           aSign ? le128( b.high, b.low, a.high, a.low )
4158         : le128( a.high, a.low, b.high, b.low );
4159
4160 }
4161
4162 /*
4163 -------------------------------------------------------------------------------
4164 Returns 1 if the extended double-precision floating-point value `a' is
4165 less than the corresponding value `b', and 0 otherwise.  The comparison
4166 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4167 Arithmetic.
4168 -------------------------------------------------------------------------------
4169 */
4170 flag floatx80_lt( floatx80 a, floatx80 b )
4171 {
4172     flag aSign, bSign;
4173
4174     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4175               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4176          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4177               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4178        ) {
4179         float_raise( float_flag_invalid );
4180         return 0;
4181     }
4182     aSign = extractFloatx80Sign( a );
4183     bSign = extractFloatx80Sign( b );
4184     if ( aSign != bSign ) {
4185         return
4186                aSign
4187             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4188                  != 0 );
4189     }
4190     return
4191           aSign ? lt128( b.high, b.low, a.high, a.low )
4192         : lt128( a.high, a.low, b.high, b.low );
4193
4194 }
4195
4196 /*
4197 -------------------------------------------------------------------------------
4198 Returns 1 if the extended double-precision floating-point value `a' is equal
4199 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4200 raised if either operand is a NaN.  Otherwise, the comparison is performed
4201 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4202 -------------------------------------------------------------------------------
4203 */
4204 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4205 {
4206
4207     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4208               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4209          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4210               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4211        ) {
4212         float_raise( float_flag_invalid );
4213         return 0;
4214     }
4215     return
4216            ( a.low == b.low )
4217         && (    ( a.high == b.high )
4218              || (    ( a.low == 0 )
4219                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4220            );
4221
4222 }
4223
4224 /*
4225 -------------------------------------------------------------------------------
4226 Returns 1 if the extended double-precision floating-point value `a' is less
4227 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4228 do not cause an exception.  Otherwise, the comparison is performed according
4229 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4230 -------------------------------------------------------------------------------
4231 */
4232 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4233 {
4234     flag aSign, bSign;
4235
4236     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4237               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4238          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4239               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4240        ) {
4241         if (    floatx80_is_signaling_nan( a )
4242              || floatx80_is_signaling_nan( b ) ) {
4243             float_raise( float_flag_invalid );
4244         }
4245         return 0;
4246     }
4247     aSign = extractFloatx80Sign( a );
4248     bSign = extractFloatx80Sign( b );
4249     if ( aSign != bSign ) {
4250         return
4251                aSign
4252             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4253                  == 0 );
4254     }
4255     return
4256           aSign ? le128( b.high, b.low, a.high, a.low )
4257         : le128( a.high, a.low, b.high, b.low );
4258
4259 }
4260
4261 /*
4262 -------------------------------------------------------------------------------
4263 Returns 1 if the extended double-precision floating-point value `a' is less
4264 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4265 an exception.  Otherwise, the comparison is performed according to the
4266 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4267 -------------------------------------------------------------------------------
4268 */
4269 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4270 {
4271     flag aSign, bSign;
4272
4273     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4274               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4275          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4276               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4277        ) {
4278         if (    floatx80_is_signaling_nan( a )
4279              || floatx80_is_signaling_nan( b ) ) {
4280             float_raise( float_flag_invalid );
4281         }
4282         return 0;
4283     }
4284     aSign = extractFloatx80Sign( a );
4285     bSign = extractFloatx80Sign( b );
4286     if ( aSign != bSign ) {
4287         return
4288                aSign
4289             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4290                  != 0 );
4291     }
4292     return
4293           aSign ? lt128( b.high, b.low, a.high, a.low )
4294         : lt128( a.high, a.low, b.high, b.low );
4295
4296 }
4297
4298 #endif
4299
4300 #ifdef FLOAT128
4301
4302 /*
4303 -------------------------------------------------------------------------------
4304 Returns the result of converting the quadruple-precision floating-point
4305 value `a' to the 32-bit two's complement integer format.  The conversion
4306 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4307 Arithmetic---which means in particular that the conversion is rounded
4308 according to the current rounding mode.  If `a' is a NaN, the largest
4309 positive integer is returned.  Otherwise, if the conversion overflows, the
4310 largest integer with the same sign as `a' is returned.
4311 -------------------------------------------------------------------------------
4312 */
4313 int32 float128_to_int32( float128 a )
4314 {
4315     flag aSign;
4316     int32 aExp, shiftCount;
4317     bits64 aSig0, aSig1;
4318
4319     aSig1 = extractFloat128Frac1( a );
4320     aSig0 = extractFloat128Frac0( a );
4321     aExp = extractFloat128Exp( a );
4322     aSign = extractFloat128Sign( a );
4323     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4324     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4325     aSig0 |= ( aSig1 != 0 );
4326     shiftCount = 0x4028 - aExp;
4327     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4328     return roundAndPackInt32( aSign, aSig0 );
4329
4330 }
4331
4332 /*
4333 -------------------------------------------------------------------------------
4334 Returns the result of converting the quadruple-precision floating-point
4335 value `a' to the 32-bit two's complement integer format.  The conversion
4336 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337 Arithmetic, except that the conversion is always rounded toward zero.  If
4338 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4339 conversion overflows, the largest integer with the same sign as `a' is
4340 returned.
4341 -------------------------------------------------------------------------------
4342 */
4343 int32 float128_to_int32_round_to_zero( float128 a )
4344 {
4345     flag aSign;
4346     int32 aExp, shiftCount;
4347     bits64 aSig0, aSig1, savedASig;
4348     int32 z;
4349
4350     aSig1 = extractFloat128Frac1( a );
4351     aSig0 = extractFloat128Frac0( a );
4352     aExp = extractFloat128Exp( a );
4353     aSign = extractFloat128Sign( a );
4354     aSig0 |= ( aSig1 != 0 );
4355     if ( 0x401E < aExp ) {
4356         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4357         goto invalid;
4358     }
4359     else if ( aExp < 0x3FFF ) {
4360         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4361         return 0;
4362     }
4363     aSig0 |= LIT64( 0x0001000000000000 );
4364     shiftCount = 0x402F - aExp;
4365     savedASig = aSig0;
4366     aSig0 >>= shiftCount;
4367     z = aSig0;
4368     if ( aSign ) z = - z;
4369     if ( ( z < 0 ) ^ aSign ) {
4370  invalid:
4371         float_raise( float_flag_invalid );
4372         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4373     }
4374     if ( ( aSig0<<shiftCount ) != savedASig ) {
4375         float_exception_flags |= float_flag_inexact;
4376     }
4377     return z;
4378
4379 }
4380
4381 /*
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format.  The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic---which means in particular that the conversion is rounded
4387 according to the current rounding mode.  If `a' is a NaN, the largest
4388 positive integer is returned.  Otherwise, if the conversion overflows, the
4389 largest integer with the same sign as `a' is returned.
4390 -------------------------------------------------------------------------------
4391 */
4392 int64 float128_to_int64( float128 a )
4393 {
4394     flag aSign;
4395     int32 aExp, shiftCount;
4396     bits64 aSig0, aSig1;
4397
4398     aSig1 = extractFloat128Frac1( a );
4399     aSig0 = extractFloat128Frac0( a );
4400     aExp = extractFloat128Exp( a );
4401     aSign = extractFloat128Sign( a );
4402     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4403     shiftCount = 0x402F - aExp;
4404     if ( shiftCount <= 0 ) {
4405         if ( 0x403E < aExp ) {
4406             float_raise( float_flag_invalid );
4407             if (    ! aSign
4408                  || (    ( aExp == 0x7FFF )
4409                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4410                     )
4411                ) {
4412                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4413             }
4414             return (sbits64) LIT64( 0x8000000000000000 );
4415         }
4416         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4417     }
4418     else {
4419         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4420     }
4421     return roundAndPackInt64( aSign, aSig0, aSig1 );
4422
4423 }
4424
4425 /*
4426 -------------------------------------------------------------------------------
4427 Returns the result of converting the quadruple-precision floating-point
4428 value `a' to the 64-bit two's complement integer format.  The conversion
4429 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4430 Arithmetic, except that the conversion is always rounded toward zero.
4431 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4432 the conversion overflows, the largest integer with the same sign as `a' is
4433 returned.
4434 -------------------------------------------------------------------------------
4435 */
4436 int64 float128_to_int64_round_to_zero( float128 a )
4437 {
4438     flag aSign;
4439     int32 aExp, shiftCount;
4440     bits64 aSig0, aSig1;
4441     int64 z;
4442
4443     aSig1 = extractFloat128Frac1( a );
4444     aSig0 = extractFloat128Frac0( a );
4445     aExp = extractFloat128Exp( a );
4446     aSign = extractFloat128Sign( a );
4447     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4448     shiftCount = aExp - 0x402F;
4449     if ( 0 < shiftCount ) {
4450         if ( 0x403E <= aExp ) {
4451             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4452             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4453                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4454                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4455             }
4456             else {
4457                 float_raise( float_flag_invalid );
4458                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4459                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4460                 }
4461             }
4462             return (sbits64) LIT64( 0x8000000000000000 );
4463         }
4464         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4465         if ( (bits64) ( aSig1<<shiftCount ) ) {
4466             float_exception_flags |= float_flag_inexact;
4467         }
4468     }
4469     else {
4470         if ( aExp < 0x3FFF ) {
4471             if ( aExp | aSig0 | aSig1 ) {
4472                 float_exception_flags |= float_flag_inexact;
4473             }
4474             return 0;
4475         }
4476         z = aSig0>>( - shiftCount );
4477         if (    aSig1
4478              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4479             float_exception_flags |= float_flag_inexact;
4480         }
4481     }
4482     if ( aSign ) z = - z;
4483     return z;
4484
4485 }
4486
4487 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4488     && defined(SOFTFLOAT_NEED_FIXUNS)
4489 /*
4490  * just like above - but do not care for overflow of signed results
4491  */
4492 uint64 float128_to_uint64_round_to_zero( float128 a )
4493 {
4494     flag aSign;
4495     int32 aExp, shiftCount;
4496     bits64 aSig0, aSig1;
4497     uint64 z;
4498
4499     aSig1 = extractFloat128Frac1( a );
4500     aSig0 = extractFloat128Frac0( a );
4501     aExp = extractFloat128Exp( a );
4502     aSign = extractFloat128Sign( a );
4503     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4504     shiftCount = aExp - 0x402F;
4505     if ( 0 < shiftCount ) {
4506         if ( 0x403F <= aExp ) {
4507             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4508             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4509                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4510                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4511             }
4512             else {
4513                 float_raise( float_flag_invalid );
4514             }
4515             return LIT64( 0xFFFFFFFFFFFFFFFF );
4516         }
4517         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4518         if ( (bits64) ( aSig1<<shiftCount ) ) {
4519             float_exception_flags |= float_flag_inexact;
4520         }
4521     }
4522     else {
4523         if ( aExp < 0x3FFF ) {
4524             if ( aExp | aSig0 | aSig1 ) {
4525                 float_exception_flags |= float_flag_inexact;
4526             }
4527             return 0;
4528         }
4529         z = aSig0>>( - shiftCount );
4530         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4531             float_exception_flags |= float_flag_inexact;
4532         }
4533     }
4534     if ( aSign ) z = - z;
4535     return z;
4536
4537 }
4538 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4539
4540 /*
4541 -------------------------------------------------------------------------------
4542 Returns the result of converting the quadruple-precision floating-point
4543 value `a' to the single-precision floating-point format.  The conversion
4544 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4545 Arithmetic.
4546 -------------------------------------------------------------------------------
4547 */
4548 float32 float128_to_float32( float128 a )
4549 {
4550     flag aSign;
4551     int32 aExp;
4552     bits64 aSig0, aSig1;
4553     bits32 zSig;
4554
4555     aSig1 = extractFloat128Frac1( a );
4556     aSig0 = extractFloat128Frac0( a );
4557     aExp = extractFloat128Exp( a );
4558     aSign = extractFloat128Sign( a );
4559     if ( aExp == 0x7FFF ) {
4560         if ( aSig0 | aSig1 ) {
4561             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4562         }
4563         return packFloat32( aSign, 0xFF, 0 );
4564     }
4565     aSig0 |= ( aSig1 != 0 );
4566     shift64RightJamming( aSig0, 18, &aSig0 );
4567     zSig = aSig0;
4568     if ( aExp || zSig ) {
4569         zSig |= 0x40000000;
4570         aExp -= 0x3F81;
4571     }
4572     return roundAndPackFloat32( aSign, aExp, zSig );
4573
4574 }
4575
4576 /*
4577 -------------------------------------------------------------------------------
4578 Returns the result of converting the quadruple-precision floating-point
4579 value `a' to the double-precision floating-point format.  The conversion
4580 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4581 Arithmetic.
4582 -------------------------------------------------------------------------------
4583 */
4584 float64 float128_to_float64( float128 a )
4585 {
4586     flag aSign;
4587     int32 aExp;
4588     bits64 aSig0, aSig1;
4589
4590     aSig1 = extractFloat128Frac1( a );
4591     aSig0 = extractFloat128Frac0( a );
4592     aExp = extractFloat128Exp( a );
4593     aSign = extractFloat128Sign( a );
4594     if ( aExp == 0x7FFF ) {
4595         if ( aSig0 | aSig1 ) {
4596             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4597         }
4598         return packFloat64( aSign, 0x7FF, 0 );
4599     }
4600     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4601     aSig0 |= ( aSig1 != 0 );
4602     if ( aExp || aSig0 ) {
4603         aSig0 |= LIT64( 0x4000000000000000 );
4604         aExp -= 0x3C01;
4605     }
4606     return roundAndPackFloat64( aSign, aExp, aSig0 );
4607
4608 }
4609
4610 #ifdef FLOATX80
4611
4612 /*
4613 -------------------------------------------------------------------------------
4614 Returns the result of converting the quadruple-precision floating-point
4615 value `a' to the extended double-precision floating-point format.  The
4616 conversion is performed according to the IEC/IEEE Standard for Binary
4617 Floating-Point Arithmetic.
4618 -------------------------------------------------------------------------------
4619 */
4620 floatx80 float128_to_floatx80( float128 a )
4621 {
4622     flag aSign;
4623     int32 aExp;
4624     bits64 aSig0, aSig1;
4625
4626     aSig1 = extractFloat128Frac1( a );
4627     aSig0 = extractFloat128Frac0( a );
4628     aExp = extractFloat128Exp( a );
4629     aSign = extractFloat128Sign( a );
4630     if ( aExp == 0x7FFF ) {
4631         if ( aSig0 | aSig1 ) {
4632             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4633         }
4634         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4635     }
4636     if ( aExp == 0 ) {
4637         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4638         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4639     }
4640     else {
4641         aSig0 |= LIT64( 0x0001000000000000 );
4642     }
4643     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4644     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4645
4646 }
4647
4648 #endif
4649
4650 /*
4651 -------------------------------------------------------------------------------
4652 Rounds the quadruple-precision floating-point value `a' to an integer, and
4653 returns the result as a quadruple-precision floating-point value.  The
4654 operation is performed according to the IEC/IEEE Standard for Binary
4655 Floating-Point Arithmetic.
4656 -------------------------------------------------------------------------------
4657 */
4658 float128 float128_round_to_int( float128 a )
4659 {
4660     flag aSign;
4661     int32 aExp;
4662     bits64 lastBitMask, roundBitsMask;
4663     int8 roundingMode;
4664     float128 z;
4665
4666     aExp = extractFloat128Exp( a );
4667     if ( 0x402F <= aExp ) {
4668         if ( 0x406F <= aExp ) {
4669             if (    ( aExp == 0x7FFF )
4670                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4671                ) {
4672                 return propagateFloat128NaN( a, a );
4673             }
4674             return a;
4675         }
4676         lastBitMask = 1;
4677         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4678         roundBitsMask = lastBitMask - 1;
4679         z = a;
4680         roundingMode = float_rounding_mode;
4681         if ( roundingMode == float_round_nearest_even ) {
4682             if ( lastBitMask ) {
4683                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4684                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4685             }
4686             else {
4687                 if ( (sbits64) z.low < 0 ) {
4688                     ++z.high;
4689                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4690                 }
4691             }
4692         }
4693         else if ( roundingMode != float_round_to_zero ) {
4694             if (   extractFloat128Sign( z )
4695                  ^ ( roundingMode == float_round_up ) ) {
4696                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4697             }
4698         }
4699         z.low &= ~ roundBitsMask;
4700     }
4701     else {
4702         if ( aExp < 0x3FFF ) {
4703             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4704             float_exception_flags |= float_flag_inexact;
4705             aSign = extractFloat128Sign( a );
4706             switch ( float_rounding_mode ) {
4707              case float_round_nearest_even:
4708                 if (    ( aExp == 0x3FFE )
4709                      && (   extractFloat128Frac0( a )
4710                           | extractFloat128Frac1( a ) )
4711                    ) {
4712                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4713                 }
4714                 break;
4715              case float_round_to_zero:
4716                 break;
4717              case float_round_down:
4718                 return
4719                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4720                     : packFloat128( 0, 0, 0, 0 );
4721              case float_round_up:
4722                 return
4723                       aSign ? packFloat128( 1, 0, 0, 0 )
4724                     : packFloat128( 0, 0x3FFF, 0, 0 );
4725             }
4726             return packFloat128( aSign, 0, 0, 0 );
4727         }
4728         lastBitMask = 1;
4729         lastBitMask <<= 0x402F - aExp;
4730         roundBitsMask = lastBitMask - 1;
4731         z.low = 0;
4732         z.high = a.high;
4733         roundingMode = float_rounding_mode;
4734         if ( roundingMode == float_round_nearest_even ) {
4735             z.high += lastBitMask>>1;
4736             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4737                 z.high &= ~ lastBitMask;
4738             }
4739         }
4740         else if ( roundingMode != float_round_to_zero ) {
4741             if (   extractFloat128Sign( z )
4742                  ^ ( roundingMode == float_round_up ) ) {
4743                 z.high |= ( a.low != 0 );
4744                 z.high += roundBitsMask;
4745             }
4746         }
4747         z.high &= ~ roundBitsMask;
4748     }
4749     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4750         float_exception_flags |= float_flag_inexact;
4751     }
4752     return z;
4753
4754 }
4755
4756 /*
4757 -------------------------------------------------------------------------------
4758 Returns the result of adding the absolute values of the quadruple-precision
4759 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4760 before being returned.  `zSign' is ignored if the result is a NaN.
4761 The addition is performed according to the IEC/IEEE Standard for Binary
4762 Floating-Point Arithmetic.
4763 -------------------------------------------------------------------------------
4764 */
4765 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4766 {
4767     int32 aExp, bExp, zExp;
4768     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4769     int32 expDiff;
4770
4771     aSig1 = extractFloat128Frac1( a );
4772     aSig0 = extractFloat128Frac0( a );
4773     aExp = extractFloat128Exp( a );
4774     bSig1 = extractFloat128Frac1( b );
4775     bSig0 = extractFloat128Frac0( b );
4776     bExp = extractFloat128Exp( b );
4777     expDiff = aExp - bExp;
4778     if ( 0 < expDiff ) {
4779         if ( aExp == 0x7FFF ) {
4780             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4781             return a;
4782         }
4783         if ( bExp == 0 ) {
4784             --expDiff;
4785         }
4786         else {
4787             bSig0 |= LIT64( 0x0001000000000000 );
4788         }
4789         shift128ExtraRightJamming(
4790             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4791         zExp = aExp;
4792     }
4793     else if ( expDiff < 0 ) {
4794         if ( bExp == 0x7FFF ) {
4795             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4796             return packFloat128( zSign, 0x7FFF, 0, 0 );
4797         }
4798         if ( aExp == 0 ) {
4799             ++expDiff;
4800         }
4801         else {
4802             aSig0 |= LIT64( 0x0001000000000000 );
4803         }
4804         shift128ExtraRightJamming(
4805             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4806         zExp = bExp;
4807     }
4808     else {
4809         if ( aExp == 0x7FFF ) {
4810             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4811                 return propagateFloat128NaN( a, b );
4812             }
4813             return a;
4814         }
4815         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4816         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4817         zSig2 = 0;
4818         zSig0 |= LIT64( 0x0002000000000000 );
4819         zExp = aExp;
4820         goto shiftRight1;
4821     }
4822     aSig0 |= LIT64( 0x0001000000000000 );
4823     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4824     --zExp;
4825     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4826     ++zExp;
4827  shiftRight1:
4828     shift128ExtraRightJamming(
4829         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4830  roundAndPack:
4831     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4832
4833 }
4834
4835 /*
4836 -------------------------------------------------------------------------------
4837 Returns the result of subtracting the absolute values of the quadruple-
4838 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4839 difference is negated before being returned.  `zSign' is ignored if the
4840 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4841 Standard for Binary Floating-Point Arithmetic.
4842 -------------------------------------------------------------------------------
4843 */
4844 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4845 {
4846     int32 aExp, bExp, zExp;
4847     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4848     int32 expDiff;
4849     float128 z;
4850
4851     aSig1 = extractFloat128Frac1( a );
4852     aSig0 = extractFloat128Frac0( a );
4853     aExp = extractFloat128Exp( a );
4854     bSig1 = extractFloat128Frac1( b );
4855     bSig0 = extractFloat128Frac0( b );
4856     bExp = extractFloat128Exp( b );
4857     expDiff = aExp - bExp;
4858     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4859     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4860     if ( 0 < expDiff ) goto aExpBigger;
4861     if ( expDiff < 0 ) goto bExpBigger;
4862     if ( aExp == 0x7FFF ) {
4863         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4864             return propagateFloat128NaN( a, b );
4865         }
4866         float_raise( float_flag_invalid );
4867         z.low = float128_default_nan_low;
4868         z.high = float128_default_nan_high;
4869         return z;
4870     }
4871     if ( aExp == 0 ) {
4872         aExp = 1;
4873         bExp = 1;
4874     }
4875     if ( bSig0 < aSig0 ) goto aBigger;
4876     if ( aSig0 < bSig0 ) goto bBigger;
4877     if ( bSig1 < aSig1 ) goto aBigger;
4878     if ( aSig1 < bSig1 ) goto bBigger;
4879     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4880  bExpBigger:
4881     if ( bExp == 0x7FFF ) {
4882         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4883         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4884     }
4885     if ( aExp == 0 ) {
4886         ++expDiff;
4887     }
4888     else {
4889         aSig0 |= LIT64( 0x4000000000000000 );
4890     }
4891     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4892     bSig0 |= LIT64( 0x4000000000000000 );
4893  bBigger:
4894     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4895     zExp = bExp;
4896     zSign ^= 1;
4897     goto normalizeRoundAndPack;
4898  aExpBigger:
4899     if ( aExp == 0x7FFF ) {
4900         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4901         return a;
4902     }
4903     if ( bExp == 0 ) {
4904         --expDiff;
4905     }
4906     else {
4907         bSig0 |= LIT64( 0x4000000000000000 );
4908     }
4909     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4910     aSig0 |= LIT64( 0x4000000000000000 );
4911  aBigger:
4912     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4913     zExp = aExp;
4914  normalizeRoundAndPack:
4915     --zExp;
4916     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4917
4918 }
4919
4920 /*
4921 -------------------------------------------------------------------------------
4922 Returns the result of adding the quadruple-precision floating-point values
4923 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4924 for Binary Floating-Point Arithmetic.
4925 -------------------------------------------------------------------------------
4926 */
4927 float128 float128_add( float128 a, float128 b )
4928 {
4929     flag aSign, bSign;
4930
4931     aSign = extractFloat128Sign( a );
4932     bSign = extractFloat128Sign( b );
4933     if ( aSign == bSign ) {
4934         return addFloat128Sigs( a, b, aSign );
4935     }
4936     else {
4937         return subFloat128Sigs( a, b, aSign );
4938     }
4939
4940 }
4941
4942 /*
4943 -------------------------------------------------------------------------------
4944 Returns the result of subtracting the quadruple-precision floating-point
4945 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4946 Standard for Binary Floating-Point Arithmetic.
4947 -------------------------------------------------------------------------------
4948 */
4949 float128 float128_sub( float128 a, float128 b )
4950 {
4951     flag aSign, bSign;
4952
4953     aSign = extractFloat128Sign( a );
4954     bSign = extractFloat128Sign( b );
4955     if ( aSign == bSign ) {
4956         return subFloat128Sigs( a, b, aSign );
4957     }
4958     else {
4959         return addFloat128Sigs( a, b, aSign );
4960     }
4961
4962 }
4963
4964 /*
4965 -------------------------------------------------------------------------------
4966 Returns the result of multiplying the quadruple-precision floating-point
4967 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4968 Standard for Binary Floating-Point Arithmetic.
4969 -------------------------------------------------------------------------------
4970 */
4971 float128 float128_mul( float128 a, float128 b )
4972 {
4973     flag aSign, bSign, zSign;
4974     int32 aExp, bExp, zExp;
4975     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4976     float128 z;
4977
4978     aSig1 = extractFloat128Frac1( a );
4979     aSig0 = extractFloat128Frac0( a );
4980     aExp = extractFloat128Exp( a );
4981     aSign = extractFloat128Sign( a );
4982     bSig1 = extractFloat128Frac1( b );
4983     bSig0 = extractFloat128Frac0( b );
4984     bExp = extractFloat128Exp( b );
4985     bSign = extractFloat128Sign( b );
4986     zSign = aSign ^ bSign;
4987     if ( aExp == 0x7FFF ) {
4988         if (    ( aSig0 | aSig1 )
4989              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4990             return propagateFloat128NaN( a, b );
4991         }
4992         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4993         return packFloat128( zSign, 0x7FFF, 0, 0 );
4994     }
4995     if ( bExp == 0x7FFF ) {
4996         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4997         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4998  invalid:
4999             float_raise( float_flag_invalid );
5000             z.low = float128_default_nan_low;
5001             z.high = float128_default_nan_high;
5002             return z;
5003         }
5004         return packFloat128( zSign, 0x7FFF, 0, 0 );
5005     }
5006     if ( aExp == 0 ) {
5007         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5008         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5009     }
5010     if ( bExp == 0 ) {
5011         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5012         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5013     }
5014     zExp = aExp + bExp - 0x4000;
5015     aSig0 |= LIT64( 0x0001000000000000 );
5016     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5017     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5018     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5019     zSig2 |= ( zSig3 != 0 );
5020     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5021         shift128ExtraRightJamming(
5022             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5023         ++zExp;
5024     }
5025     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5026
5027 }
5028
5029 /*
5030 -------------------------------------------------------------------------------
5031 Returns the result of dividing the quadruple-precision floating-point value
5032 `a' by the corresponding value `b'.  The operation is performed according to
5033 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5034 -------------------------------------------------------------------------------
5035 */
5036 float128 float128_div( float128 a, float128 b )
5037 {
5038     flag aSign, bSign, zSign;
5039     int32 aExp, bExp, zExp;
5040     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5041     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5042     float128 z;
5043
5044     aSig1 = extractFloat128Frac1( a );
5045     aSig0 = extractFloat128Frac0( a );
5046     aExp = extractFloat128Exp( a );
5047     aSign = extractFloat128Sign( a );
5048     bSig1 = extractFloat128Frac1( b );
5049     bSig0 = extractFloat128Frac0( b );
5050     bExp = extractFloat128Exp( b );
5051     bSign = extractFloat128Sign( b );
5052     zSign = aSign ^ bSign;
5053     if ( aExp == 0x7FFF ) {
5054         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5055         if ( bExp == 0x7FFF ) {
5056             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5057             goto invalid;
5058         }
5059         return packFloat128( zSign, 0x7FFF, 0, 0 );
5060     }
5061     if ( bExp == 0x7FFF ) {
5062         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5063         return packFloat128( zSign, 0, 0, 0 );
5064     }
5065     if ( bExp == 0 ) {
5066         if ( ( bSig0 | bSig1 ) == 0 ) {
5067             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5068  invalid:
5069                 float_raise( float_flag_invalid );
5070                 z.low = float128_default_nan_low;
5071                 z.high = float128_default_nan_high;
5072                 return z;
5073             }
5074             float_raise( float_flag_divbyzero );
5075             return packFloat128( zSign, 0x7FFF, 0, 0 );
5076         }
5077         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5078     }
5079     if ( aExp == 0 ) {
5080         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5081         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5082     }
5083     zExp = aExp - bExp + 0x3FFD;
5084     shortShift128Left(
5085         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5086     shortShift128Left(
5087         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5088     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5089         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5090         ++zExp;
5091     }
5092     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5093     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5094     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5095     while ( (sbits64) rem0 < 0 ) {
5096         --zSig0;
5097         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5098     }
5099     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5100     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5101         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5102         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5103         while ( (sbits64) rem1 < 0 ) {
5104             --zSig1;
5105             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5106         }
5107         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5108     }
5109     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5110     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5111
5112 }
5113
5114 /*
5115 -------------------------------------------------------------------------------
5116 Returns the remainder of the quadruple-precision floating-point value `a'
5117 with respect to the corresponding value `b'.  The operation is performed
5118 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5119 -------------------------------------------------------------------------------
5120 */
5121 float128 float128_rem( float128 a, float128 b )
5122 {
5123     flag aSign, bSign, zSign;
5124     int32 aExp, bExp, expDiff;
5125     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5126     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5127     sbits64 sigMean0;
5128     float128 z;
5129
5130     aSig1 = extractFloat128Frac1( a );
5131     aSig0 = extractFloat128Frac0( a );
5132     aExp = extractFloat128Exp( a );
5133     aSign = extractFloat128Sign( a );
5134     bSig1 = extractFloat128Frac1( b );
5135     bSig0 = extractFloat128Frac0( b );
5136     bExp = extractFloat128Exp( b );
5137     bSign = extractFloat128Sign( b );
5138     if ( aExp == 0x7FFF ) {
5139         if (    ( aSig0 | aSig1 )
5140              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5141             return propagateFloat128NaN( a, b );
5142         }
5143         goto invalid;
5144     }
5145     if ( bExp == 0x7FFF ) {
5146         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5147         return a;
5148     }
5149     if ( bExp == 0 ) {
5150         if ( ( bSig0 | bSig1 ) == 0 ) {
5151  invalid:
5152             float_raise( float_flag_invalid );
5153             z.low = float128_default_nan_low;
5154             z.high = float128_default_nan_high;
5155             return z;
5156         }
5157         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5158     }
5159     if ( aExp == 0 ) {
5160         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5161         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5162     }
5163     expDiff = aExp - bExp;
5164     if ( expDiff < -1 ) return a;
5165     shortShift128Left(
5166         aSig0 | LIT64( 0x0001000000000000 ),
5167         aSig1,
5168         15 - ( expDiff < 0 ),
5169         &aSig0,
5170         &aSig1
5171     );
5172     shortShift128Left(
5173         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5174     q = le128( bSig0, bSig1, aSig0, aSig1 );
5175     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5176     expDiff -= 64;
5177     while ( 0 < expDiff ) {
5178         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5179         q = ( 4 < q ) ? q - 4 : 0;
5180         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5181         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5182         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5183         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5184         expDiff -= 61;
5185     }
5186     if ( -64 < expDiff ) {
5187         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5188         q = ( 4 < q ) ? q - 4 : 0;
5189         q >>= - expDiff;
5190         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5191         expDiff += 52;
5192         if ( expDiff < 0 ) {
5193             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5194         }
5195         else {
5196             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5197         }
5198         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5199         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5200     }
5201     else {
5202         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5203         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5204     }
5205     do {
5206         alternateASig0 = aSig0;
5207         alternateASig1 = aSig1;
5208         ++q;
5209         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5210     } while ( 0 <= (sbits64) aSig0 );
5211     add128(
5212         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5213     if (    ( sigMean0 < 0 )
5214          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5215         aSig0 = alternateASig0;
5216         aSig1 = alternateASig1;
5217     }
5218     zSign = ( (sbits64) aSig0 < 0 );
5219     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5220     return
5221         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5222
5223 }
5224
5225 /*
5226 -------------------------------------------------------------------------------
5227 Returns the square root of the quadruple-precision floating-point value `a'.
5228 The operation is performed according to the IEC/IEEE Standard for Binary
5229 Floating-Point Arithmetic.
5230 -------------------------------------------------------------------------------
5231 */
5232 float128 float128_sqrt( float128 a )
5233 {
5234     flag aSign;
5235     int32 aExp, zExp;
5236     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5237     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5238     float128 z;
5239
5240     aSig1 = extractFloat128Frac1( a );
5241     aSig0 = extractFloat128Frac0( a );
5242     aExp = extractFloat128Exp( a );
5243     aSign = extractFloat128Sign( a );
5244     if ( aExp == 0x7FFF ) {
5245         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5246         if ( ! aSign ) return a;
5247         goto invalid;
5248     }
5249     if ( aSign ) {
5250         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5251  invalid:
5252         float_raise( float_flag_invalid );
5253         z.low = float128_default_nan_low;
5254         z.high = float128_default_nan_high;
5255         return z;
5256     }
5257     if ( aExp == 0 ) {
5258         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5259         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5260     }
5261     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5262     aSig0 |= LIT64( 0x0001000000000000 );
5263     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5264     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5265     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5266     doubleZSig0 = zSig0<<1;
5267     mul64To128( zSig0, zSig0, &term0, &term1 );
5268     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5269     while ( (sbits64) rem0 < 0 ) {
5270         --zSig0;
5271         doubleZSig0 -= 2;
5272         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5273     }
5274     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5275     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5276         if ( zSig1 == 0 ) zSig1 = 1;
5277         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5278         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5279         mul64To128( zSig1, zSig1, &term2, &term3 );
5280         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5281         while ( (sbits64) rem1 < 0 ) {
5282             --zSig1;
5283             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5284             term3 |= 1;
5285             term2 |= doubleZSig0;
5286             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5287         }
5288         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5289     }
5290     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5291     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5292
5293 }
5294
5295 /*
5296 -------------------------------------------------------------------------------
5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5298 the corresponding value `b', and 0 otherwise.  The comparison is performed
5299 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5300 -------------------------------------------------------------------------------
5301 */
5302 flag float128_eq( float128 a, float128 b )
5303 {
5304
5305     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5306               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5307          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5308               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5309        ) {
5310         if (    float128_is_signaling_nan( a )
5311              || float128_is_signaling_nan( b ) ) {
5312             float_raise( float_flag_invalid );
5313         }
5314         return 0;
5315     }
5316     return
5317            ( a.low == b.low )
5318         && (    ( a.high == b.high )
5319              || (    ( a.low == 0 )
5320                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5321            );
5322
5323 }
5324
5325 /*
5326 -------------------------------------------------------------------------------
5327 Returns 1 if the quadruple-precision floating-point value `a' is less than
5328 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5329 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5330 Arithmetic.
5331 -------------------------------------------------------------------------------
5332 */
5333 flag float128_le( float128 a, float128 b )
5334 {
5335     flag aSign, bSign;
5336
5337     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5338               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5339          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5340               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5341        ) {
5342         float_raise( float_flag_invalid );
5343         return 0;
5344     }
5345     aSign = extractFloat128Sign( a );
5346     bSign = extractFloat128Sign( b );
5347     if ( aSign != bSign ) {
5348         return
5349                aSign
5350             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5351                  == 0 );
5352     }
5353     return
5354           aSign ? le128( b.high, b.low, a.high, a.low )
5355         : le128( a.high, a.low, b.high, b.low );
5356
5357 }
5358
5359 /*
5360 -------------------------------------------------------------------------------
5361 Returns 1 if the quadruple-precision floating-point value `a' is less than
5362 the corresponding value `b', and 0 otherwise.  The comparison is performed
5363 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5364 -------------------------------------------------------------------------------
5365 */
5366 flag float128_lt( float128 a, float128 b )
5367 {
5368     flag aSign, bSign;
5369
5370     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5371               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5373               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5374        ) {
5375         float_raise( float_flag_invalid );
5376         return 0;
5377     }
5378     aSign = extractFloat128Sign( a );
5379     bSign = extractFloat128Sign( b );
5380     if ( aSign != bSign ) {
5381         return
5382                aSign
5383             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5384                  != 0 );
5385     }
5386     return
5387           aSign ? lt128( b.high, b.low, a.high, a.low )
5388         : lt128( a.high, a.low, b.high, b.low );
5389
5390 }
5391
5392 /*
5393 -------------------------------------------------------------------------------
5394 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5395 the corresponding value `b', and 0 otherwise.  The invalid exception is
5396 raised if either operand is a NaN.  Otherwise, the comparison is performed
5397 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5398 -------------------------------------------------------------------------------
5399 */
5400 flag float128_eq_signaling( float128 a, float128 b )
5401 {
5402
5403     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5404               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5405          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5406               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5407        ) {
5408         float_raise( float_flag_invalid );
5409         return 0;
5410     }
5411     return
5412            ( a.low == b.low )
5413         && (    ( a.high == b.high )
5414              || (    ( a.low == 0 )
5415                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5416            );
5417
5418 }
5419
5420 /*
5421 -------------------------------------------------------------------------------
5422 Returns 1 if the quadruple-precision floating-point value `a' is less than
5423 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5424 cause an exception.  Otherwise, the comparison is performed according to the
5425 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5426 -------------------------------------------------------------------------------
5427 */
5428 flag float128_le_quiet( float128 a, float128 b )
5429 {
5430     flag aSign, bSign;
5431
5432     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5433               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5434          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5435               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5436        ) {
5437         if (    float128_is_signaling_nan( a )
5438              || float128_is_signaling_nan( b ) ) {
5439             float_raise( float_flag_invalid );
5440         }
5441         return 0;
5442     }
5443     aSign = extractFloat128Sign( a );
5444     bSign = extractFloat128Sign( b );
5445     if ( aSign != bSign ) {
5446         return
5447                aSign
5448             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5449                  == 0 );
5450     }
5451     return
5452           aSign ? le128( b.high, b.low, a.high, a.low )
5453         : le128( a.high, a.low, b.high, b.low );
5454
5455 }
5456
5457 /*
5458 -------------------------------------------------------------------------------
5459 Returns 1 if the quadruple-precision floating-point value `a' is less than
5460 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5461 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5462 Standard for Binary Floating-Point Arithmetic.
5463 -------------------------------------------------------------------------------
5464 */
5465 flag float128_lt_quiet( float128 a, float128 b )
5466 {
5467     flag aSign, bSign;
5468
5469     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5470               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5471          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5472               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5473        ) {
5474         if (    float128_is_signaling_nan( a )
5475              || float128_is_signaling_nan( b ) ) {
5476             float_raise( float_flag_invalid );
5477         }
5478         return 0;
5479     }
5480     aSign = extractFloat128Sign( a );
5481     bSign = extractFloat128Sign( b );
5482     if ( aSign != bSign ) {
5483         return
5484                aSign
5485             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5486                  != 0 );
5487     }
5488     return
5489           aSign ? lt128( b.high, b.low, a.high, a.low )
5490         : lt128( a.high, a.low, b.high, b.low );
5491
5492 }
5493
5494 #endif
5495
5496
5497 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5498
5499 /*
5500  * These two routines are not part of the original softfloat distribution.
5501  *
5502  * They are based on the corresponding conversions to integer but return
5503  * unsigned numbers instead since these functions are required by GCC.
5504  *
5505  * Added by Mark Brinicombe <mark@NetBSD.org>   27/09/97
5506  *
5507  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5508  */
5509
5510 /*
5511 -------------------------------------------------------------------------------
5512 Returns the result of converting the double-precision floating-point value
5513 `a' to the 32-bit unsigned integer format.  The conversion is
5514 performed according to the IEC/IEEE Standard for Binary Floating-point
5515 Arithmetic, except that the conversion is always rounded toward zero.  If
5516 `a' is a NaN, the largest positive integer is returned.  If the conversion
5517 overflows, the largest integer positive is returned.
5518 -------------------------------------------------------------------------------
5519 */
5520 uint32 float64_to_uint32_round_to_zero( float64 a )
5521 {
5522     flag aSign;
5523     int16 aExp, shiftCount;
5524     bits64 aSig, savedASig;
5525     uint32 z;
5526
5527     aSig = extractFloat64Frac( a );
5528     aExp = extractFloat64Exp( a );
5529     aSign = extractFloat64Sign( a );
5530
5531     if (aSign) {
5532         float_raise( float_flag_invalid );
5533         return(0);
5534     }
5535
5536     if ( 0x41E < aExp ) {
5537         float_raise( float_flag_invalid );
5538         return 0xffffffff;
5539     }
5540     else if ( aExp < 0x3FF ) {
5541         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5542         return 0;
5543     }
5544     aSig |= LIT64( 0x0010000000000000 );
5545     shiftCount = 0x433 - aExp;
5546     savedASig = aSig;
5547     aSig >>= shiftCount;
5548     z = aSig;
5549     if ( ( aSig<<shiftCount ) != savedASig ) {
5550         float_exception_flags |= float_flag_inexact;
5551     }
5552     return z;
5553
5554 }
5555
5556 /*
5557 -------------------------------------------------------------------------------
5558 Returns the result of converting the single-precision floating-point value
5559 `a' to the 32-bit unsigned integer format.  The conversion is
5560 performed according to the IEC/IEEE Standard for Binary Floating-point
5561 Arithmetic, except that the conversion is always rounded toward zero.  If
5562 `a' is a NaN, the largest positive integer is returned.  If the conversion
5563 overflows, the largest positive integer is returned.
5564 -------------------------------------------------------------------------------
5565 */
5566 uint32 float32_to_uint32_round_to_zero( float32 a )
5567 {
5568     flag aSign;
5569     int16 aExp, shiftCount;
5570     bits32 aSig;
5571     uint32 z;
5572
5573     aSig = extractFloat32Frac( a );
5574     aExp = extractFloat32Exp( a );
5575     aSign = extractFloat32Sign( a );
5576     shiftCount = aExp - 0x9E;
5577
5578     if (aSign) {
5579         float_raise( float_flag_invalid );
5580         return(0);
5581     }
5582     if ( 0 < shiftCount ) {
5583         float_raise( float_flag_invalid );
5584         return 0xFFFFFFFF;
5585     }
5586     else if ( aExp <= 0x7E ) {
5587         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5588         return 0;
5589     }
5590     aSig = ( aSig | 0x800000 )<<8;
5591     z = aSig>>( - shiftCount );
5592     if ( aSig<<( shiftCount & 31 ) ) {
5593         float_exception_flags |= float_flag_inexact;
5594     }
5595     return z;
5596
5597 }
5598
5599 #endif