contrib/compiler-rt/lib/builtins/hexagon/dfmul.S

   1 //===----------------------Hexagon builtin routine ------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is dual licensed under the MIT and the University of Illinois Open
   6 // Source Licenses. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9
  10 /* Double Precision Multiply */
  11 #define A r1:0
  12 #define AH r1
  13 #define AL r0
  14 #define B r3:2
  15 #define BH r3
  16 #define BL r2
  17
  18 #define BTMP r5:4
  19 #define BTMPH r5
  20 #define BTMPL r4
  21
  22 #define PP_ODD r7:6
  23 #define PP_ODD_H r7
  24 #define PP_ODD_L r6
  25
  26 #define ONE r9:8
  27 #define S_ONE r8
  28 #define S_ZERO r9
  29
  30 #define PP_HH r11:10
  31 #define PP_HH_H r11
  32 #define PP_HH_L r10
  33
  34 #define ATMP r13:12
  35 #define ATMPH r13
  36 #define ATMPL r12
  37
  38 #define PP_LL r15:14
  39 #define PP_LL_H r15
  40 #define PP_LL_L r14
  41
  42 #define TMP r28
  43
  44 #define MANTBITS 52
  45 #define HI_MANTBITS 20
  46 #define EXPBITS 11
  47 #define BIAS 1024
  48 #define MANTISSA_TO_INT_BIAS 52
  49
  50 /* Some constant to adjust normalization amount in error code */
  51 /* Amount to right shift the partial product to get to a denorm */
  52 #define FUDGE 5
  53
  54 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
  55 #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
  56 #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
  57 #define END(TAG) .size TAG,.-TAG
  58
  59 #define SR_ROUND_OFF 22
  60         .text
  61         .global __hexagon_muldf3
  62         .type __hexagon_muldf3,@function
  63         Q6_ALIAS(muldf3)
  64   FAST_ALIAS(muldf3)
  65   FAST2_ALIAS(muldf3)
  66         .p2align 5
  67 __hexagon_muldf3:
  68         {
  69                 p0 = dfclass(A,#2)
  70                 p0 = dfclass(B,#2)
  71                 ATMP = combine(##0x40000000,#0)
  72         }
  73         {
  74                 ATMP = insert(A,#MANTBITS,#EXPBITS-1)
  75                 BTMP = asl(B,#EXPBITS-1)
  76                 TMP = #-BIAS
  77                 ONE = #1
  78         }
  79         {
  80                 PP_ODD = mpyu(BTMPL,ATMPH)
  81                 BTMP = insert(ONE,#2,#62)
  82         }
  83         /* since we know that the MSB of the H registers is zero, we should never carry */
  84         /* H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1 */
  85         /* Adding 2 HLs, we get 2^64-3*2^32+2 maximum.  */
  86         /* Therefore, we can add 3 2^32-1 values safely without carry.  We only need one. */
  87         {
  88                 PP_LL = mpyu(ATMPL,BTMPL)
  89                 PP_ODD += mpyu(ATMPL,BTMPH)
  90         }
  91         {
  92                 PP_ODD += lsr(PP_LL,#32)
  93                 PP_HH = mpyu(ATMPH,BTMPH)
  94                 BTMP = combine(##BIAS+BIAS-4,#0)
  95         }
  96         {
  97                 PP_HH += lsr(PP_ODD,#32)
  98                 if (!p0) jump .Lmul_abnormal
  99                 p1 = cmp.eq(PP_LL_L,#0)         // 64 lsb's 0?
 100                 p1 = cmp.eq(PP_ODD_L,#0)        // 64 lsb's 0?
 101         }
 102         /*
 103          * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
 104          * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
 105          */
 106 #undef PP_ODD
 107 #undef PP_ODD_H
 108 #undef PP_ODD_L
 109 #define EXP10 r7:6
 110 #define EXP1 r7
 111 #define EXP0 r6
 112         {
 113                 if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
 114                 EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
 115                 EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
 116         }
 117         {
 118                 PP_LL = neg(PP_HH)
 119                 EXP0 += add(TMP,EXP1)
 120                 TMP = xor(AH,BH)
 121         }
 122         {
 123                 if (!p2.new) PP_HH = PP_LL
 124                 p2 = cmp.gt(TMP,#-1)
 125                 p0 = !cmp.gt(EXP0,BTMPH)
 126                 p0 = cmp.gt(EXP0,BTMPL)
 127                 if (!p0.new) jump:nt .Lmul_ovf_unf
 128         }
 129         {
 130                 A = convert_d2df(PP_HH)
 131                 EXP0 = add(EXP0,#-BIAS-58)
 132         }
 133         {
 134                 AH += asl(EXP0,#HI_MANTBITS)
 135                 jumpr r31
 136         }
 137
 138         .falign
 139 .Lpossible_unf:
 140         /* We end up with a positive exponent */
 141         /* But we may have rounded up to an exponent of 1. */
 142         /* If the exponent is 1, if we rounded up to it
 143          * we need to also raise underflow
 144          * Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
 145          * And the PP should also have more than one bit set
 146          */
 147         /* Note: ATMP should have abs(PP_HH) */
 148         /* Note: BTMPL should have 0x7FEFFFFF */
 149         {
 150                 p0 = cmp.eq(AL,#0)
 151                 p0 = bitsclr(AH,BTMPL)
 152                 if (!p0.new) jumpr:t r31
 153                 BTMPH = #0x7fff
 154         }
 155         {
 156                 p0 = bitsset(ATMPH,BTMPH)
 157                 BTMPL = USR
 158                 BTMPH = #0x030
 159         }
 160         {
 161                 if (p0) BTMPL = or(BTMPL,BTMPH)
 162         }
 163         {
 164                 USR = BTMPL
 165         }
 166         {
 167                 p0 = dfcmp.eq(A,A)
 168                 jumpr r31
 169         }
 170         .falign
 171 .Lmul_ovf_unf:
 172         {
 173                 A = convert_d2df(PP_HH)
 174                 ATMP = abs(PP_HH)                       // take absolute value
 175                 EXP1 = add(EXP0,#-BIAS-58)
 176         }
 177         {
 178                 AH += asl(EXP1,#HI_MANTBITS)
 179                 EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
 180                 BTMPL = ##0x7FEFFFFF
 181         }
 182         {
 183                 EXP1 += add(EXP0,##-BIAS-58)
 184                 //BTMPH = add(clb(ATMP),#-2)
 185                 BTMPH = #0
 186         }
 187         {
 188                 p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
 189                 if (p0.new) jump:nt .Lmul_ovf
 190         }
 191         {
 192                 p0 = cmp.gt(EXP1,#0)
 193                 if (p0.new) jump:nt .Lpossible_unf
 194                 BTMPH = sub(EXP0,BTMPH)
 195                 TMP = #63                               // max amount to shift
 196         }
 197         /* Underflow */
 198         /*
 199          * PP_HH has the partial product with sticky LSB.
 200          * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
 201          * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
 202          * The exponent of PP_HH is in  EXP1, which is non-positive (0 or negative)
 203          * That's the exponent that happens after the normalization
 204          *
 205          * EXP0 has the exponent that, when added to the normalized value, is out of range.
 206          *
 207          * Strategy:
 208          *
 209          * * Shift down bits, with sticky bit, such that the bits are aligned according
 210          *   to the LZ count and appropriate exponent, but not all the way to mantissa
 211          *   field, keep around the last few bits.
 212          * * Put a 1 near the MSB
 213          * * Check the LSBs for inexact; if inexact also set underflow
 214          * * Convert [u]d2df -- will correctly round according to rounding mode
 215          * * Replace exponent field with zero
 216          *
 217          *
 218          */
 219
 220
 221         {
 222                 BTMPL = #0                              // offset for extract
 223                 BTMPH = sub(#FUDGE,BTMPH)               // amount to right shift
 224         }
 225         {
 226                 p3 = cmp.gt(PP_HH_H,#-1)                // is it positive?
 227                 BTMPH = min(BTMPH,TMP)                  // Don't shift more than 63
 228                 PP_HH = ATMP
 229         }
 230         {
 231                 TMP = USR
 232                 PP_LL = extractu(PP_HH,BTMP)
 233         }
 234         {
 235                 PP_HH = asr(PP_HH,BTMPH)
 236                 BTMPL = #0x0030                                 // underflow flag
 237                 AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
 238         }
 239         {
 240                 p0 = cmp.gtu(ONE,PP_LL)                         // Did we extract all zeros?
 241                 if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)        // add sticky bit
 242                 PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)        // Add back in a bit so we can use convert instruction
 243         }
 244         {
 245                 PP_LL = neg(PP_HH)
 246                 p1 = bitsclr(PP_HH_L,#0x7)              // Are the LSB's clear?
 247                 if (!p1.new) TMP = or(BTMPL,TMP)        // If not, Inexact+Underflow
 248         }
 249         {
 250                 if (!p3) PP_HH = PP_LL
 251                 USR = TMP
 252         }
 253         {
 254                 A = convert_d2df(PP_HH)                 // Do rounding
 255                 p0 = dfcmp.eq(A,A)                      // realize exception
 256         }
 257         {
 258                 AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)           // Insert correct exponent
 259                 jumpr r31
 260         }
 261         .falign
 262 .Lmul_ovf:
 263         // We get either max finite value or infinity.  Either way, overflow+inexact
 264         {
 265                 TMP = USR
 266                 ATMP = combine(##0x7fefffff,#-1)        // positive max finite
 267                 A = PP_HH
 268         }
 269         {
 270                 PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)        // rounding bits
 271                 TMP = or(TMP,#0x28)                     // inexact + overflow
 272                 BTMP = combine(##0x7ff00000,#0)         // positive infinity
 273         }
 274         {
 275                 USR = TMP
 276                 PP_LL_L ^= lsr(AH,#31)                  // Does sign match rounding?
 277                 TMP = PP_LL_L                           // unmodified rounding mode
 278         }
 279         {
 280                 p0 = !cmp.eq(TMP,#1)                    // If not round-to-zero and
 281                 p0 = !cmp.eq(PP_LL_L,#2)                // Not rounding the other way,
 282                 if (p0.new) ATMP = BTMP                 // we should get infinity
 283                 p0 = dfcmp.eq(A,A)                      // Realize FP exception if enabled
 284         }
 285         {
 286                 A = insert(ATMP,#63,#0)                 // insert inf/maxfinite, leave sign
 287                 jumpr r31
 288         }
 289
 290 .Lmul_abnormal:
 291         {
 292                 ATMP = extractu(A,#63,#0)               // strip off sign
 293                 BTMP = extractu(B,#63,#0)               // strip off sign
 294         }
 295         {
 296                 p3 = cmp.gtu(ATMP,BTMP)
 297                 if (!p3.new) A = B                      // sort values
 298                 if (!p3.new) B = A                      // sort values
 299         }
 300         {
 301                 // Any NaN --> NaN, possibly raise invalid if sNaN
 302                 p0 = dfclass(A,#0x0f)           // A not NaN?
 303                 if (!p0.new) jump:nt .Linvalid_nan
 304                 if (!p3) ATMP = BTMP
 305                 if (!p3) BTMP = ATMP
 306         }
 307         {
 308                 // Infinity * nonzero number is infinity
 309                 p1 = dfclass(A,#0x08)           // A is infinity
 310                 p1 = dfclass(B,#0x0e)           // B is nonzero
 311         }
 312         {
 313                 // Infinity * zero --> NaN, raise invalid
 314                 // Other zeros return zero
 315                 p0 = dfclass(A,#0x08)           // A is infinity
 316                 p0 = dfclass(B,#0x01)           // B is zero
 317         }
 318         {
 319                 if (p1) jump .Ltrue_inf
 320                 p2 = dfclass(B,#0x01)
 321         }
 322         {
 323                 if (p0) jump .Linvalid_zeroinf
 324                 if (p2) jump .Ltrue_zero                // so return zero
 325                 TMP = ##0x7c000000
 326         }
 327         // We are left with a normal or subnormal times a subnormal. A > B
 328         // If A and B are both very small (exp(a) < BIAS-MANTBITS),
 329         // we go to a single sticky bit, which we can round easily.
 330         // If A and B might multiply to something bigger, decrease A exponent and increase
 331         // B exponent and try again
 332         {
 333                 p0 = bitsclr(AH,TMP)
 334                 if (p0.new) jump:nt .Lmul_tiny
 335         }
 336         {
 337                 TMP = cl0(BTMP)
 338         }
 339         {
 340                 TMP = add(TMP,#-EXPBITS)
 341         }
 342         {
 343                 BTMP = asl(BTMP,TMP)
 344         }
 345         {
 346                 B = insert(BTMP,#63,#0)
 347                 AH -= asl(TMP,#HI_MANTBITS)
 348         }
 349         jump __hexagon_muldf3
 350 .Lmul_tiny:
 351         {
 352                 TMP = USR
 353                 A = xor(A,B)                            // get sign bit
 354         }
 355         {
 356                 TMP = or(TMP,#0x30)                     // Inexact + Underflow
 357                 A = insert(ONE,#63,#0)                  // put in rounded up value
 358                 BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)  // get rounding mode
 359         }
 360         {
 361                 USR = TMP
 362                 p0 = cmp.gt(BTMPH,#1)                   // Round towards pos/neg inf?
 363                 if (!p0.new) AL = #0                    // If not, zero
 364                 BTMPH ^= lsr(AH,#31)                    // rounding my way --> set LSB
 365         }
 366         {
 367                 p0 = cmp.eq(BTMPH,#3)                   // if rounding towards right inf
 368                 if (!p0.new) AL = #0                    // don't go to zero
 369                 jumpr r31
 370         }
 371 .Linvalid_zeroinf:
 372         {
 373                 TMP = USR
 374         }
 375         {
 376                 A = #-1
 377                 TMP = or(TMP,#2)
 378         }
 379         {
 380                 USR = TMP
 381         }
 382         {
 383                 p0 = dfcmp.uo(A,A)                      // force exception if enabled
 384                 jumpr r31
 385         }
 386 .Linvalid_nan:
 387         {
 388                 p0 = dfclass(B,#0x0f)                   // if B is not NaN
 389                 TMP = convert_df2sf(A)                  // will generate invalid if sNaN
 390                 if (p0.new) B = A                       // make it whatever A is
 391         }
 392         {
 393                 BL = convert_df2sf(B)                   // will generate invalid if sNaN
 394                 A = #-1
 395                 jumpr r31
 396         }
 397         .falign
 398 .Ltrue_zero:
 399         {
 400                 A = B
 401                 B = A
 402         }
 403 .Ltrue_inf:
 404         {
 405                 BH = extract(BH,#1,#31)
 406         }
 407         {
 408                 AH ^= asl(BH,#31)
 409                 jumpr r31
 410         }
 411 END(__hexagon_muldf3)
 412
 413 #undef ATMP
 414 #undef ATMPL
 415 #undef ATMPH
 416 #undef BTMP
 417 #undef BTMPL
 418 #undef BTMPH