1 //===----------------------Hexagon builtin routine ------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 /* Double Precision Multiply */
45 #define HI_MANTBITS 20
48 #define MANTISSA_TO_INT_BIAS 52
50 /* Some constant to adjust normalization amount in error code */
51 /* Amount to right shift the partial product to get to a denorm */
54 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
55 #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
56 #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
57 #define END(TAG) .size TAG,.-TAG
59 #define SR_ROUND_OFF 22
61 .global __hexagon_muldf3
62 .type __hexagon_muldf3,@function
71 ATMP = combine(##0x40000000,#0)
74 ATMP = insert(A,#MANTBITS,#EXPBITS-1)
75 BTMP = asl(B,#EXPBITS-1)
80 PP_ODD = mpyu(BTMPL,ATMPH)
81 BTMP = insert(ONE,#2,#62)
83 /* since we know that the MSB of the H registers is zero, we should never carry */
84 /* H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1 */
85 /* Adding 2 HLs, we get 2^64-3*2^32+2 maximum. */
86 /* Therefore, we can add 3 2^32-1 values safely without carry. We only need one. */
88 PP_LL = mpyu(ATMPL,BTMPL)
89 PP_ODD += mpyu(ATMPL,BTMPH)
92 PP_ODD += lsr(PP_LL,#32)
93 PP_HH = mpyu(ATMPH,BTMPH)
94 BTMP = combine(##BIAS+BIAS-4,#0)
97 PP_HH += lsr(PP_ODD,#32)
98 if (!p0) jump .Lmul_abnormal
99 p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
100 p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
103 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
104 * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
113 if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
114 EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
115 EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
119 EXP0 += add(TMP,EXP1)
123 if (!p2.new) PP_HH = PP_LL
125 p0 = !cmp.gt(EXP0,BTMPH)
126 p0 = cmp.gt(EXP0,BTMPL)
127 if (!p0.new) jump:nt .Lmul_ovf_unf
130 A = convert_d2df(PP_HH)
131 EXP0 = add(EXP0,#-BIAS-58)
134 AH += asl(EXP0,#HI_MANTBITS)
140 /* We end up with a positive exponent */
141 /* But we may have rounded up to an exponent of 1. */
142 /* If the exponent is 1, if we rounded up to it
143 * we need to also raise underflow
144 * Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
145 * And the PP should also have more than one bit set
147 /* Note: ATMP should have abs(PP_HH) */
148 /* Note: BTMPL should have 0x7FEFFFFF */
151 p0 = bitsclr(AH,BTMPL)
152 if (!p0.new) jumpr:t r31
156 p0 = bitsset(ATMPH,BTMPH)
161 if (p0) BTMPL = or(BTMPL,BTMPH)
173 A = convert_d2df(PP_HH)
174 ATMP = abs(PP_HH) // take absolute value
175 EXP1 = add(EXP0,#-BIAS-58)
178 AH += asl(EXP1,#HI_MANTBITS)
179 EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
183 EXP1 += add(EXP0,##-BIAS-58)
184 //BTMPH = add(clb(ATMP),#-2)
188 p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
189 if (p0.new) jump:nt .Lmul_ovf
193 if (p0.new) jump:nt .Lpossible_unf
194 BTMPH = sub(EXP0,BTMPH)
195 TMP = #63 // max amount to shift
199 * PP_HH has the partial product with sticky LSB.
200 * PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
201 * PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
202 * The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
203 * That's the exponent that happens after the normalization
205 * EXP0 has the exponent that, when added to the normalized value, is out of range.
209 * * Shift down bits, with sticky bit, such that the bits are aligned according
210 * to the LZ count and appropriate exponent, but not all the way to mantissa
211 * field, keep around the last few bits.
212 * * Put a 1 near the MSB
213 * * Check the LSBs for inexact; if inexact also set underflow
214 * * Convert [u]d2df -- will correctly round according to rounding mode
215 * * Replace exponent field with zero
222 BTMPL = #0 // offset for extract
223 BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
226 p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
227 BTMPH = min(BTMPH,TMP) // Don't shift more than 63
232 PP_LL = extractu(PP_HH,BTMP)
235 PP_HH = asr(PP_HH,BTMPH)
236 BTMPL = #0x0030 // underflow flag
237 AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
240 p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
241 if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
242 PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
246 p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
247 if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
250 if (!p3) PP_HH = PP_LL
254 A = convert_d2df(PP_HH) // Do rounding
255 p0 = dfcmp.eq(A,A) // realize exception
258 AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
263 // We get either max finite value or infinity. Either way, overflow+inexact
266 ATMP = combine(##0x7fefffff,#-1) // positive max finite
270 PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
271 TMP = or(TMP,#0x28) // inexact + overflow
272 BTMP = combine(##0x7ff00000,#0) // positive infinity
276 PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
277 TMP = PP_LL_L // unmodified rounding mode
280 p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
281 p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
282 if (p0.new) ATMP = BTMP // we should get infinity
283 p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
286 A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
292 ATMP = extractu(A,#63,#0) // strip off sign
293 BTMP = extractu(B,#63,#0) // strip off sign
296 p3 = cmp.gtu(ATMP,BTMP)
297 if (!p3.new) A = B // sort values
298 if (!p3.new) B = A // sort values
301 // Any NaN --> NaN, possibly raise invalid if sNaN
302 p0 = dfclass(A,#0x0f) // A not NaN?
303 if (!p0.new) jump:nt .Linvalid_nan
308 // Infinity * nonzero number is infinity
309 p1 = dfclass(A,#0x08) // A is infinity
310 p1 = dfclass(B,#0x0e) // B is nonzero
313 // Infinity * zero --> NaN, raise invalid
314 // Other zeros return zero
315 p0 = dfclass(A,#0x08) // A is infinity
316 p0 = dfclass(B,#0x01) // B is zero
319 if (p1) jump .Ltrue_inf
320 p2 = dfclass(B,#0x01)
323 if (p0) jump .Linvalid_zeroinf
324 if (p2) jump .Ltrue_zero // so return zero
327 // We are left with a normal or subnormal times a subnormal. A > B
328 // If A and B are both very small (exp(a) < BIAS-MANTBITS),
329 // we go to a single sticky bit, which we can round easily.
330 // If A and B might multiply to something bigger, decrease A exponent and increase
331 // B exponent and try again
334 if (p0.new) jump:nt .Lmul_tiny
340 TMP = add(TMP,#-EXPBITS)
346 B = insert(BTMP,#63,#0)
347 AH -= asl(TMP,#HI_MANTBITS)
349 jump __hexagon_muldf3
353 A = xor(A,B) // get sign bit
356 TMP = or(TMP,#0x30) // Inexact + Underflow
357 A = insert(ONE,#63,#0) // put in rounded up value
358 BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
362 p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
363 if (!p0.new) AL = #0 // If not, zero
364 BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
367 p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf
368 if (!p0.new) AL = #0 // don't go to zero
383 p0 = dfcmp.uo(A,A) // force exception if enabled
388 p0 = dfclass(B,#0x0f) // if B is not NaN
389 TMP = convert_df2sf(A) // will generate invalid if sNaN
390 if (p0.new) B = A // make it whatever A is
393 BL = convert_df2sf(B) // will generate invalid if sNaN
405 BH = extract(BH,#1,#31)
411 END(__hexagon_muldf3)