1 //===----------------------Hexagon builtin routine ------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
// Q6_ALIAS(TAG): publish __qdsp_<TAG> as a global alias for __hexagon_<TAG>.
10 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
// END(TAG): set the ELF symbol size of TAG to the bytes emitted since its label.
11 #define END(TAG) .size TAG,.-TAG
// NOTE(review): despite this header comment, the routine below implements
// fused multiply-add (__hexagon_fmadf4/5), not plain multiply.
13 /* Double Precision Multiply */
// Number of mantissa bits held in the high 32-bit word of an IEEE double (52 - 32).
65 #define HI_MANTBITS 20
// Bit offset of the 2-bit rounding-mode field extracted below — presumably
// from the user status register (USR); confirm against the full file.
76 #define SR_ROUND_OFF 22
80 * First, classify for normal values, and abort if abnormal
82 * Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
84 * Since we know that the 2 MSBs of the H registers are zero, we should never carry
85 * the partial products that involve the H registers
87 * Try to buy X slots, at the expense of latency if needed
89 * We will have PP_HH with the upper bits of the product, PP_LL with the lower
90 * PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
91 * PP_HH can have a minimum of 0x0100_0000_0000_0000
93 * 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
95 * We need to align CTMP.
96 * If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
97 * If CTMP << PP align CTMP and add 128 bits. Then compute sticky
98 * If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
100 * Convert partial product and CTMP to 2's complement prior to addition
102 * After we add, we need to normalize into upper 64 bits, then compute sticky.
// NOTE(review): this chunk is a sampled fragment of the original file —
// instruction-packet braces `{ ... }` and many intervening lines are
// missing, so adjacent lines below do NOT necessarily form complete
// packets. Comments describe only what the visible instructions show.
//
// __hexagon_fmadf4 / __hexagon_fmadf5: double-precision fused
// multiply-add, computing A*B + C per the outline in the comment block
// above (128-bit partial product PP_HH:PP_LL, aligned addend CTMP,
// normalize, then convert back to double).
108 .global __hexagon_fmadf4
109 .type __hexagon_fmadf4,@function
110 .global __hexagon_fmadf5
111 .type __hexagon_fmadf5,@function
// Classify A and B: dfclass #2 tests "normal"; any abnormal operand
// diverts to .Lfma_abnormal_ab below.
120 P_TMP = dfclass(A,#2)
121 P_TMP = dfclass(B,#2)
// Unpack the mantissas into ATMP/BTMP; PP_ODD_H supplies the implicit
// leading 1 bit OR'd into the high words below.
126 ATMP = insert(A,#MANTBITS,#EXPBITS-3)
127 BTMP = insert(B,#MANTBITS,#EXPBITS-3)
128 PP_ODD_H = ##0x10000000
129 allocframe(#STACKSPACE)
// Begin the 64x64 multiply: low x low partial product first.
132 PP_LL = mpyu(ATMPL,BTMPL)
133 if (!P_TMP) jump .Lfma_abnormal_ab
134 ATMPH = or(ATMPH,PP_ODD_H)
135 BTMPH = or(BTMPH,PP_ODD_H)
// C must also be normal; otherwise fix it up at .Lfma_abnormal_c and
// re-enter at .Lfma_abnormal_c_restart.
138 P_TMP = dfclass(C,#2)
139 if (!P_TMP.new) jump:nt .Lfma_abnormal_c
140 CTMP = combine(PP_ODD_H,#0)
141 PP_ODD = combine(#0,PP_LL_H)
143 .Lfma_abnormal_c_restart:
// Accumulate the two cross partial products (lo x hi) into PP_ODD.
145 PP_ODD += mpyu(BTMPL,ATMPH)
146 CTMP = insert(C,#MANTBITS,#EXPBITS-3)
151 PP_ODD += mpyu(ATMPL,BTMPH)
// P_TMP: true iff C is non-negative (sign bit of CH clear).
153 P_TMP = cmp.gt(CH,#-1)
// Extract the biased exponent fields of A and B.
157 EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
158 EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
159 PP_HH = combine(#0,PP_ODD_H)
160 if (!P_TMP) CTMP = EXPBA
// High x high partial product completes the 128-bit product PP_HH:PP_LL.
163 PP_HH += mpyu(ATMPH,BTMPH)
164 PP_LL = combine(PP_ODD_L,PP_LL_L)
174 #define RIGHTLEFTSHIFT r13:12
175 #define RIGHTSHIFT r13
176 #define LEFTSHIFT r12
// Product exponent is the sum of the operand exponents (double bias
// removed further down).
178 EXPA = add(EXPA,EXPB)
183 EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
185 /* PP_HH:PP_LL now has product */
186 /* CTMP is negated */
187 /* EXPA,B,C are extracted */
189 * We need to negate PP
190 * Since we will be adding with carry later, if we need to negate,
191 * just invert all bits now, which we can do conditionally and in parallel
193 #define PP_HH_TMP r15:14
194 #define PP_LL_TMP r7:6
// Remove the doubled bias and apply the fixed-point ADJUST constant.
196 EXPA = add(EXPA,#-BIAS+(ADJUST))
197 PROD_NEG = !cmp.gt(TMP,#-1)
// Conditionally form the two's complement of the 128-bit product via
// subtract-with-carry; the result is committed only when P_TMP is set.
202 PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
203 P_TMP = !cmp.gt(TMP,#-1)
204 SWAP = cmp.gt(EXPC,EXPA) // If C >> PP
205 if (SWAP.new) EXPCA = combine(EXPA,EXPC)
208 PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
209 if (P_TMP) PP_LL = PP_LL_TMP
// EXPC becomes the alignment distance between the product and C.
215 EXPC = sub(EXPA,EXPC)
218 if (P_TMP) PP_HH = PP_HH_TMP
// P_TMP: alignment shift exceeds 63, so a 64-bit pre-shift is needed.
219 P_TMP = cmp.gt(EXPC,#63)
// When C dominates (SWAP), exchange C and the product so CTMP always
// holds the operand that gets shifted right.
// NOTE(review): these conditional moves presumably execute in one
// packet (simultaneous swap) in the original — braces not visible here.
220 if (SWAP) PP_LL = CTMP2
221 if (SWAP) CTMP2 = PP_LL
231 if (SWAP) PP_HH = CTMP // Swap C and PP
232 if (SWAP) CTMP = PP_HH
233 if (P_TMP) EXPC = add(EXPC,#-64)
237 // If diff > 63, pre-shift-right by 64...
238 if (P_TMP) CTMP2 = CTMP
240 RIGHTSHIFT = min(EXPC,TMP)
246 #define STICKIES r5:4
250 if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64
// Bits shifted out of the low word are captured as sticky bits.
251 STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
252 CTMP2 = lsr(CTMP2,RIGHTSHIFT)
253 LEFTSHIFT = sub(#64,RIGHTSHIFT)
// 128-bit arithmetic right shift of CTMP:CTMP2 by RIGHTSHIFT.
258 CTMP2 |= lsl(CTMP,LEFTSHIFT)
259 CTMP = asr(CTMP,RIGHTSHIFT)
262 P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift
263 if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
// 128-bit add of the aligned addend and the product, folding the
// sticky indication in through the carry chain.
271 PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky
274 PP_HH = add(CTMP,PP_HH,P_CARRY):carry
278 * PP_HH:PP_LL now holds the sum
279 * We may need to normalize left, up to ??? bits.
281 * I think that if we have massive cancellation, the range we normalize by
// clb counts leading sign bits; #-2 accounts for the guard positions.
285 LEFTSHIFT = add(clb(PP_HH),#-2)
286 if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
288 /* We had all sign bits, shift left by 62. */
// Massive-cancellation path: shift the whole 128-bit sum left 62 and
// compensate the exponent, then re-measure.
290 CTMP = extractu(PP_LL,#62,#2)
291 PP_LL = asl(PP_LL,#62)
292 EXPA = add(EXPA,#-62) // And adjust exponent of result
295 PP_HH = insert(CTMP,#62,#0) // Then shift 63
298 LEFTSHIFT = add(clb(PP_HH),#-2)
// Normalize: significand to the top of CTMP; low bits that fall off
// accumulate into STICKIES.
303 CTMP = asl(PP_HH,LEFTSHIFT)
304 STICKIES |= asl(PP_LL,LEFTSHIFT)
305 RIGHTSHIFT = sub(#64,LEFTSHIFT)
306 EXPA = sub(EXPA,LEFTSHIFT)
309 CTMP |= lsr(PP_LL,RIGHTSHIFT)
310 EXACT = cmp.gtu(ONE,STICKIES) // true iff no bits were lost
// Inexact: set the bottom (sticky) bit so the convert rounds correctly.
314 if (!EXACT) CTMPL = or(CTMPL,S_ONE)
315 // If EXPA is overflow/underflow, jump to ovf_unf
316 P_TMP = !cmp.gt(EXPA,TMP)
317 P_TMP = cmp.gt(EXPA,#1)
318 if (!P_TMP.new) jump:nt .Lfma_ovf_unf
321 // XXX: FIXME: should PP_HH for check of zero be CTMP?
322 P_TMP = cmp.gtu(ONE,CTMP) // is result true zero?
// Common exit: convert the fixed-point significand back to double and
// patch the computed exponent into the high word.
323 A = convert_d2df(CTMP)
324 EXPA = add(EXPA,#-BIAS-60)
328 AH += asl(EXPA,#HI_MANTBITS)
330 if (!P_TMP) dealloc_return // not zero, return
333 /* We had full cancellation. Return +/- zero (-0 when round-down) */
// Read the 2-bit rounding-mode field to choose the zero's sign.
339 TMP = extractu(TMP,#2,#SR_ROUND_OFF)
345 if (p0.new) AH = ##0x80000000
349 #undef RIGHTLEFTSHIFT
// .Lfma_ovf_unf path (label not visible in this fragment): re-check
// for a zero sum, then convert and range-check the exponent.
358 p0 = cmp.gtu(ONE,CTMP)
359 if (p0.new) jump:nt .Ladd_yields_zero
362 A = convert_d2df(CTMP)
363 EXPA = add(EXPA,#-BIAS-60)
369 AH += asl(EXPA,#HI_MANTBITS)
370 NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
373 NEW_EXPA = add(EXPA,NEW_EXPB)
396 p0 = cmp.gt(EXPA,##BIAS+BIAS)
397 if (p0.new) jump:nt .Lfma_ovf
401 if (p0.new) jump:nt .Lpossible_unf
// Denormal result: normalize ATMP, then shift right into denormal
// position, collecting lost bits as sticky.
404 // TMP has original EXPA.
405 // ATMP is corresponding value
406 // Normalize ATMP and shift right to correct location
407 EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize
408 EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize
409 p3 = cmp.gt(CTMPH,#-1) // remember result sign before taking abs
412 /* We know that the infinite range exponent should be EXPA */
413 /* CTMP is 2's complement, ATMP is abs(CTMP) */
415 EXPA = add(EXPA,EXPB) // how much to shift back right
416 ATMP = asl(ATMP,EXPB) // shift left
426 B = extractu(ATMP,EXPBA)
427 ATMP = asr(ATMP,EXPB)
// Fold shifted-out bits into the sticky bit before rounding.
431 if (!p0.new) ATMPL = or(ATMPL,S_ONE)
432 ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
436 p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
437 if (!p1.new) AH = or(AH,AL)
443 TMP = #-BIAS-(MANTBITS+FUDGE2)
446 A = convert_d2df(CTMP)
449 AH += asl(TMP,#HI_MANTBITS)
460 if (!p0.new) dealloc_return:t
464 p0 = bitsset(ATMPH,TMP)
469 if (p0) BH = or(BH,BL)
// Overflow: select max-normal (0x7fefffff_ffffffff) or infinity
// (0x7ff00000_00000000) depending on the rounding mode read below.
481 CTMP = combine(##0x7fefffff,#-1)
485 ATMP = combine(##0x7ff00000,#0)
486 BH = extractu(TMP,#2,#SR_ROUND_OFF)
499 p0 = dfcmp.eq(ATMP,ATMP)
500 if (p0.new) CTMP = ATMP
// Insert the 63-bit magnitude, preserving the sign bit already in A.
503 A = insert(CTMP,#63,#0)
// .Lfma_abnormal_ab (label not visible): A or B is zero, subnormal,
// inf, or NaN. Sort by magnitude so A holds the larger value, then
// classify in priority order: NaN, inf, zero, subnormal.
522 ATMP = extractu(A,#63,#0)
523 BTMP = extractu(B,#63,#0)
527 p3 = cmp.gtu(ATMP,BTMP)
528 if (!p3.new) A = B // sort values
532 p0 = dfclass(A,#0x0f) // A NaN?
533 if (!p0.new) jump:nt .Lnan
538 p1 = dfclass(A,#0x08) // A is infinity
539 p1 = dfclass(B,#0x0e) // B is nonzero
542 p0 = dfclass(A,#0x08) // a is inf
543 p0 = dfclass(B,#0x01) // b is zero
546 if (p1) jump .Lab_inf
547 p2 = dfclass(B,#0x01)
550 if (p0) jump .Linvalid // inf * 0: invalid operation
551 if (p2) jump .Lab_true_zero
554 // We are left with a normal or subnormal times a subnormal, A > B
555 // If A and B are both very small, we will go to a single sticky bit; replace
556 // A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results
557 // if A and B might multiply to something bigger, decrease A exp and increase B exp
561 if (p0.new) jump:nt .Lfma_ab_tiny
// Rescale: raise B's exponent by its normalization amount and lower
// A's by the same amount, keeping the product A*B unchanged.
564 TMP = add(clb(BTMP),#-EXPBITS)
570 B = insert(BTMP,#63,#0)
571 AH -= asl(TMP,#HI_MANTBITS)
// .Lfma_ab_tiny: both operands tiny — replace both magnitudes with a
// fixed tiny value (per the comment above, result is equivalent).
576 ATMP = combine(##0x00100000,#0)
578 A = insert(ATMP,#63,#0)
579 B = insert(ATMP,#63,#0)
// .Lab_inf: A*B is +/- inf. A NaN in C still takes priority; an
// opposite-signed infinite C makes the operation invalid.
586 p0 = dfclass(C,#0x10)
593 p1 = dfclass(C,#0x08)
594 if (p1.new) jump:nt .Lfma_inf_plus_inf
596 /* A*B is +/- inf, C is finite. Return A */
602 { // adding infinities of different signs is invalid
604 if (!p0.new) jump:nt .Linvalid
// .Lnan: NaN propagation — the df->sf conversions presumably serve to
// quiet signaling NaNs / raise invalid; confirm against the full file.
612 p0 = dfclass(B,#0x10)
613 p1 = dfclass(C,#0x10)
618 BH = convert_df2sf(B)
619 BL = convert_df2sf(C)
622 BH = convert_df2sf(A)
// .Linvalid: materialize the default NaN by converting a single-
// precision signaling NaN (also raises the invalid exception).
629 TMP = ##0x7f800001 // sp snan
632 A = convert_sf2df(TMP)
637 // B is zero, A is finite number
639 p0 = dfclass(C,#0x10)
640 if (p0.new) jump:nt .Lnan
644 p0 = dfcmp.eq(B,C) // is C also zero?
645 AH = lsr(AH,#31) // get sign
648 BH ^= asl(AH,#31) // form correctly signed zero in B
649 if (!p0) A = C // If C is not zero, return C
652 /* B has correctly signed zero, C is also zero */
655 p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0
656 if (p0.new) jumpr:t r31
// Signs differ (+0 + -0): result sign depends on the rounding mode.
663 TMP = extractu(TMP,#2,#SR_ROUND_OFF)
668 if (p0.new) AH = ##0x80000000
677 /* We know that AB is normal * normal */
678 /* C is not normal: zero, subnormal, inf, or NaN. */
680 p0 = dfclass(C,#0x10) // is C NaN?
681 if (p0.new) jump:nt .Lnan
682 if (p0.new) A = C // move NaN to A
686 p0 = dfclass(C,#0x08) // is C inf?
687 if (p0.new) A = C // return C
688 if (p0.new) jumpr:nt r31
691 // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
693 p0 = dfclass(C,#0x01) // is C zero?
694 if (p0.new) jump:nt __hexagon_muldf3
697 // Left with: subnormal
698 // Adjust C and jump back to restart
700 allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame
// Patch a usable exponent into subnormal C, then rejoin the main path.
702 CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
703 jump .Lfma_abnormal_c_restart