1 //===----------------------Hexagon builtin routine ------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
10 #define END(TAG) .size TAG,.-TAG
12 // Double Precision Multiply
64 #define HI_MANTBITS 20
75 #define SR_ROUND_OFF 22
78 // First, classify for normal values, and abort if abnormal
80 // Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
82 // Since we know that the 2 MSBs of the H registers are zero, we should never carry
83 // the partial products that involve the H registers
85 // Try to buy X slots, at the expense of latency if needed
87 // We will have PP_HH with the upper bits of the product, PP_LL with the lower
88 // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
89 // PP_HH can have a minimum of 0x0100_0000_0000_0000
91 // 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
93 // We need to align CTMP.
94 // If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
95 // If CTMP << PP align CTMP and add 128 bits. Then compute sticky
96 // If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
98 // Convert partial product and CTMP to 2's complement prior to addition
100 // After we add, we need to normalize into upper 64 bits, then compute sticky.
103 .global __hexagon_fmadf4
104 .type __hexagon_fmadf4,@function
105 .global __hexagon_fmadf5
106 .type __hexagon_fmadf5,@function
// double __hexagon_fmadf4(double a, double b, double c): IEEE-754 fused
// multiply-add, a*b + c with a single rounding.  fmadf5 is exported as an
// additional entry name for the same code.
//
// NOTE(review): this extraction appears to have lost the Hexagon packet
// braces `{ ... }` and the register-alias #defines (A, B, C, ATMP, BTMP,
// CTMP, P_TMP, TMP, EXPA/B/C, ...) that the instructions below rely on --
// TODO confirm against the upstream compiler-rt dfma.S before editing.
// Instruction pairs that read and write each other's registers (e.g. the
// SWAP pairs below) are only correct because they execute in the same
// packet, where all reads see the pre-packet register values.
//
// Classify A and B as "normal" (dfclass #2); both writes to P_TMP
// presumably AND together within one packet, giving
// P_TMP = (A normal) && (B normal) -- TODO confirm packetization.
115 P_TMP = dfclass(A,#2)
116 P_TMP = dfclass(B,#2)
// Unpack the mantissas into ATMP/BTMP; the implicit leading mantissa bit
// (0x10000000 in the high word) is OR'd in below.  allocframe reserves
// stack for spills used by the long dependency chains.
121 ATMP = insert(A,#MANTBITS,#EXPBITS-3)
122 BTMP = insert(B,#MANTBITS,#EXPBITS-3)
123 PP_ODD_H = ##0x10000000
124 allocframe(#STACKSPACE)
// Start the 64x64 partial-product accumulation (low x low first) and
// bail out to the slow path if either A or B was not normal.
127 PP_LL = mpyu(ATMPL,BTMPL)
128 if (!P_TMP) jump .Lfma_abnormal_ab
129 ATMPH = or(ATMPH,PP_ODD_H)
130 BTMPH = or(BTMPH,PP_ODD_H)
// Classify C; abnormal C (zero/subnormal/inf/NaN) is handled at
// .Lfma_abnormal_c near the end of the file.
133 P_TMP = dfclass(C,#2)
134 if (!P_TMP.new) jump:nt .Lfma_abnormal_c
135 CTMP = combine(PP_ODD_H,#0)
136 PP_ODD = combine(#0,PP_LL_H)
// Re-entry point once an abnormal (subnormal) C has been fixed up by the
// code at the bottom of this file.
138 .Lfma_abnormal_c_restart:
// Accumulate the two cross partial products into PP_ODD and unpack C's
// mantissa the same way as A and B above.
140 PP_ODD += mpyu(BTMPL,ATMPH)
141 CTMP = insert(C,#MANTBITS,#EXPBITS-3)
146 PP_ODD += mpyu(ATMPL,BTMPH)
// P_TMP = (CH >= 0), i.e. C is non-negative (sign test on the high word).
148 P_TMP = cmp.gt(CH,#-1)
// Extract the biased exponents of A and B from their high words.
152 EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
153 EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
154 PP_HH = combine(#0,PP_ODD_H)
155 if (!P_TMP) CTMP = EXPBA
// Finish the 128-bit product: PP_HH gets high x high plus the carried-up
// odd terms; PP_LL recombines the low halves.
158 PP_HH += mpyu(ATMPH,BTMPH)
159 PP_LL = combine(PP_ODD_L,PP_LL_L)
// Scratch aliases for the alignment-shift amounts (register pair r13:12).
169 #define RIGHTLEFTSHIFT r13:12
170 #define RIGHTSHIFT r13
171 #define LEFTSHIFT r12
// Product exponent = EXPA + EXPB (bias removed below); extract C's exponent.
173 EXPA = add(EXPA,EXPB)
178 EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
180 // PP_HH:PP_LL now has product
182 // EXPA,B,C are extracted
183 // We need to negate PP
184 // Since we will be adding with carry later, if we need to negate,
185 // just invert all bits now, which we can do conditionally and in parallel
186 #define PP_HH_TMP r15:14
187 #define PP_LL_TMP r7:6
// Remove the double bias and apply the fixed-point ADJUST offset.
// PROD_NEG = (TMP < 0); TMP presumably holds the combined product sign
// word at this point -- TODO confirm which register TMP aliases here.
189 EXPA = add(EXPA,#-BIAS+(ADJUST))
190 PROD_NEG = !cmp.gt(TMP,#-1)
// Conditionally negate the 128-bit product via sub-with-carry, and decide
// whether C's exponent dominates the product's (SWAP).  The combine at
// line 198 captures the exponents in swapped order within the packet.
195 PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
196 P_TMP = !cmp.gt(TMP,#-1)
197 SWAP = cmp.gt(EXPC,EXPA) // If C >> PP
198 if (SWAP.new) EXPCA = combine(EXPA,EXPC)
201 PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
202 if (P_TMP) PP_LL = PP_LL_TMP
// EXPC becomes the alignment distance between the two operands.
208 EXPC = sub(EXPA,EXPC)
211 if (P_TMP) PP_HH = PP_HH_TMP
212 P_TMP = cmp.gt(EXPC,#63)
// Same-packet swap idiom: both reads see the old values, so these pairs
// exchange PP_LL<->CTMP2 and PP_HH<->CTMP when SWAP is set.
213 if (SWAP) PP_LL = CTMP2
214 if (SWAP) CTMP2 = PP_LL
224 if (SWAP) PP_HH = CTMP // Swap C and PP
225 if (SWAP) CTMP = PP_HH
226 if (P_TMP) EXPC = add(EXPC,#-64)
230 // If diff > 63, pre-shift-right by 64...
231 if (P_TMP) CTMP2 = CTMP
// Cap the funnel-shift amount; TMP here presumably holds the maximum
// useful shift -- TODO confirm from the elided packet.
233 RIGHTSHIFT = min(EXPC,TMP)
239 #define STICKIES r5:4
// Align the smaller operand right by RIGHTSHIFT, collecting the bits
// shifted out into STICKIES so rounding stays correct.
243 if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64
244 STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
245 CTMP2 = lsr(CTMP2,RIGHTSHIFT)
246 LEFTSHIFT = sub(#64,RIGHTSHIFT)
251 CTMP2 |= lsl(CTMP,LEFTSHIFT)
252 CTMP = asr(CTMP,RIGHTSHIFT)
// Fold the sticky bits into the low bit via the carry chain of the
// 128-bit add: carry-in acts as the OR of the shifted-out bits.
255 P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift
256 if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
264 PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky
267 PP_HH = add(CTMP,PP_HH,P_CARRY):carry
270 // PP_HH:PP_LL now holds the sum
271 // We may need to normalize left, up to ??? bits.
273 // I think that if we have massive cancellation, the range we normalize by
// (comment truncated in this extraction -- TODO restore from upstream)
// clb() counts leading sign bits; LEFTSHIFT == TMP means PP_HH is all
// sign bits (massive cancellation) and we fall through to the 62-bit
// pre-shift below; otherwise take the numeric local label `1:` path
// (label not visible in this extraction).
276 LEFTSHIFT = add(clb(PP_HH),#-2)
277 if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
279 // We had all sign bits, shift left by 62.
281 CTMP = extractu(PP_LL,#62,#2)
282 PP_LL = asl(PP_LL,#62)
283 EXPA = add(EXPA,#-62) // And adjust exponent of result
286 PP_HH = insert(CTMP,#62,#0) // Then shift 63
// Recompute the normalization distance after the coarse shift.
289 LEFTSHIFT = add(clb(PP_HH),#-2)
// Final normalization: funnel-shift PP_HH:PP_LL left into CTMP, keep the
// spilled-out low bits as stickies, and adjust the exponent to match.
294 CTMP = asl(PP_HH,LEFTSHIFT)
295 STICKIES |= asl(PP_LL,LEFTSHIFT)
296 RIGHTSHIFT = sub(#64,LEFTSHIFT)
297 EXPA = sub(EXPA,LEFTSHIFT)
300 CTMP |= lsr(PP_LL,RIGHTSHIFT)
// EXACT = (STICKIES == 0); if inexact, OR a sticky LSB into the result
// so convert_d2df rounds correctly.
301 EXACT = cmp.gtu(ONE,STICKIES)
305 if (!EXACT) CTMPL = or(CTMPL,S_ONE)
306 // If EXPA is overflow/underflow, jump to ovf_unf
307 P_TMP = !cmp.gt(EXPA,TMP)
308 P_TMP = cmp.gt(EXPA,#1)
309 if (!P_TMP.new) jump:nt .Lfma_ovf_unf
// Common exit: convert the fixed-point result to double and splice the
// computed exponent back into the high word.
312 // XXX: FIXME: should PP_HH for check of zero be CTMP?
313 P_TMP = cmp.gtu(ONE,CTMP) // is result true zero?
314 A = convert_d2df(CTMP)
315 EXPA = add(EXPA,#-BIAS-60)
319 AH += asl(EXPA,#HI_MANTBITS)
321 if (!P_TMP) dealloc_return // not zero, return
324 // We had full cancellation. Return +/- zero (-0 when round-down)
// TMP holds USR here (per SR_ROUND_OFF extraction); round-toward-minus
// mode yields -0.0 -- surrounding packet lines elided in this extraction.
330 TMP = extractu(TMP,#2,#SR_ROUND_OFF)
336 if (p0.new) AH = ##0x80000000
340 #undef RIGHTLEFTSHIFT
// Overflow/underflow fixup path (.Lfma_ovf_unf region; label lines not
// visible in this extraction).  A zero result here is an exact add that
// cancelled, handled at .Ladd_yields_zero.
349 p0 = cmp.gtu(ONE,CTMP)
350 if (p0.new) jump:nt .Ladd_yields_zero
353 A = convert_d2df(CTMP)
354 EXPA = add(EXPA,#-BIAS-60)
360 AH += asl(EXPA,#HI_MANTBITS)
// Re-derive the exponent convert_d2df actually produced and compare the
// infinite-precision exponent against the representable range.
361 NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
364 NEW_EXPA = add(EXPA,NEW_EXPB)
387 p0 = cmp.gt(EXPA,##BIAS+BIAS)
388 if (p0.new) jump:nt .Lfma_ovf
392 if (p0.new) jump:nt .Lpossible_unf
// Denormal-result path: renormalize ATMP then shift right into subnormal
// position, keeping sticky bits for correct rounding.
395 // TMP has original EXPA.
396 // ATMP is corresponding value
397 // Normalize ATMP and shift right to correct location
398 EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize
399 EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize
400 p3 = cmp.gt(CTMPH,#-1)
403 // We know that the infinite range exponent should be EXPA
404 // CTMP is 2's complement, ATMP is abs(CTMP)
406 EXPA = add(EXPA,EXPB) // how much to shift back right
407 ATMP = asl(ATMP,EXPB) // shift left
// B captures the bits shifted out (sticky source) before the right shift.
417 B = extractu(ATMP,EXPBA)
418 ATMP = asr(ATMP,EXPB)
422 if (!p0.new) ATMPL = or(ATMPL,S_ONE)
// FUDGE2 positions a guard bit so the scaled convert below rounds once.
423 ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
427 p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
428 if (!p1.new) AH = or(AH,AL)
434 TMP = #-BIAS-(MANTBITS+FUDGE2)
437 A = convert_d2df(CTMP)
440 AH += asl(TMP,#HI_MANTBITS)
451 if (!p0.new) dealloc_return:t
// Possible-underflow check: set the underflow/inexact flags when the
// discarded bits are non-zero (bitsset against the sticky mask in TMP).
455 p0 = bitsset(ATMPH,TMP)
460 if (p0) BH = or(BH,BL)
// Overflow path (.Lfma_ovf region): prepare max-finite and infinity
// patterns; rounding mode (from USR via SR_ROUND_OFF) selects between them.
472 CTMP = combine(##0x7fefffff,#-1)
476 ATMP = combine(##0x7ff00000,#0)
477 BH = extractu(TMP,#2,#SR_ROUND_OFF)
// dfcmp.eq(x,x) is the idiom to raise/check FP state while producing a
// predicate; then splice sign + chosen magnitude into the return value.
490 p0 = dfcmp.eq(ATMP,ATMP)
491 if (p0.new) CTMP = ATMP
494 A = insert(CTMP,#63,#0)
// ---- .Lfma_abnormal_ab region (label not visible in this extraction):
// A or B is zero/subnormal/inf/NaN.  Sort so |A| >= |B| (magnitude
// compare on the sign-stripped values), then dispatch by class.
513 ATMP = extractu(A,#63,#0)
514 BTMP = extractu(B,#63,#0)
518 p3 = cmp.gtu(ATMP,BTMP)
519 if (!p3.new) A = B // sort values
523 p0 = dfclass(A,#0x0f) // A NaN?
524 if (!p0.new) jump:nt .Lnan
// Predicate-AND packets: p1 = (A inf && B nonzero), p0 = (A inf && B zero).
529 p1 = dfclass(A,#0x08) // A is infinity
530 p1 = dfclass(B,#0x0e) // B is nonzero
533 p0 = dfclass(A,#0x08) // a is inf
534 p0 = dfclass(B,#0x01) // b is zero
537 if (p1) jump .Lab_inf
538 p2 = dfclass(B,#0x01)
541 if (p0) jump .Linvalid
542 if (p2) jump .Lab_true_zero
545 // We are left with a normal or subnormal times a subnormal, A > B
546 // If A and B are both very small, we will go to a single sticky bit; replace
547 // A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results
548 // if A and B might multiply to something bigger, decrease A exp and increase B exp
552 if (p0.new) jump:nt .Lfma_ab_tiny
// Rescale: normalize subnormal B up by TMP and compensate by lowering
// A's exponent the same amount, then restart the normal path.
555 TMP = add(clb(BTMP),#-EXPBITS)
561 B = insert(BTMP,#63,#0)
562 AH -= asl(TMP,#HI_MANTBITS)
// .Lfma_ab_tiny: both magnitudes collapse to a fixed tiny value whose
// product only contributes a sticky bit.
567 ATMP = combine(##0x00100000,#0)
569 A = insert(ATMP,#63,#0)
570 B = insert(ATMP,#63,#0)
// A*B is infinite: NaN C propagates; inf C needs the sign check below.
577 p0 = dfclass(C,#0x10)
584 p1 = dfclass(C,#0x08)
585 if (p1.new) jump:nt .Lfma_inf_plus_inf
587 // A*B is +/- inf, C is finite. Return A
593 { // adding infinities of different signs is invalid
595 if (!p0.new) jump:nt .Linvalid
// .Lnan region: propagate/signal NaNs.  The single-precision converts
// are presumably used to raise the invalid exception on signaling NaN
// inputs -- TODO confirm against upstream comments.
603 p0 = dfclass(B,#0x10)
604 p1 = dfclass(C,#0x10)
609 BH = convert_df2sf(B)
610 BL = convert_df2sf(C)
613 BH = convert_df2sf(A)
// .Linvalid: construct a signaling single NaN and convert to double,
// which raises invalid and yields the default quiet NaN.
620 TMP = ##0x7f800001 // sp snan
623 A = convert_sf2df(TMP)
628 // B is zero, A is finite number
630 p0 = dfclass(C,#0x10)
631 if (p0.new) jump:nt .Lnan
// 0*finite + C: result is C unless C is also zero, in which case the
// IEEE signed-zero rules below apply.
635 p0 = dfcmp.eq(B,C) // is C also zero?
636 AH = lsr(AH,#31) // get sign
639 BH ^= asl(AH,#31) // form correctly signed zero in B
640 if (!p0) A = C // If C is not zero, return C
643 // B has correctly signed zero, C is also zero
// Same-signed zeros keep their sign; opposite-signed zeros yield +0
// except in round-toward-minus mode, which yields -0 (USR check below).
646 p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0
647 if (p0.new) jumpr:t r31
654 TMP = extractu(TMP,#2,#SR_ROUND_OFF)
659 if (p0.new) AH = ##0x80000000
668 // We know that AB is normal * normal
669 // C is not normal: zero, subnormal, inf, or NaN.
// .Lfma_abnormal_c: NaN C -> NaN result; inf C -> return C directly.
671 p0 = dfclass(C,#0x10) // is C NaN?
672 if (p0.new) jump:nt .Lnan
673 if (p0.new) A = C // move NaN to A
677 p0 = dfclass(C,#0x08) // is C inf?
678 if (p0.new) A = C // return C
679 if (p0.new) jumpr:nt r31
682 // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
684 p0 = dfclass(C,#0x01) // is C zero?
685 if (p0.new) jump:nt __hexagon_muldf3
688 // Left with: subnormal
689 // Adjust C and jump back to restart
691 allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame
693 CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
694 jump .Lfma_abnormal_c_restart