//===----------------------Hexagon builtin routine ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

/* Double Precision Fused Multiply-Add */

#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4

#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10

#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6

#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16

#define EXPA r18
#define EXPB r19
#define EXPBA r19:18

#define TMP r28

#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1

#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32

#define ADJUST 4

#define FUDGE 7
#define FUDGE2 3

#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif

/*
 * First, classify for normal values, and abort if abnormal.
 *
 * Next, unpack the mantissas into 0x1000_0000_0000_0000 + mant<<8.
 *
 * Since we know that the 2 MSBs of the H registers are zero, we should never
 * carry out of the partial products that involve the H registers.
 *
 * Try to buy X slots, at the expense of latency if needed.
 *
 * We will have PP_HH with the upper bits of the product, PP_LL with the lower.
 * PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts.
 * PP_HH can have a minimum of 0x0100_0000_0000_0000.
 *
 * 0x0100_0000_0000_0000 has an exponent of EXPA+EXPB-BIAS.
 *
 * We need to align CTMP.
 * If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow the normal add.
 * If CTMP << PP, align CTMP and add 128 bits.  Then compute sticky.
 * If CTMP ~= PP, align CTMP and add 128 bits.  May have massive cancellation.
 *
 * Convert the partial product and CTMP to 2's complement prior to addition.
 *
 * After we add, we need to normalize into the upper 64 bits, then compute the sticky bit.
 */
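
// The packing and product ranges quoted above can be sanity-checked with a
// plain C model.  This is only an illustrative sketch (the helper names and
// the unsigned __int128 type are assumptions, not part of this file): each
// packed mantissa is 2^60 + mant<<8, i.e. it lies in [2^60, 2^61), so the
// upper 64 bits of the 128-bit product lie in
// [0x0100_0000_0000_0000, 0x0400_0000_0000_0000).
//
//      #include <stdint.h>
//
//      typedef unsigned __int128 u128;
//
//      // Pack a 52-bit stored mantissa the way the insert/or packets below do:
//      // implicit bit at bit 60, fraction shifted up by 8.
//      static uint64_t pack_mant(uint64_t mant52)
//      {
//              return (1ULL << 60) | (mant52 << 8);
//      }
//
//      // Upper 64 bits of the full 128-bit product (PP_HH in the code below).
//      static uint64_t product_high(uint64_t mant_a, uint64_t mant_b)
//      {
//              u128 p = (u128)pack_mant(mant_a) * (u128)pack_mant(mant_b);
//              return (uint64_t)(p >> 64);
//      }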
    .text
    .global __hexagon_fmadf4
    .type __hexagon_fmadf4,@function
    .global __hexagon_fmadf5
    .type __hexagon_fmadf5,@function
    .global fma
    .type fma,@function
    Q6_ALIAS(fmadf5)
    .p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
fma:
    {
        P_TMP = dfclass(A,#2)
        P_TMP = dfclass(B,#2)
        ATMP = #0
        BTMP = #0
    }
    {
        ATMP = insert(A,#MANTBITS,#EXPBITS-3)
        BTMP = insert(B,#MANTBITS,#EXPBITS-3)
        PP_ODD_H = ##0x10000000
        allocframe(#STACKSPACE)
    }
    {
        PP_LL = mpyu(ATMPL,BTMPL)
        if (!P_TMP) jump .Lfma_abnormal_ab
        ATMPH = or(ATMPH,PP_ODD_H)
        BTMPH = or(BTMPH,PP_ODD_H)
    }
    {
        P_TMP = dfclass(C,#2)
        if (!P_TMP.new) jump:nt .Lfma_abnormal_c
        CTMP = combine(PP_ODD_H,#0)
        PP_ODD = combine(#0,PP_LL_H)
    }
.Lfma_abnormal_c_restart:
    {
        PP_ODD += mpyu(BTMPL,ATMPH)
        CTMP = insert(C,#MANTBITS,#EXPBITS-3)
        memd(r29+#0) = PP_HH
        memd(r29+#8) = EXPBA
    }
    {
        PP_ODD += mpyu(ATMPL,BTMPH)
        EXPBA = neg(CTMP)
        P_TMP = cmp.gt(CH,#-1)
        TMP = xor(AH,BH)
    }
    {
        EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
        EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
        PP_HH = combine(#0,PP_ODD_H)
        if (!P_TMP) CTMP = EXPBA
    }
    {
        PP_HH += mpyu(ATMPH,BTMPH)
        PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12
        EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
        EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
    }

    /* PP_HH:PP_LL now has product */
    /* CTMP is negated */
    /* EXPA,B,C are extracted */

    /*
     * We need to negate PP
     * Since we will be adding with carry later, if we need to negate,
     * just invert all bits now, which we can do conditionally and in parallel
     */
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
    {
        EXPA = add(EXPA,#-BIAS+(ADJUST))
        PROD_NEG = !cmp.gt(TMP,#-1)
        PP_LL_TMP = #0
        PP_HH_TMP = #0
    }
    {
        PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
        P_TMP = !cmp.gt(TMP,#-1)
        SWAP = cmp.gt(EXPC,EXPA)  // If C >> PP
        if (SWAP.new) EXPCA = combine(EXPA,EXPC)
    }
    {
        PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
        if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
        CTMP2 = #0
        EXPC = sub(EXPA,EXPC)
    }
    {
        if (P_TMP) PP_HH = PP_HH_TMP
        P_TMP = cmp.gt(EXPC,#63)
        if (SWAP) PP_LL = CTMP2
        if (SWAP) CTMP2 = PP_LL
    }
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
    {
        if (SWAP) PP_HH = CTMP  // Swap C and PP
        if (SWAP) CTMP = PP_HH
        if (P_TMP) EXPC = add(EXPC,#-64)
        TMP = #63
    }
    {
        // If diff > 63, pre-shift-right by 64...
        if (P_TMP) CTMP2 = CTMP
        TMP = asr(CTMPH,#31)
        RIGHTSHIFT = min(EXPC,TMP)
        LEFTSHIFT = #0
    }
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
    {
        if (P_TMP) CTMP = combine(TMP,TMP)  // sign extension of pre-shift-right-64
        STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
        CTMP2 = lsr(CTMP2,RIGHTSHIFT)
        LEFTSHIFT = sub(#64,RIGHTSHIFT)
    }
    {
        ZERO = #0
        TMP = #-2
        CTMP2 |= lsl(CTMP,LEFTSHIFT)
        CTMP = asr(CTMP,RIGHTSHIFT)
    }
    {
        P_CARRY = cmp.gtu(STICKIES,ZERO)  // If we have sticky bits from C shift
        if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP)  // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
        ONE = #1
        STICKIES = #0
    }
    {
        PP_LL = add(CTMP2,PP_LL,P_CARRY):carry  // use the carry to add the sticky
    }
    {
        PP_HH = add(CTMP,PP_HH,P_CARRY):carry
        TMP = #62
    }
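
// The two :carry packets above fold the sticky bit from the C alignment shift
// into the 128-bit sum itself: if any bits of C were shifted out, the low bit
// of the shifted addend is cleared so that adding 1 has exactly the effect of
// OR-ing 1, and the sticky is then injected as the initial carry of the
// add-with-carry chain.  A minimal C model of that trick (the names and the
// unsigned __int128 type are illustrative, not part of this file):
//
//      #include <stdint.h>
//
//      typedef unsigned __int128 u128;
//
//      static u128 add_with_sticky(u128 prod, u128 c_aligned, uint64_t lost_bits)
//      {
//              unsigned sticky = (lost_bits != 0);
//              if (sticky)
//                      c_aligned &= ~(u128)1;  // clear the LSB so +1 == |1
//              return prod + c_aligned + sticky;
//      }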
    /*
     * PP_HH:PP_LL now holds the sum
     * We may need to normalize left, up to ??? bits.
     *
     * I think that if we have massive cancellation, the range we normalize by
     * is still limited
     */
    {
        LEFTSHIFT = add(clb(PP_HH),#-2)
        if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f  // all sign bits?
    }
    /* We had all sign bits, shift left by 62. */
    {
        CTMP = extractu(PP_LL,#62,#2)
        PP_LL = asl(PP_LL,#62)
        EXPA = add(EXPA,#-62)  // And adjust exponent of result
    }
    {
        PP_HH = insert(CTMP,#62,#0)  // Then shift 63
    }
    {
        LEFTSHIFT = add(clb(PP_HH),#-2)
    }
    .falign
1:
    {
        CTMP = asl(PP_HH,LEFTSHIFT)
        STICKIES |= asl(PP_LL,LEFTSHIFT)
        RIGHTSHIFT = sub(#64,LEFTSHIFT)
        EXPA = sub(EXPA,LEFTSHIFT)
    }
    {
        CTMP |= lsr(PP_LL,RIGHTSHIFT)
        EXACT = cmp.gtu(ONE,STICKIES)
        TMP = #BIAS+BIAS-2
    }
    {
        if (!EXACT) CTMPL = or(CTMPL,S_ONE)
        // If EXPA is overflow/underflow, jump to ovf_unf
        P_TMP = !cmp.gt(EXPA,TMP)
        P_TMP = cmp.gt(EXPA,#1)
        if (!P_TMP.new) jump:nt .Lfma_ovf_unf
    }
    {
        // XXX: FIXME: should PP_HH for check of zero be CTMP?
        P_TMP = cmp.gtu(ONE,CTMP)  // is result true zero?
        A = convert_d2df(CTMP)
        EXPA = add(EXPA,#-BIAS-60)
        PP_HH = memd(r29+#0)
    }
    {
        AH += asl(EXPA,#HI_MANTBITS)
        EXPCA = memd(r29+#8)
        if (!P_TMP) dealloc_return  // not zero, return
    }
.Ladd_yields_zero:
    /* We had full cancellation.  Return +/- zero (-0 when round-down) */
    {
        TMP = USR
        A = #0
    }
    {
        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
        PP_HH = memd(r29+#0)
        EXPCA = memd(r29+#8)
    }
    {
        p0 = cmp.eq(TMP,#2)
        if (p0.new) AH = ##0x80000000
        dealloc_return
    }

#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L

.Lfma_ovf_unf:
    {
        p0 = cmp.gtu(ONE,CTMP)
        if (p0.new) jump:nt .Ladd_yields_zero
    }
    {
        A = convert_d2df(CTMP)
        EXPA = add(EXPA,#-BIAS-60)
        TMP = EXPA
    }
#define NEW_EXPB r7
#define NEW_EXPA r6
    {
        AH += asl(EXPA,#HI_MANTBITS)
        NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
    }
    {
        NEW_EXPA = add(EXPA,NEW_EXPB)
        PP_HH = memd(r29+#0)
        EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
        ATMP = abs(CTMP)
    }
    {
        p0 = cmp.gt(EXPA,##BIAS+BIAS)
        if (p0.new) jump:nt .Lfma_ovf
    }
    {
        p0 = cmp.gt(EXPA,#0)
        if (p0.new) jump:nt .Lpossible_unf
    }
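
// The underflow path below denormalizes the result: it shifts the magnitude
// right to its subnormal position and, if any nonzero bits were shifted out,
// ORs a single sticky bit into the low bit (the extractu/or pair) so that the
// final conversion still rounds correctly.  A minimal C model of that
// shift-right-with-sticky step, with an illustrative name and a 64-bit
// simplification that is not part of this file:
//
//      #include <stdint.h>
//
//      static uint64_t shift_right_sticky(uint64_t mant, unsigned rshift)
//      {
//              if (rshift == 0)
//                      return mant;
//              if (rshift > 63)
//                      return mant != 0;  // everything becomes sticky
//              uint64_t lost = mant & ((1ULL << rshift) - 1);
//              return (mant >> rshift) | (lost != 0);
//      }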
    {
        // TMP has original EXPA.
        // ATMP is corresponding value
        // Normalize ATMP and shift right to correct location
        EXPB = add(clb(ATMP),#-2)  // Amount to left shift to normalize
        EXPA = sub(#1+5,TMP)  // Amount to right shift to denormalize
        p3 = cmp.gt(CTMPH,#-1)
    }
    /* Underflow */
    /* We know that the infinite range exponent should be EXPA */
    /* CTMP is 2's complement, ATMP is abs(CTMP) */
    {
        EXPA = add(EXPA,EXPB)  // how much to shift back right
        ATMP = asl(ATMP,EXPB)  // shift left
        AH = USR
        TMP = #63
    }
    {
        EXPB = min(EXPA,TMP)
        EXPA = #0
        AL = #0x0030
    }
    {
        B = extractu(ATMP,EXPBA)
        ATMP = asr(ATMP,EXPB)
    }
    {
        p0 = cmp.gtu(ONE,B)
        if (!p0.new) ATMPL = or(ATMPL,S_ONE)
        ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
    }
    {
        CTMP = neg(ATMP)
        p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
    }

    // We are left with a normal or subnormal times a subnormal, A > B
    // If A and B are both very small, we will go to a single sticky bit; replace
    // A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results
    // if A and B might multiply to something bigger, decrease A exp and increase B exp
    // and start over
    {
        p0 = bitsclr(AH,TMP)
        if (p0.new) jump:nt .Lfma_ab_tiny
    }
    {
        TMP = add(clb(BTMP),#-EXPBITS)
    }
    {
        BTMP = asl(BTMP,TMP)
    }
    {
        B = insert(BTMP,#63,#0)
        AH -= asl(TMP,#HI_MANTBITS)
    }
    jump fma

.Lfma_ab_tiny:
    ATMP = combine(##0x00100000,#0)
    {
        A = insert(ATMP,#63,#0)
        B = insert(ATMP,#63,#0)
    }
    jump fma

.Lab_inf:
    {
        B = lsr(B,#63)
        p0 = dfclass(C,#0x10)
    }
    {
        A ^= asl(B,#63)
        if (p0) jump .Lnan
    }
    {
        p1 = dfclass(C,#0x08)
        if (p1.new) jump:nt .Lfma_inf_plus_inf
    }
    /* A*B is +/- inf, C is finite.  Return A */
    {
        jumpr r31
    }
    .falign
.Lfma_inf_plus_inf:
    {  // adding infinities of different signs is invalid
        p0 = dfcmp.eq(A,C)
        if (!p0.new) jump:nt .Linvalid
    }
    {
        jumpr r31
    }

.Lnan:
    {
        p0 = dfclass(B,#0x10)
        p1 = dfclass(C,#0x10)
        if (!p0.new) B = A
        if (!p1.new) C = A
    }
    {  // find sNaNs
        BH = convert_df2sf(B)
        BL = convert_df2sf(C)
    }
    {
        BH = convert_df2sf(A)
        A = #-1
        jumpr r31
    }

.Linvalid:
    {
        TMP = ##0x7f800001  // sp snan
    }
    {
        A = convert_sf2df(TMP)
        jumpr r31
    }

.Lab_true_zero:
    // B is zero, A is a finite number
    {
        p0 = dfclass(C,#0x10)
        if (p0.new) jump:nt .Lnan
        if (p0.new) A = C
    }
    {
        p0 = dfcmp.eq(B,C)  // is C also zero?
        AH = lsr(AH,#31)  // get sign
    }
    {
        BH ^= asl(AH,#31)  // form correctly signed zero in B
        if (!p0) A = C  // If C is not zero, return C
        if (!p0) jumpr r31
    }
    /* B has correctly signed zero, C is also zero */
.Lzero_plus_zero:
    {
        p0 = cmp.eq(B,C)  // yes, scalar equals.  +0++0 or -0+-0
        if (p0.new) jumpr:t r31
        A = B
    }
    {
        TMP = USR
    }
    {
        TMP = extractu(TMP,#2,#SR_ROUND_OFF)
        A = #0
    }
    {
        p0 = cmp.eq(TMP,#2)
        if (p0.new) AH = ##0x80000000
        jumpr r31
    }

#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10

    .falign
.Lfma_abnormal_c:
    /* We know that AB is normal * normal */
    /* C is not normal: zero, subnormal, inf, or NaN. */
    {
        p0 = dfclass(C,#0x10)  // is C NaN?
        if (p0.new) jump:nt .Lnan
        if (p0.new) A = C  // move NaN to A
        deallocframe
    }
    {
        p0 = dfclass(C,#0x08)  // is C inf?
        if (p0.new) A = C  // return C
        if (p0.new) jumpr:nt r31
    }
    // zero or subnormal
    // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
    {
        p0 = dfclass(C,#0x01)  // is C zero?
        if (p0.new) jump:nt __hexagon_muldf3
        TMP = #1
    }
    // Left with: subnormal
    // Adjust C and jump back to restart
    {
        allocframe(#STACKSPACE)  // oops, deallocated above, re-allocate frame
        CTMP = #0
        CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
        jump .Lfma_abnormal_c_restart
    }
END(fma)
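
// The special-case paths above can be exercised from C simply by calling the
// standard fma() with operands that hit them.  This harness is only an
// illustrative sketch for a Hexagon build that links against this runtime; it
// is not part of the library itself:
//
//      #include <math.h>
//      #include <stdio.h>
//
//      int main(void)
//      {
//              // c is subnormal, so this takes the .Lfma_abnormal_c path
//              double r1 = fma(1.5, 2.0, 0x1p-1070);
//              // a*b is +inf and c is -inf: invalid operation, the result is NaN
//              double r2 = fma(INFINITY, 1.0, -INFINITY);
//              printf("%a %a\n", r1, r2);
//              return 0;
//      }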