//===----------------------Hexagon builtin routine ------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // Double Precision square root #define EXP r28 #define A r1:0 #define AH r1 #define AL r0 #define SFSH r3:2 #define SF_S r3 #define SF_H r2 #define SFHALF_SONE r5:4 #define S_ONE r4 #define SFHALF r5 #define SF_D r6 #define SF_E r7 #define RECIPEST r8 #define SFRAD r9 #define FRACRAD r11:10 #define FRACRADH r11 #define FRACRADL r10 #define ROOT r13:12 #define ROOTHI r13 #define ROOTLO r12 #define PROD r15:14 #define PRODHI r15 #define PRODLO r14 #define P_TMP p0 #define P_EXP1 p1 #define NORMAL p2 #define SF_EXPBITS 8 #define SF_MANTBITS 23 #define DF_EXPBITS 11 #define DF_MANTBITS 52 #define DF_BIAS 0x3ff #define DFCLASS_ZERO 0x01 #define DFCLASS_NORMAL 0x02 #define DFCLASS_DENORMAL 0x02 #define DFCLASS_INFINITE 0x08 #define DFCLASS_NAN 0x10 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function #define END(TAG) .size TAG,.-TAG .text .global __hexagon_sqrtdf2 .type __hexagon_sqrtdf2,@function .global __hexagon_sqrt .type __hexagon_sqrt,@function Q6_ALIAS(sqrtdf2) Q6_ALIAS(sqrt) FAST_ALIAS(sqrtdf2) FAST_ALIAS(sqrt) FAST2_ALIAS(sqrtdf2) FAST2_ALIAS(sqrt) .type sqrt,@function .p2align 5 __hexagon_sqrtdf2: __hexagon_sqrt: { PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) SFHALF_SONE = combine(##0x3f000004,#1) } { NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal NORMAL = cmp.gt(AH,#-1) // and positive? if (!NORMAL.new) jump:nt .Lsqrt_abnormal SFRAD = or(SFHALF,PRODLO) } #undef NORMAL .Ldenormal_restart: { FRACRAD = A SF_E,P_TMP = sfinvsqrta(SFRAD) SFHALF = and(SFHALF,#-16) SFSH = #0 } #undef A #undef AH #undef AL #define ERROR r1:0 #define ERRORHI r1 #define ERRORLO r0 // SF_E : reciprocal square root // SF_H : half rsqrt // sf_S : square root // SF_D : error term // SFHALF: 0.5 { SF_S += sfmpy(SF_E,SFRAD):lib // s0: root SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent... SF_D = SFHALF #undef SFRAD #define SHIFTAMT r9 SHIFTAMT = and(EXP,#1) } { SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1 FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden P_EXP1 = cmp.gtu(SHIFTAMT,#0) } { SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip SF_D = SFHALF SHIFTAMT = mux(P_EXP1,#8,#9) } { SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place SHIFTAMT = mux(P_EXP1,#3,#2) } { SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x). PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1) } { SF_H = and(SF_H,##0x007fffff) } { SF_H = add(SF_H,##0x00800000 - 3) SHIFTAMT = mux(P_EXP1,#7,#8) } { RECIPEST = asl(SF_H,SHIFTAMT) SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0)) } { ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1))) } #undef SFSH // r3:2 #undef SF_H // r2 #undef SF_S // r3 #undef S_ONE // r4 #undef SFHALF // r5 #undef SFHALF_SONE // r5:4 #undef SF_D // r6 #undef SF_E // r7 #define HL r3:2 #define LL r5:4 #define HH r7:6 #undef P_EXP1 #define P_CARRY0 p1 #define P_CARRY1 p2 #define P_CARRY2 p3 // Iteration 0 // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply // We can shift and subtract instead of shift and add? { ERROR = asl(FRACRAD,#15) PROD = mpyu(ROOTHI,ROOTHI) P_CARRY0 = cmp.eq(r0,r0) } { ERROR -= asl(PROD,#15) PROD = mpyu(ROOTHI,ROOTLO) P_CARRY1 = cmp.eq(r0,r0) } { ERROR -= lsr(PROD,#16) P_CARRY2 = cmp.eq(r0,r0) } { ERROR = mpyu(ERRORHI,RECIPEST) } { ROOT += lsr(ERROR,SHIFTAMT) SHIFTAMT = add(SHIFTAMT,#16) ERROR = asl(FRACRAD,#31) // for next iter } // Iteration 1 { PROD = mpyu(ROOTHI,ROOTHI) ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed } { ERROR -= asl(PROD,#31) PROD = mpyu(ROOTLO,ROOTLO) } { ERROR -= lsr(PROD,#33) } { ERROR = mpyu(ERRORHI,RECIPEST) } { ROOT += lsr(ERROR,SHIFTAMT) SHIFTAMT = add(SHIFTAMT,#16) ERROR = asl(FRACRAD,#47) // for next iter } // Iteration 2 { PROD = mpyu(ROOTHI,ROOTHI) } { ERROR -= asl(PROD,#47) PROD = mpyu(ROOTHI,ROOTLO) } { ERROR -= asl(PROD,#16) // bidir shr 31-47 PROD = mpyu(ROOTLO,ROOTLO) } { ERROR -= lsr(PROD,#17) // 64-47 } { ERROR = mpyu(ERRORHI,RECIPEST) } { ROOT += lsr(ERROR,SHIFTAMT) } #undef ERROR #undef PROD #undef PRODHI #undef PRODLO #define REM_HI r15:14 #define REM_HI_HI r15 #define REM_LO r1:0 #undef RECIPEST #undef SHIFTAMT #define TWOROOT_LO r9:8 // Adjust Root { HL = mpyu(ROOTHI,ROOTLO) LL = mpyu(ROOTLO,ROOTLO) REM_HI = #0 REM_LO = #0 } { HL += lsr(LL,#33) LL += asl(HL,#33) P_CARRY0 = cmp.eq(r0,r0) } { HH = mpyu(ROOTHI,ROOTHI) REM_LO = sub(REM_LO,LL,P_CARRY0):carry TWOROOT_LO = #1 } { HH += lsr(HL,#31) TWOROOT_LO += asl(ROOT,#1) } #undef HL #undef LL #define REM_HI_TMP r3:2 #define REM_HI_TMP_HI r3 #define REM_LO_TMP r5:4 { REM_HI = sub(FRACRAD,HH,P_CARRY0):carry REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry #undef FRACRAD #undef HH #define ZERO r11:10 #define ONE r7:6 ONE = #1 ZERO = #0 } { REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry ONE = add(ROOT,ONE) EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp } { // If carry set, no borrow: result was still positive if (P_CARRY1) ROOT = ONE if (P_CARRY1) REM_LO = REM_LO_TMP if (P_CARRY1) REM_HI = REM_HI_TMP } { REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry ONE = #1 EXP = asr(EXP,#1) // divide signed exp by 2 } { REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry ONE = add(ROOT,ONE) } { if (P_CARRY2) ROOT = ONE if (P_CARRY2) REM_LO = REM_LO_TMP // since tworoot <= 2^32, remhi must be zero #undef REM_HI_TMP #undef REM_HI_TMP_HI #define S_ONE r2 #define ADJ r3 S_ONE = #1 } { P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully ADJ = cl0(ROOT) EXP = add(EXP,#-63) } #undef REM_LO #define RET r1:0 #define RETHI r1 { RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag EXP = add(EXP,ADJ) // add back bias } { RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust jumpr r31 } #undef REM_LO_TMP #undef REM_HI_TMP #undef REM_HI_TMP_HI #undef REM_LO #undef REM_HI #undef TWOROOT_LO #undef RET #define A r1:0 #define AH r1 #define AL r1 #undef S_ONE #define TMP r3:2 #define TMPHI r3 #define TMPLO r2 #undef P_CARRY0 #define P_NEG p1 #define SFHALF r5 #define SFRAD r9 .Lsqrt_abnormal: { P_TMP = dfclass(A,#DFCLASS_ZERO) // zero? if (P_TMP.new) jumpr:t r31 } { P_TMP = dfclass(A,#DFCLASS_NAN) if (P_TMP.new) jump:nt .Lsqrt_nan } { P_TMP = cmp.gt(AH,#-1) if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg if (!P_TMP.new) EXP = ##0x7F800001 // sNaN } { P_TMP = dfclass(A,#DFCLASS_INFINITE) if (P_TMP.new) jumpr:nt r31 } // If we got here, we're denormal // prepare to restart { A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa } { EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize? } { A = asl(A,EXP) // Shift mantissa EXP = sub(#1,EXP) // Form exponent } { AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent } { TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1) SFHALF = ##0x3f000004 // form half constant } { SFRAD = or(SFHALF,TMPLO) // form sf value SFHALF = and(SFHALF,#-16) jump .Ldenormal_restart // restart } .Lsqrt_nan: { EXP = convert_df2sf(A) // if sNaN, get invalid A = #-1 // qNaN jumpr r31 } .Lsqrt_invalid_neg: { A = convert_sf2df(EXP) // Invalid,NaNval jumpr r31 } END(__hexagon_sqrt) END(__hexagon_sqrtdf2)