#include "_Alphasm.h" #include "_InvSqrt.h" /* Approximation table */ /* * _InvSqrt() --- 1/sqrt(x) approximation (good to 23 bits) * * This routine can be used to compute vector normalization factors. * Also, by performing x*(1/sqrt(x) == sqrt(x), this routine can be used * to determine lengths, or where any "close-enuf" sqrt is acceptable. * * This routine uses the exponent LSB and the upper 7 mantissa bits(I call them * fraction bits) to perform a table-lookup & linear approximation of 1/sqrt. * This value is then improved by one pass thru a Newton-Raphson 1/sqrt * approximation. The result is then given the correct exponent, and * we're on our way. * * The algorithm is laid out further down, for your amusement. * * NOTE: I your willing to live with 15 bits or so, you can take out the * NR iteration and just go with the piecewise linear approximation. * This would improve performance greatly. * * PERFORMANCE: * At the time this was first written, the cycle count is around 54. * This doesn't include cache factors. * */ /* * Register Allocation */ #define nX v0 #define n0_5 t0 #define pTable s7 /* aka t1 */ #define nXc s8 /* aka t2 */ #define nIndex s9 /* aka t3 */ #define nXa s10 /* aka t4 */ #define pEntry s11 /* aka t5 */ #define fX f16 /* pts to input fp value */ #define fXc f17 /* */ #define fXa f18 /* */ #define fSlope f19 /* */ #define fInter f20 /* */ #define fXs f21 /* */ #define f0_5 f22 /* */ #define f1_5 f23 /* */ #define fTemp1 f24 /* */ #define fTemp2 f25 /* */ /* * Stack offsets */ #define tempX 1*8 #define tempXa 2*8 #define tempXc 3*8 #define FRAME_SIZE 4*8 #define FRAME_OFFSET 0 #define BIASPLUS63 (127+63) .extern _InvSqrt /* * Constants */ .rdata .align 5 _Nf0_5: .float 0.5 _Nf1_5: .float 1.5 .text .align 5 .globl _InvSqrt .ent _InvSqrt _InvSqrt: #if 0 .set noreorder .set nomacro #endif ldgp gp, 0(pv) /* regenerate gp*/ lda sp,-FRAME_SIZE(sp) /* grab some stack*/ #ifdef SAVE_RETURN stq ra, 0(sp) /* stash return address*/ #endif .frame sp, FRAME_SIZE, ra, FRAME_OFFSET .prologue 1 #if 0 // // 1/sqrt( X ) // = 1/sqrt( Xf * 2** [y-127] ) y = biased exponent // // if y is odd // // = 1/sqrt( Xf * 2** (y-1 - 126) ) // = 1/sqrt( Xf * 2** (2*( (y-1-126) div 2 ) + ((y-1-126) mod 2) ) // = 1/sqrt( Xf * 2** (2*( (y-1) div 2 - 63 ) + ((y-1) mod 2) ) // = 1/sqrt( Xf * 2** (2*( (y-1) div 2 - 63 ) + (0) ) // = 2**(63 - (y-1) div 2) * 1/sqrt( Xc = Xf * 2**(0) ) // --------------------- --------------------- // ~= (Xa = 2**(63 - (y>>1)) * (Xb = INVSQRT(Xf, y&1)) // // if y is even // // = 1/sqrt( Xf * 2**(y-126 - 1) ) // = 1/sqrt( Xf * 2**(y-126) * 2**(-1) ) // = 1/sqrt( Xf * 2**(2*(y-126) div 2) * 2**(-1) ) // = 1/sqrt( Xf * 2**(2*(y-126) div 2) * 2**(-1) ) // = 1/sqrt( 2**(2*(y-126) div 2) ) * 1/sqrt( Xf * 2**(-1) ) // = 1/sqrt( 2**(2*((y) div 2) - 63 ) * 1/sqrt( Xf * 2**(-1) ) // = 2**(63 - (y) div 2) * 1/sqrt( Xc = Xf * 2**(-1) ) // ------------------- ---------------------- // ~= (Xa = 2**(63 - (y>>1)) * (Xb = INVSQRT(Xf, y&1)) // // // where INVSQRT approximates 1/sqrt(Xc) with table lookup & 1 NR iteration: // // Xs = SEED_LOOKUP(Xf,y&1) // Xb = [ 0.5 * Xs * ( 3.0 - Xc * Xs**2 ) ] // = Xs * [ 1.5 - ( 0.5 * Xc * Xs**2 ) ] // // // // // where // Xseed = [slope(INDEX(X)) * Xc] + intercept(INDEX(X)) // index = n concat Xf<7 MSBs> into slope,intercept tables // #endif sts fX, tempX(sp) /* store passed fp input for move*/ ldah n0_5, HEX(3F00)(rzero) /* fetch exp=126 --> 0.5*/ lda pTable, _Nrsqrt_table /* fetch ptr to top of approx table*/ ldl nX, tempX(sp) /* fetch X as integer*/ lds f0_5, _Nf0_5 /* fetch 0.5 */ zap nX, 8, nXc /* clear sign, 7 msb exp*/ srl nX, 24, nXa /* y>>1*/ bis nXc, n0_5, nXc /* compose Xc [0.5,2.0)*/ stl nXc, tempXc(sp) /* write Xc for move to fp reg*/ extbl nX, 2, nIndex /* get INDEX(X) (exp lsb, frac 7 msb)*/ lda nXa, -BIASPLUS63(nXa) /* - [(63 - (y>>1)) + 127 bias]*/ negl nXa, nXa /* (63 - (y>>1)) + 127 bias*/ s8addq nIndex, pTable, pEntry /* make quad index, add to table addr*/ sll nXa, 23, nXa /* fp format for 2**(63 - (y>>1))*/ stl nXa, tempXa(sp) /* write Xa for move to fp reg*/ lds fXc, tempXc(sp) /* get Xc into fp reg*/ lds fSlope, 0(pEntry) /* fetch slope from table*/ lds fInter, 4(pEntry) /* fetch intercept from table*/ muls fXc, fSlope, fXs /* slope * Xc*/ muls f0_5, fXc, fTemp1 /* 0.5 * Xc*/ lds f1_5, _Nf1_5 /* fetch 1.5 */ lds fXa, tempXa(sp) /* get Xa into fp reg*/ adds fXs, fInter, fXs /* Xseed = slope*Xc + intercept*/ muls fXs, fXs, fTemp2 /* Xs**2*/ muls fXa, fXs, fXs /* Xa * Xs*/ muls fTemp1, fTemp2,fTemp1 /* 0.5*Xc * Xs**2*/ subs f1_5, fTemp1, fTemp1 /* 1.5 - (0.5*Xc*Xs**2)*/ muls fTemp1, fXs, f0 /* Xa*Xs * [1.5 - (0.5*Xc*Xs**2)]*/ /* DONE!!*/ ExitHere: #ifdef SAVE_RETURN ldgp gp, 0(ra) /* regenerate gp*/ ldq ra, 0(sp) /* recall ret addr*/ #endif lda sp, FRAME_SIZE(sp) /* release stack*/ ret rzero, (ra), 1 ASM_END(_InvSqrt)