mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
181 lines
5.0 KiB
181 lines
5.0 KiB
#include "_Alphasm.h"
|
|
|
|
#include "_InvSqrt.h" /* Approximation table */
|
|
|
|
/*
|
|
* _InvSqrt() --- 1/sqrt(x) approximation (good to 23 bits)
|
|
*
|
|
* This routine can be used to compute vector normalization factors.
|
|
* Also, by performing x*(1/sqrt(x) == sqrt(x), this routine can be used
|
|
* to determine lengths, or where any "close-enuf" sqrt is acceptable.
|
|
*
|
|
* This routine uses the exponent LSB and the upper 7 mantissa bits(I call them
|
|
* fraction bits) to perform a table-lookup & linear approximation of 1/sqrt.
|
|
* This value is then improved by one pass thru a Newton-Raphson 1/sqrt
|
|
* approximation. The result is then given the correct exponent, and
|
|
* we're on our way.
|
|
*
|
|
* The algorithm is laid out further down, for your amusement.
|
|
*
|
|
* NOTE: I your willing to live with 15 bits or so, you can take out the
|
|
* NR iteration and just go with the piecewise linear approximation.
|
|
* This would improve performance greatly.
|
|
*
|
|
* PERFORMANCE:
|
|
* At the time this was first written, the cycle count is around 54.
|
|
* This doesn't include cache factors.
|
|
*
|
|
*/
|
|
/*
|
|
* Register Allocation
|
|
*/
|
|
#define nX v0
|
|
#define n0_5 t0
|
|
#define pTable s7 /* aka t1 */
|
|
#define nXc s8 /* aka t2 */
|
|
#define nIndex s9 /* aka t3 */
|
|
#define nXa s10 /* aka t4 */
|
|
#define pEntry s11 /* aka t5 */
|
|
|
|
#define fX f16 /* pts to input fp value */
|
|
#define fXc f17 /* */
|
|
#define fXa f18 /* */
|
|
#define fSlope f19 /* */
|
|
#define fInter f20 /* */
|
|
#define fXs f21 /* */
|
|
#define f0_5 f22 /* */
|
|
#define f1_5 f23 /* */
|
|
#define fTemp1 f24 /* */
|
|
#define fTemp2 f25 /* */
|
|
/*
|
|
* Stack offsets
|
|
*/
|
|
#define tempX 1*8
|
|
#define tempXa 2*8
|
|
#define tempXc 3*8
|
|
|
|
#define FRAME_SIZE 4*8
|
|
#define FRAME_OFFSET 0
|
|
|
|
#define BIASPLUS63 (127+63)
|
|
|
|
.extern _InvSqrt
|
|
|
|
/*
|
|
* Constants
|
|
*/
|
|
.rdata
|
|
.align 5
|
|
_Nf0_5: .float 0.5
|
|
_Nf1_5: .float 1.5
|
|
|
|
.text
|
|
.align 5
|
|
.globl _InvSqrt
|
|
.ent _InvSqrt
|
|
_InvSqrt:
|
|
|
|
#if 0
|
|
.set noreorder
|
|
.set nomacro
|
|
#endif
|
|
|
|
ldgp gp, 0(pv) /* regenerate gp*/
|
|
lda sp,-FRAME_SIZE(sp) /* grab some stack*/
|
|
|
|
#ifdef SAVE_RETURN
|
|
stq ra, 0(sp) /* stash return address*/
|
|
#endif
|
|
|
|
.frame sp, FRAME_SIZE, ra, FRAME_OFFSET
|
|
.prologue 1
|
|
|
|
|
|
#if 0
|
|
//
|
|
// 1/sqrt( X )
|
|
// = 1/sqrt( Xf * 2** [y-127] ) y = biased exponent
|
|
//
|
|
// if y is odd
|
|
//
|
|
// = 1/sqrt( Xf * 2** (y-1 - 126) )
|
|
// = 1/sqrt( Xf * 2** (2*( (y-1-126) div 2 ) + ((y-1-126) mod 2) )
|
|
// = 1/sqrt( Xf * 2** (2*( (y-1) div 2 - 63 ) + ((y-1) mod 2) )
|
|
// = 1/sqrt( Xf * 2** (2*( (y-1) div 2 - 63 ) + (0) )
|
|
// = 2**(63 - (y-1) div 2) * 1/sqrt( Xc = Xf * 2**(0) )
|
|
// --------------------- ---------------------
|
|
// ~= (Xa = 2**(63 - (y>>1)) * (Xb = INVSQRT(Xf, y&1))
|
|
//
|
|
// if y is even
|
|
//
|
|
// = 1/sqrt( Xf * 2**(y-126 - 1) )
|
|
// = 1/sqrt( Xf * 2**(y-126) * 2**(-1) )
|
|
// = 1/sqrt( Xf * 2**(2*(y-126) div 2) * 2**(-1) )
|
|
// = 1/sqrt( Xf * 2**(2*(y-126) div 2) * 2**(-1) )
|
|
// = 1/sqrt( 2**(2*(y-126) div 2) ) * 1/sqrt( Xf * 2**(-1) )
|
|
// = 1/sqrt( 2**(2*((y) div 2) - 63 ) * 1/sqrt( Xf * 2**(-1) )
|
|
// = 2**(63 - (y) div 2) * 1/sqrt( Xc = Xf * 2**(-1) )
|
|
// ------------------- ----------------------
|
|
// ~= (Xa = 2**(63 - (y>>1)) * (Xb = INVSQRT(Xf, y&1))
|
|
//
|
|
//
|
|
// where INVSQRT approximates 1/sqrt(Xc) with table lookup & 1 NR iteration:
|
|
//
|
|
// Xs = SEED_LOOKUP(Xf,y&1)
|
|
// Xb = [ 0.5 * Xs * ( 3.0 - Xc * Xs**2 ) ]
|
|
// = Xs * [ 1.5 - ( 0.5 * Xc * Xs**2 ) ]
|
|
//
|
|
//
|
|
//
|
|
//
|
|
// where
|
|
// Xseed = [slope(INDEX(X)) * Xc] + intercept(INDEX(X))
|
|
// index = n concat Xf<7 MSBs> into slope,intercept tables
|
|
//
|
|
#endif
|
|
|
|
sts fX, tempX(sp) /* store passed fp input for move*/
|
|
|
|
ldah n0_5, HEX(3F00)(rzero) /* fetch exp=126 --> 0.5*/
|
|
lda pTable, _Nrsqrt_table /* fetch ptr to top of approx table*/
|
|
|
|
ldl nX, tempX(sp) /* fetch X as integer*/
|
|
lds f0_5, _Nf0_5 /* fetch 0.5 */
|
|
|
|
zap nX, 8, nXc /* clear sign, 7 msb exp*/
|
|
srl nX, 24, nXa /* y>>1*/
|
|
bis nXc, n0_5, nXc /* compose Xc [0.5,2.0)*/
|
|
stl nXc, tempXc(sp) /* write Xc for move to fp reg*/
|
|
extbl nX, 2, nIndex /* get INDEX(X) (exp lsb, frac 7 msb)*/
|
|
|
|
lda nXa, -BIASPLUS63(nXa) /* - [(63 - (y>>1)) + 127 bias]*/
|
|
|
|
negl nXa, nXa /* (63 - (y>>1)) + 127 bias*/
|
|
s8addq nIndex, pTable, pEntry /* make quad index, add to table addr*/
|
|
sll nXa, 23, nXa /* fp format for 2**(63 - (y>>1))*/
|
|
stl nXa, tempXa(sp) /* write Xa for move to fp reg*/
|
|
lds fXc, tempXc(sp) /* get Xc into fp reg*/
|
|
lds fSlope, 0(pEntry) /* fetch slope from table*/
|
|
lds fInter, 4(pEntry) /* fetch intercept from table*/
|
|
muls fXc, fSlope, fXs /* slope * Xc*/
|
|
muls f0_5, fXc, fTemp1 /* 0.5 * Xc*/
|
|
lds f1_5, _Nf1_5 /* fetch 1.5 */
|
|
lds fXa, tempXa(sp) /* get Xa into fp reg*/
|
|
adds fXs, fInter, fXs /* Xseed = slope*Xc + intercept*/
|
|
muls fXs, fXs, fTemp2 /* Xs**2*/
|
|
muls fXa, fXs, fXs /* Xa * Xs*/
|
|
muls fTemp1, fTemp2,fTemp1 /* 0.5*Xc * Xs**2*/
|
|
subs f1_5, fTemp1, fTemp1 /* 1.5 - (0.5*Xc*Xs**2)*/
|
|
muls fTemp1, fXs, f0 /* Xa*Xs * [1.5 - (0.5*Xc*Xs**2)]*/
|
|
/* DONE!!*/
|
|
|
|
ExitHere:
|
|
|
|
#ifdef SAVE_RETURN
|
|
ldgp gp, 0(ra) /* regenerate gp*/
|
|
ldq ra, 0(sp) /* recall ret addr*/
|
|
#endif
|
|
lda sp, FRAME_SIZE(sp) /* release stack*/
|
|
ret rzero, (ra), 1
|
|
|
|
ASM_END(_InvSqrt)
|