Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

181 lines
5.0 KiB

#include "_Alphasm.h"
#include "_InvSqrt.h" /* Approximation table */
/*
* _InvSqrt() --- 1/sqrt(x) approximation (good to 23 bits)
*
* This routine can be used to compute vector normalization factors.
* Also, by performing x*(1/sqrt(x) == sqrt(x), this routine can be used
* to determine lengths, or where any "close-enuf" sqrt is acceptable.
*
* This routine uses the exponent LSB and the upper 7 mantissa bits(I call them
* fraction bits) to perform a table-lookup & linear approximation of 1/sqrt.
* This value is then improved by one pass thru a Newton-Raphson 1/sqrt
* approximation. The result is then given the correct exponent, and
* we're on our way.
*
* The algorithm is laid out further down, for your amusement.
*
* NOTE: I your willing to live with 15 bits or so, you can take out the
* NR iteration and just go with the piecewise linear approximation.
* This would improve performance greatly.
*
* PERFORMANCE:
* At the time this was first written, the cycle count is around 54.
* This doesn't include cache factors.
*
*/
/*
* Register Allocation
*/
#define nX v0
#define n0_5 t0
#define pTable s7 /* aka t1 */
#define nXc s8 /* aka t2 */
#define nIndex s9 /* aka t3 */
#define nXa s10 /* aka t4 */
#define pEntry s11 /* aka t5 */
#define fX f16 /* pts to input fp value */
#define fXc f17 /* */
#define fXa f18 /* */
#define fSlope f19 /* */
#define fInter f20 /* */
#define fXs f21 /* */
#define f0_5 f22 /* */
#define f1_5 f23 /* */
#define fTemp1 f24 /* */
#define fTemp2 f25 /* */
/*
* Stack offsets
*/
#define tempX 1*8
#define tempXa 2*8
#define tempXc 3*8
#define FRAME_SIZE 4*8
#define FRAME_OFFSET 0
#define BIASPLUS63 (127+63)
.extern _InvSqrt
/*
* Constants
*/
.rdata
.align 5
_Nf0_5: .float 0.5
_Nf1_5: .float 1.5
.text
.align 5
.globl _InvSqrt
.ent _InvSqrt
_InvSqrt:
#if 0
.set noreorder
.set nomacro
#endif
ldgp gp, 0(pv) /* regenerate gp*/
lda sp,-FRAME_SIZE(sp) /* grab some stack*/
#ifdef SAVE_RETURN
stq ra, 0(sp) /* stash return address*/
#endif
.frame sp, FRAME_SIZE, ra, FRAME_OFFSET
.prologue 1
#if 0
//
// 1/sqrt( X )
// = 1/sqrt( Xf * 2** [y-127] ) y = biased exponent
//
// if y is odd
//
// = 1/sqrt( Xf * 2** (y-1 - 126) )
// = 1/sqrt( Xf * 2** (2*( (y-1-126) div 2 ) + ((y-1-126) mod 2) )
// = 1/sqrt( Xf * 2** (2*( (y-1) div 2 - 63 ) + ((y-1) mod 2) )
// = 1/sqrt( Xf * 2** (2*( (y-1) div 2 - 63 ) + (0) )
// = 2**(63 - (y-1) div 2) * 1/sqrt( Xc = Xf * 2**(0) )
// --------------------- ---------------------
// ~= (Xa = 2**(63 - (y>>1)) * (Xb = INVSQRT(Xf, y&1))
//
// if y is even
//
// = 1/sqrt( Xf * 2**(y-126 - 1) )
// = 1/sqrt( Xf * 2**(y-126) * 2**(-1) )
// = 1/sqrt( Xf * 2**(2*(y-126) div 2) * 2**(-1) )
// = 1/sqrt( Xf * 2**(2*(y-126) div 2) * 2**(-1) )
// = 1/sqrt( 2**(2*(y-126) div 2) ) * 1/sqrt( Xf * 2**(-1) )
// = 1/sqrt( 2**(2*((y) div 2) - 63 ) * 1/sqrt( Xf * 2**(-1) )
// = 2**(63 - (y) div 2) * 1/sqrt( Xc = Xf * 2**(-1) )
// ------------------- ----------------------
// ~= (Xa = 2**(63 - (y>>1)) * (Xb = INVSQRT(Xf, y&1))
//
//
// where INVSQRT approximates 1/sqrt(Xc) with table lookup & 1 NR iteration:
//
// Xs = SEED_LOOKUP(Xf,y&1)
// Xb = [ 0.5 * Xs * ( 3.0 - Xc * Xs**2 ) ]
// = Xs * [ 1.5 - ( 0.5 * Xc * Xs**2 ) ]
//
//
//
//
// where
// Xseed = [slope(INDEX(X)) * Xc] + intercept(INDEX(X))
// index = n concat Xf<7 MSBs> into slope,intercept tables
//
#endif
sts fX, tempX(sp) /* store passed fp input for move*/
ldah n0_5, HEX(3F00)(rzero) /* fetch exp=126 --> 0.5*/
lda pTable, _Nrsqrt_table /* fetch ptr to top of approx table*/
ldl nX, tempX(sp) /* fetch X as integer*/
lds f0_5, _Nf0_5 /* fetch 0.5 */
zap nX, 8, nXc /* clear sign, 7 msb exp*/
srl nX, 24, nXa /* y>>1*/
bis nXc, n0_5, nXc /* compose Xc [0.5,2.0)*/
stl nXc, tempXc(sp) /* write Xc for move to fp reg*/
extbl nX, 2, nIndex /* get INDEX(X) (exp lsb, frac 7 msb)*/
lda nXa, -BIASPLUS63(nXa) /* - [(63 - (y>>1)) + 127 bias]*/
negl nXa, nXa /* (63 - (y>>1)) + 127 bias*/
s8addq nIndex, pTable, pEntry /* make quad index, add to table addr*/
sll nXa, 23, nXa /* fp format for 2**(63 - (y>>1))*/
stl nXa, tempXa(sp) /* write Xa for move to fp reg*/
lds fXc, tempXc(sp) /* get Xc into fp reg*/
lds fSlope, 0(pEntry) /* fetch slope from table*/
lds fInter, 4(pEntry) /* fetch intercept from table*/
muls fXc, fSlope, fXs /* slope * Xc*/
muls f0_5, fXc, fTemp1 /* 0.5 * Xc*/
lds f1_5, _Nf1_5 /* fetch 1.5 */
lds fXa, tempXa(sp) /* get Xa into fp reg*/
adds fXs, fInter, fXs /* Xseed = slope*Xc + intercept*/
muls fXs, fXs, fTemp2 /* Xs**2*/
muls fXa, fXs, fXs /* Xa * Xs*/
muls fTemp1, fTemp2,fTemp1 /* 0.5*Xc * Xs**2*/
subs f1_5, fTemp1, fTemp1 /* 1.5 - (0.5*Xc*Xs**2)*/
muls fTemp1, fXs, f0 /* Xa*Xs * [1.5 - (0.5*Xc*Xs**2)]*/
/* DONE!!*/
ExitHere:
#ifdef SAVE_RETURN
ldgp gp, 0(ra) /* regenerate gp*/
ldq ra, 0(sp) /* recall ret addr*/
#endif
lda sp, FRAME_SIZE(sp) /* release stack*/
ret rzero, (ra), 1
ASM_END(_InvSqrt)