Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

242 lines
7.6 KiB

#include "_Alphasm.h"
#include "_Fpow.h" /* Approximation table */
/*
*
* NOTE: the code to generate _Nfpow.h follows this routine (ifdefd-out C)
*
* _Nfpow() --- single precision pow() approx routine
*
* This is used in GL specular shading computation. GL conformance's l_se.c
* steps the exponent from 0->128 in fractional increments, so we need
* to deal with REAL exponents. This routine gives just enough precision
* to squeak by l_se.c (path 0 -- which is the most difficult).
*
* NOTE: the base is clamped to 1.0, so base >= 1.0 yields exactly 1.0
*
* Here's the algorithm:
*
* Result = Base**Power
* = 2**[log2(Base**Power)]
* = 2**[Power*log2(Base)]
* using
* Base = mantissa(Base)*2**(exponent(Base)) (FP notation)
*
* Result = 2**(Power * log2( mantissa(Base) * 2**exponent(Base) ))
* = 2**(Power * ( exponent(Base) + log2(mantissa(Base)) ))
*
* now let P' = (Power * ...)
*
* 1: convert fp Power to fixed point
* 2: use _Nlog2_table to lookup log2(mantissa(Base))
* 3: add to extracted exponent(Base)
* 4: mult by cvt'd Power
*
* We now have P' in fixed point format
*
* Result = 2**P'
* = 2**[int(P') + frac(P')]
* = 2**frac(P') * 2**int(P')
*
* Build Result using:
*
* mantissa(Result) = _Npow2_table[frac(P')] (table lookup)
* exponent(Result) = int(P')
*
*/
/*
* Register Allocation
*/
#define nResult r0 /* result -- asm'd in int reg */
#define nB r1 /* base in int reg (gets munged) */
#define nP r2 /* scaled power in int reg (gets munged) */
#define nLog2 r3 /* value from log2 table (slope/intcpt) */
#define pLog2Tab r4 /* pointer to log2 table */
#define nIntcpt r4 /* intercept extracted from nLog2 */
#define pPow2Tab r5 /* pointer to pow2 table */
#define nIndex r6 /* index into log2,pow2 table */
#define nSlope r6 /* slope extracted from nLog2 table */
#define pEntry r7 /* pointer into log2,pow2 table */
#define nTmp r7 /* temp reg */
#define nPShifted r8 /* result exponent, shifted into position */
#define nBfrac r8 /* frac part of base, used to index into log2*/
#define fResult f0 /* OUTPUT: return value */
#define fOne f0 /* 1.0 for testing */
#define fB f16 /* INPUT: base */
#define fP f17 /* INPUT: power */
#define fScaleP f18 /* power scale factor (to get int&frac bits) */
#define fTest f20 /* result of base==1 test */
#define fTest2 f21 /* result of power==1 test */
#define fP2 f22 /* scaled power */
#define FRAME_SIZE 4*8
/*
* Stack offsets
*/
#define stackQ0 1*8
#define stackQ1 2*8
#define NBITS_LOG2 8 /* log2 of entries in _Nlog2_table */
#define NBITS_POW2 12 /* log2 of entries in _Npow2_table */
.extern fpow
#define IS_GP_USED 1
#define FRAME_OFFSET 0
/*
* Constants
*/
.rdata
.align 5
_scaleP:
.float 65536.0 /* used to generate power in 8.16 fixed pt format */
_f1_0:
.float 1.0 /* for testing, also result of x**0 */
.text
.align 5
.globl fpow
.ent fpow
fpow:
lda sp,-FRAME_SIZE(sp) # grab some stack
.frame sp, FRAME_SIZE, ra, FRAME_OFFSET
.prologue IS_GP_USED
/*
* Setup stuff
*
* Get pointers, constants loaded
* Move base, scaled power to integer regs
*/
lds fScaleP, _scaleP /* get power scale factor */
lda pLog2Tab, _Nlog2_table /* get ptr to log2 table */
lda pPow2Tab, _Npow2_table /* get ptr to pow2 table */
sts fB, stackQ0(sp) /* move base to stack xfer to intreg */
fbeq fP, ZeroPow /* test for x**0 */
muls fP, fScaleP, fP2 /* scale power to incl hi16 mant bits*/
fbeq fB, ZeroBase /* test for 0**x */
lds fOne, _f1_0 /* for base=1 check */
ldl nB, stackQ0(sp) /* now have base in int reg */
cvttq fP2, fP2 /* convert scaled power to int */
stt fP2, stackQ1(sp) /* move to stack for xfer to int reg */
ldq nP, stackQ1(sp) /* now have 8.16 power in int reg */
/*
* For those of you who forget, here's the format of IEEE single-prec FP:
* (but if you don't know this, you're probably barking up the wrong tree).
*
* 33222222 22221111 11111100 00000000
* 10987654 32109876 54321098 76543210
* +--------+--------+--------+--------+
* |seeeeeee efffffff ffffffff ffffffff|
* +--------+--------+--------+--------+
* s = sign bit
* e = 8-bit bias127 exponent
* f = 23-bit fraction (aka mantissa in my book)
*
* "Normalized" numbers represent values: (-1)**s * (1.f) * 2**(e-127)
*
*/
/*
* Use hi bits of base mantissa to index into the _Nlog2_table
*/
sll nB,1,nB /* move left, lose sign bit 8.24 */
zap nB,HEX(F8),nBfrac /* lose exp for indexing into log2tab*/
zap nB,HEX(F7),nB /* keep only exponent in nB */
srl nBfrac,(24-NBITS_LOG2),nIndex /* get hi bits as index */
s8addq nIndex,pLog2Tab,pEntry /* form log2 table pointer */
ldq nLog2,0(pEntry) /* get log2 table entry */
ldah nB,-HEX(7F00)(nB) /* Remove bias from base exponent */
/*
* Using slope & intercept from the log2 table,
* compute log2(base frac) = slope * base frac + intercept
*/
sra nLog2,32,nIntcpt /* negative 8.24 intercept */
zap nLog2,HEX(F0),nSlope /* positive 16.16 slope */
mulq nBfrac,nSlope,nBfrac /* base frac * slope 16.40 */
addq nB,nIntcpt,nB /* base exp + intercept 08.24 */
sra nBfrac,(40-24),nBfrac /* base frac * slope 16.24 */
addq nB,nBfrac,nB /* basef*slope+intercept+bexp 17.24 */
/*
* Now multiply Power * Base exponent
*
* Result format:
*
* 66665555 55555544 44444444 33333333 33222222 22221111 11111100 00000000
* 32109876 54321098 76543210 98765432 10987654 32109876 54321098 76543210
* +--------+--------+--------+--------V--------+--------+--------+--------+
* ssssssss EEEEEEEE EEEEEEEE.eeeeeeee eeeeeeee eeeeeeee eeeeeeee eeeeeeee
*
*/
mulq nB, nP, nP /* result exponent (16.40) */
/*
* While we're waiting for the int mult to complete...
* let's test for some special cases? Is this worth the cache?
*/
cmptle fOne, fB, fTest /* test for base >= 1.0 */
cmpteq fP, fOne, fTest2 /* test for power == 1.0 */
// fbne fTest, ClampBase
fbne fTest2, OnePow
/*
* Extract fractional part of result exponent
* and use to index into pow2 table
*/
zap nP, HEX(E0), nPShifted /* remove int part for indexing */
/* shift hi12 to low byte for index */
srl nPShifted, (40-NBITS_POW2), nIndex
s4addq nIndex, pPow2Tab, pEntry
ldl nResult, 0(pEntry) /* get normalized result */
/*
* Compose final result
*/
sra nP, 40, nP /* shift exp down to lose frac */
sll nP, 23, nPShifted /* shift exp up into position */
addl nPShifted, nResult, nResult /* compose final answer */
stl nResult, stackQ0(sp) /* store result for transfer */
lda nP, 64(nP) /* add 64 to test for underflow */
/* assume 2**-64 is close enuf to 0 */
blt nP, Underflow
lds fResult, stackQ0(sp) /* load result as float */
ExitHere:
lda sp, FRAME_SIZE(sp) /* release stack */
ret rzero, (ra), 1
ZeroPow:
ClampBase:
lds fResult, _f1_0 /* x**0 = 1.0 */
br ExitHere
ZeroBase:
Underflow:
fmov fzero, fResult /* 0**x = 0, (x!=0) */
br ExitHere
OnePow:
fmov fB, fResult /* x**1 = x, yes? */
br ExitHere
ASM_END(fpow)