mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
242 lines
7.6 KiB
242 lines
7.6 KiB
|
|
#include "_Alphasm.h"
|
|
#include "_Fpow.h" /* Approximation table */
|
|
|
|
/*
|
|
*
|
|
* NOTE: the code to generate _Nfpow.h follows this routine (ifdefd-out C)
|
|
*
|
|
* _Nfpow() --- single precision pow() approx routine
|
|
*
|
|
* This is used in GL specular shading computation. GL conformance's l_se.c
|
|
* steps the exponent from 0->128 in fractional increments, so we need
|
|
* to deal with REAL exponents. This routine gives just enough precision
|
|
* to squeak by l_se.c (path 0 -- which is the most difficult).
|
|
*
|
|
* NOTE: the base is clamped to 1.0, so base >= 1.0 yields exactly 1.0
|
|
*
|
|
* Here's the algorithm:
|
|
*
|
|
* Result = Base**Power
|
|
* = 2**[log2(Base**Power)]
|
|
* = 2**[Power*log2(Base)]
|
|
* using
|
|
* Base = mantissa(Base)*2**(exponent(Base)) (FP notation)
|
|
*
|
|
* Result = 2**(Power * log2( mantissa(Base) * 2**exponent(Base) ))
|
|
* = 2**(Power * ( exponent(Base) + log2(mantissa(Base)) ))
|
|
*
|
|
* now let P' = (Power * ...)
|
|
*
|
|
* 1: convert fp Power to fixed point
|
|
* 2: use _Nlog2_table to lookup log2(mantissa(Base))
|
|
* 3: add to extracted exponent(Base)
|
|
* 4: mult by cvt'd Power
|
|
*
|
|
* We now have P' in fixed point format
|
|
*
|
|
* Result = 2**P'
|
|
* = 2**[int(P') + frac(P')]
|
|
* = 2**frac(P') * 2**int(P')
|
|
*
|
|
* Build Result using:
|
|
*
|
|
* mantissa(Result) = _Npow2_table[frac(P')] (table lookup)
|
|
* exponent(Result) = int(P')
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* Register Allocation
|
|
*/
|
|
#define nResult r0 /* result -- asm'd in int reg */
|
|
#define nB r1 /* base in int reg (gets munged) */
|
|
#define nP r2 /* scaled power in int reg (gets munged) */
|
|
#define nLog2 r3 /* value from log2 table (slope/intcpt) */
|
|
#define pLog2Tab r4 /* pointer to log2 table */
|
|
#define nIntcpt r4 /* intercept extracted from nLog2 */
|
|
#define pPow2Tab r5 /* pointer to pow2 table */
|
|
#define nIndex r6 /* index into log2,pow2 table */
|
|
#define nSlope r6 /* slope extracted from nLog2 table */
|
|
#define pEntry r7 /* pointer into log2,pow2 table */
|
|
#define nTmp r7 /* temp reg */
|
|
#define nPShifted r8 /* result exponent, shifted into position */
|
|
#define nBfrac r8 /* frac part of base, used to index into log2*/
|
|
|
|
#define fResult f0 /* OUTPUT: return value */
|
|
#define fOne f0 /* 1.0 for testing */
|
|
#define fB f16 /* INPUT: base */
|
|
#define fP f17 /* INPUT: power */
|
|
#define fScaleP f18 /* power scale factor (to get int&frac bits) */
|
|
#define fTest f20 /* result of base==1 test */
|
|
#define fTest2 f21 /* result of power==1 test */
|
|
#define fP2 f22 /* scaled power */
|
|
|
|
|
|
#define FRAME_SIZE 4*8
|
|
/*
|
|
* Stack offsets
|
|
*/
|
|
#define stackQ0 1*8
|
|
#define stackQ1 2*8
|
|
|
|
#define NBITS_LOG2 8 /* log2 of entries in _Nlog2_table */
|
|
#define NBITS_POW2 12 /* log2 of entries in _Npow2_table */
|
|
|
|
.extern fpow
|
|
|
|
|
|
#define IS_GP_USED 1
|
|
#define FRAME_OFFSET 0
|
|
|
|
/*
|
|
* Constants
|
|
*/
|
|
.rdata
|
|
.align 5
|
|
_scaleP:
|
|
.float 65536.0 /* used to generate power in 8.16 fixed pt format */
|
|
_f1_0:
|
|
.float 1.0 /* for testing, also result of x**0 */
|
|
|
|
|
|
|
|
.text
|
|
.align 5
|
|
.globl fpow
|
|
.ent fpow
|
|
fpow:
|
|
|
|
lda sp,-FRAME_SIZE(sp) # grab some stack
|
|
|
|
.frame sp, FRAME_SIZE, ra, FRAME_OFFSET
|
|
.prologue IS_GP_USED
|
|
|
|
/*
|
|
* Setup stuff
|
|
*
|
|
* Get pointers, constants loaded
|
|
* Move base, scaled power to integer regs
|
|
*/
|
|
|
|
|
|
lds fScaleP, _scaleP /* get power scale factor */
|
|
lda pLog2Tab, _Nlog2_table /* get ptr to log2 table */
|
|
lda pPow2Tab, _Npow2_table /* get ptr to pow2 table */
|
|
sts fB, stackQ0(sp) /* move base to stack xfer to intreg */
|
|
fbeq fP, ZeroPow /* test for x**0 */
|
|
muls fP, fScaleP, fP2 /* scale power to incl hi16 mant bits*/
|
|
fbeq fB, ZeroBase /* test for 0**x */
|
|
lds fOne, _f1_0 /* for base=1 check */
|
|
ldl nB, stackQ0(sp) /* now have base in int reg */
|
|
cvttq fP2, fP2 /* convert scaled power to int */
|
|
stt fP2, stackQ1(sp) /* move to stack for xfer to int reg */
|
|
ldq nP, stackQ1(sp) /* now have 8.16 power in int reg */
|
|
|
|
/*
|
|
* For those of you who forget, here's the format of IEEE single-prec FP:
|
|
* (but if you don't know this, you're probably barking up the wrong tree).
|
|
*
|
|
* 33222222 22221111 11111100 00000000
|
|
* 10987654 32109876 54321098 76543210
|
|
* +--------+--------+--------+--------+
|
|
* |seeeeeee efffffff ffffffff ffffffff|
|
|
* +--------+--------+--------+--------+
|
|
* s = sign bit
|
|
* e = 8-bit bias127 exponent
|
|
* f = 23-bit fraction (aka mantissa in my book)
|
|
*
|
|
* "Normalized" numbers represent values: (-1)**s * (1.f) * 2**(e-127)
|
|
*
|
|
*/
|
|
|
|
/*
|
|
* Use hi bits of base mantissa to index into the _Nlog2_table
|
|
*/
|
|
sll nB,1,nB /* move left, lose sign bit 8.24 */
|
|
zap nB,HEX(F8),nBfrac /* lose exp for indexing into log2tab*/
|
|
zap nB,HEX(F7),nB /* keep only exponent in nB */
|
|
srl nBfrac,(24-NBITS_LOG2),nIndex /* get hi bits as index */
|
|
s8addq nIndex,pLog2Tab,pEntry /* form log2 table pointer */
|
|
ldq nLog2,0(pEntry) /* get log2 table entry */
|
|
ldah nB,-HEX(7F00)(nB) /* Remove bias from base exponent */
|
|
/*
|
|
* Using slope & intercept from the log2 table,
|
|
* compute log2(base frac) = slope * base frac + intercept
|
|
*/
|
|
sra nLog2,32,nIntcpt /* negative 8.24 intercept */
|
|
zap nLog2,HEX(F0),nSlope /* positive 16.16 slope */
|
|
mulq nBfrac,nSlope,nBfrac /* base frac * slope 16.40 */
|
|
addq nB,nIntcpt,nB /* base exp + intercept 08.24 */
|
|
sra nBfrac,(40-24),nBfrac /* base frac * slope 16.24 */
|
|
addq nB,nBfrac,nB /* basef*slope+intercept+bexp 17.24 */
|
|
|
|
/*
|
|
* Now multiply Power * Base exponent
|
|
*
|
|
* Result format:
|
|
*
|
|
* 66665555 55555544 44444444 33333333 33222222 22221111 11111100 00000000
|
|
* 32109876 54321098 76543210 98765432 10987654 32109876 54321098 76543210
|
|
* +--------+--------+--------+--------V--------+--------+--------+--------+
|
|
* ssssssss EEEEEEEE EEEEEEEE.eeeeeeee eeeeeeee eeeeeeee eeeeeeee eeeeeeee
|
|
*
|
|
*/
|
|
mulq nB, nP, nP /* result exponent (16.40) */
|
|
|
|
/*
|
|
* While we're waiting for the int mult to complete...
|
|
* let's test for some special cases? Is this worth the cache?
|
|
*/
|
|
cmptle fOne, fB, fTest /* test for base >= 1.0 */
|
|
cmpteq fP, fOne, fTest2 /* test for power == 1.0 */
|
|
// fbne fTest, ClampBase
|
|
fbne fTest2, OnePow
|
|
|
|
/*
|
|
* Extract fractional part of result exponent
|
|
* and use to index into pow2 table
|
|
*/
|
|
zap nP, HEX(E0), nPShifted /* remove int part for indexing */
|
|
/* shift hi12 to low byte for index */
|
|
srl nPShifted, (40-NBITS_POW2), nIndex
|
|
s4addq nIndex, pPow2Tab, pEntry
|
|
ldl nResult, 0(pEntry) /* get normalized result */
|
|
|
|
/*
|
|
* Compose final result
|
|
*/
|
|
sra nP, 40, nP /* shift exp down to lose frac */
|
|
sll nP, 23, nPShifted /* shift exp up into position */
|
|
addl nPShifted, nResult, nResult /* compose final answer */
|
|
stl nResult, stackQ0(sp) /* store result for transfer */
|
|
|
|
lda nP, 64(nP) /* add 64 to test for underflow */
|
|
/* assume 2**-64 is close enuf to 0 */
|
|
blt nP, Underflow
|
|
lds fResult, stackQ0(sp) /* load result as float */
|
|
|
|
|
|
ExitHere:
|
|
|
|
lda sp, FRAME_SIZE(sp) /* release stack */
|
|
ret rzero, (ra), 1
|
|
|
|
ZeroPow:
|
|
ClampBase:
|
|
lds fResult, _f1_0 /* x**0 = 1.0 */
|
|
br ExitHere
|
|
|
|
ZeroBase:
|
|
Underflow:
|
|
fmov fzero, fResult /* 0**x = 0, (x!=0) */
|
|
br ExitHere
|
|
|
|
OnePow:
|
|
fmov fB, fResult /* x**1 = x, yes? */
|
|
br ExitHere
|
|
|
|
|
|
ASM_END(fpow)
|
|
|
|
|