You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
847 lines
27 KiB
847 lines
27 KiB
dnl-----------------------------------------------------------------------------
|
|
dnl
|
|
dnl This file contains the macro for generating texture addressing routines.
|
|
dnl
|
|
dnl-----------------------------------------------------------------------------
|
|
dnl
|
|
dnl
|
|
dnl
|
|
dnl d_TexAddr
|
|
dnl
|
|
dnl Generates all the differentiated texture address routines.
|
|
dnl
|
|
dnl It takes 5 parameters.
|
|
dnl
|
|
dnl $1 is one of 0 or 1. 0 is single texture, and 1 is the first multi-texture
|
|
dnl $2 is one of TexAddrWrapMirror TexAddrAll
|
|
dnl $3 is one of NoPersp Persp
|
|
dnl $4 is one of Point Bilinear MaybeBilinear
|
|
dnl $5 is one of NoLOD LOD (these are the two names the macro body tests for)
|
|
dnl
|
|
dnl Note that even when we are not mip mapping, we use iLOD to get to the nearest mip map
|
|
dnl (so iLOD must be 0 if the texture has no mip levels)
|
|
dnl
|
|
|
|
dnl Expansion counters used to build unique jump labels.
dnl   d_WDIVcnt          is bumped once per d_WDivide expansion.
dnl   d_MaybeBilinearcnt is bumped once per MaybeBilinear d_TexAddr expansion.
dnl   d_MaxCLODcnt       is bumped once per LOD-clamp sequence in d_TexAddr.
|
|
define(`d_WDIVcnt', 0)dnl
|
|
define(`d_MaybeBilinearcnt', 0)dnl
|
|
define(`d_MaxCLODcnt', 0)dnl
|
|
|
|
|
|
|
|
dnl
|
|
dnl Variables needed for texturing
|
|
dnl
|
|
dnl
|
|
|
|
dnl texaddraVars
dnl Expands to the EXTERN declarations (all typed MMWORD) for the constants
dnl and scratch variables referenced by the macros in this file.  LastW and
dnl GiveUp are written by d_WDivide; the remaining symbols are only read here.
define(`texaddraVars', `
|
|
EXTERN IncHighandLow16:MMWORD
|
|
EXTERN UFracVFracMask:MMWORD
|
|
EXTERN UV32to15Mask:MMWORD
|
|
EXTERN Makelow16one:MMWORD
|
|
EXTERN MaskKeepUValues:MMWORD
|
|
EXTERN MaskKeepVValues:MMWORD
|
|
EXTERN UFrac:MMWORD
|
|
EXTERN VFrac:MMWORD
|
|
EXTERN Zero:MMWORD
|
|
EXTERN memD3DTFG_POINT:MMWORD
|
|
EXTERN GiveUp:MMWORD
|
|
EXTERN LastW:MMWORD
|
|
EXTERN Val0x000a000a:MMWORD
|
|
EXTERN Val0xffff:MMWORD
|
|
EXTERN Val0x0000002000000020:MMWORD
|
|
EXTERN Val0x0000ffff0000ffff:MMWORD
|
|
')
|
|
|
|
dnl
|
|
dnl d_UpdateUoWandVoW
|
|
dnl increments UoW and VoW for textures and can be used
|
|
dnl in several different files.
|
|
dnl
dnl In:   edi = texture index (scaled by SIZEOF_UV_UNION in the addressing)
dnl Out:  mm5 = the updated UoW/VoW pair that was written back to the span
dnl Both 32 bit values are stepped by one packed dword add.
|
|
define(`d_UpdateUoWandVoW', `
|
|
;pS->iUoW`'d_TexNum += pP->iDUoW`'$1`'DX;
|
|
;pS->iVoW`'d_TexNum += pP->iDVoW`'$1`'DX;
|
|
movq mm5, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
paddd mm5, XpP(DUVoWDX + edi * SIZEOF_UV_UNION)
|
|
movq XpS(UVoW + edi * SIZEOF_UV_UNION), mm5
|
|
')
|
|
|
|
|
|
dnl d_UpdateLOD
dnl Adds the 16 bit iDLOD step to the 16 bit iLOD field of the span.
dnl Out:  eax = updated iLOD, zero extended (eax is cleared before the
dnl       16 bit load).  Flags are clobbered.
define(`d_UpdateLOD', `
|
|
; Seems like this should be done with something else
|
|
; i.e. group 16 bit adds together.
|
|
;pS->iLOD += pS->iDLOD;
|
|
xor eax, eax
|
|
mov ax, XpS(iLOD)
|
|
add ax, XpS(iDLOD)
|
|
mov XpS(iLOD), ax
|
|
')
|
|
|
|
dnl d_UpdateOoW
dnl Steps the span iOoW by the primitive iDOoWDX.
dnl Out:  eax = updated iOoW.  d_TexAddr expands d_WDivide directly after
dnl       this macro, and d_WDivide expects iOoW in eax on entry.
define(`d_UpdateOoW', `
|
|
;pS->iOoW += pP->iDOoWDX;
|
|
mov eax, XpS(iOoW)
|
|
add eax, XpP(iDOoWDX)
|
|
mov XpS(iOoW), eax
|
|
')
|
|
|
|
dnl d_UoWVowTimesW is so that I have same code in several different locations.
|
|
dnl These four locations are texaddr1.mas, tstfail.mas, Setup Code (Monolithic and regular)
|
|
dnl
|
|
dnl mm5 is UoW and VoW for either texture 1 or texture two
|
|
dnl esi is W
|
|
dnl $1 is 1 or 2 depending on result is for texture one or texture 2
|
|
dnl
|
|
dnl
|
|
dnl Does integer W * U or V computation
|
|
dnl define(`d_WTimesUVoW', `imul32h_s20(($1), ($2))')dnl
|
|
dnl
|
|
dnl iW = 1.15.16 << 4 = 1.11.20
|
|
dnl UoW = 1.11.20 << 8 = 1.2.28
|
|
dnl
|
|
dnl 1.11.20 * 1.3.28 == 1.15.48 >> 32 == 1.15.16
|
|
|
|
dnl inline INT32 imul32h_s20(INT32 x, INT32 y)
|
|
dnl {
|
|
dnl #ifdef _X86_
|
|
dnl _asm
|
|
dnl {
|
|
dnl mov eax, x
|
|
dnl mov edx, y
|
|
dnl imul edx
|
|
dnl shr eax, 20
|
|
dnl shl edx, 12
|
|
dnl and eax, 000000fffh
|
|
dnl and edx, 0fffff000h
|
|
dnl or eax, edx
|
|
dnl }
|
|
dnl #else
|
|
dnl return (INT32)(((LONGLONG)x * y) >> 20);
|
|
dnl #endif
|
|
dnl }
|
|
|
|
|
|
dnl d_UoWVoWTimesW
dnl In:   mm5 = UoW in the low dword and VoW in the high dword
dnl       esi = W
dnl       edi = texture index (scaled by SIZEOF_UV_UNION in the addressing)
dnl Each value is multiplied by esi with a signed 32x32 multiply and the
dnl 64 bit product is shifted right by 20 (shrd takes the upper bits from edx).
dnl The U result is stored at the TexUV slot and the V result SIZEOF_INT32
dnl bytes after it.
dnl Clobbers: eax, edx, flags; mm5 is shifted right by 32.
define(`d_UoWVoWTimesW', `
|
|
;pCtx->SI.TexCol[i].iU = d_WTimesUVoW(pS->iW,pS->iUoW`'$1`');
|
|
;pCtx->SI.TexCol[i].iV = d_WTimesUVoW(pS->iW,pS->iVoW`'$1`');
|
|
movd eax, mm5
|
|
psrlq mm5, 32
|
|
imul esi
|
|
shrd eax, edx, 20
|
|
mov XpCtxSI(TexUV + edi * SIZEOF_UV_UNION), eax
|
|
movd eax, mm5
|
|
imul esi
|
|
shrd eax, edx, 20
|
|
mov XpCtxSI(TexUV + edi * SIZEOF_UV_UNION + SIZEOF_INT32), eax
|
|
')
|
|
|
|
|
|
dnl d_UpdateNonPersp
dnl In:   mm5 = UoW/VoW pair (as left behind by d_UpdateUoWandVoW)
dnl       edi = texture index (scaled by SIZEOF_UV_UNION in the addressing)
dnl Arithmetic-shifts both dwords right by TEX_TO_FINAL_SHIFT and stores the
dnl pair to the TexUV slot.  mm5 is left holding the shifted values.
define(`d_UpdateNonPersp', `
|
|
;pCtx->SI.iU`'$1`' = pS->iUoW`'$1`'>>TEX_TO_FINAL_SHIFT; // 1.11.20 >> 4 == 1.15.16
|
|
;pCtx->SI.iV`'$1`' = pS->iVoW`'$1`'>>TEX_TO_FINAL_SHIFT;
|
|
; mm5 still contains iUoW and iVoW which are the iU and iV values for
|
|
; non perspective correct.
|
|
psrad mm5, TEX_TO_FINAL_SHIFT
|
|
movq XpCtxSI(TexUV + edi * SIZEOF_UV_UNION), mm5
|
|
')
|
|
|
|
dnl d_WDivide
|
|
dnl
|
|
dnl Does incremental W divide calculation
|
|
dnl
|
|
dnl d_WDivide
dnl
dnl Expands to the incremental reciprocal step that produces the next W from
dnl the already stepped OoW.  The first guess is iW + iDW and it is refined
dnl with Newton-Raphson style steps of the form  Wn = Wn * (2 - OoW * Wn).
dnl
dnl In:   eax = stepped iOoW (1.31)
dnl Out:  esi = new iW (also stored to the span)
dnl       edx = new iDW (also stored to the context)
dnl       iSpecialW in the context is incremented (16 bit, allowed to wrap)
dnl Clobbers: eax, edi, flags, and the LastW and GiveUp scratch variables.
dnl
dnl Two paths are generated:
dnl   iSpecialW <  0  iterative path: the guess is sanity checked, then refined
dnl                   until it moves by at most 0x20 or the give-up counter
dnl                   runs out.
dnl   iSpecialW >= 0  single refinement step using unsigned multiplies.
dnl
dnl Each expansion bumps d_WDIVcnt so that every label below is unique.
define(`d_WDivide', `
dnl Increment counter for jump address calc stuff.
define(`d_WDIVcnt', eval(d_WDIVcnt+1))dnl
dnl This was deemed too annoying
dnl #if DBG
dnl if (iOoW <= 0)
dnl {
dnl D3D_WARN(0, "WDivide, iOoW (%d) out of Range!", iOoW);
dnl DDASSERT(0);
dnl }
dnl #endif
dnl Note: iOoW comes in as eax. So it is ready for first multiply
dnl iOoW is actual iOoW value in 1.31 form instead of 1.15 form. good.
dnl In SpecialW case I have to reload it at the end.

    ;INT32 iWn0 = pS->iW + pCtx->SI.iDW; // 1.15.16
    ; TODO Could do this and OoW Add at same time with MMX.

    mov edx, XpS(iW)
    mov LastW, edx ; Save iW to calc iDW for next time.
    add edx, XpCtxSI(iDW)

    ;if (pCtx->SI.iSpecialW < 0)
    ;{
    ; edi is zeroed here and doubles as the zero operand of the next two compares.
    xor edi, edi
    cmp di, word ptr XpCtxSI(iSpecialW)
    jle DontDoSpecialW`'d_WDIVcnt`'

    ;DoSpecialW`'d_WDIVcnt`':
    ; This label is a left over from when

    ;if (iWn0 < 0)
    ;{
    ; Only a negative extrapolated guess is replaced by iW/2.  A guess that is
    ; zero or positive must skip the fallback, hence jump on greater-or-equal.
    cmp edx, edi
    jge WOutOfRange`'d_WDIVcnt`'
    ;iWn0 = pS->iW >> 1; // use iW/2 as a guess, instead
    mov edx, LastW
    sar edx, 1
    ;}
WOutOfRange`'d_WDIVcnt`':

    ;VAL32 iWn1;
    ;INT16 iWnOld = iWn0 + 0x100; // make sure while fails first time
    ; Dont need to make sure it fails. I do a post test which guarantees it will execute once.

    ;INT32 iGiveUp = 7;
    mov GiveUp, 8 ; Pre decrementing instead of post decrementing.
    ;while((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
    ;{
SpecW`'d_WDIVcnt`'Loop1:

    ; Could move this to bottom of loop and combine results somehow.
    ; TBD look at it more.
    dec GiveUp
    jz ExitSpecWLoop`'d_WDIVcnt`'

    ; Shift iOoW by one since imul cannot have sign bit set
    ; OoW cannot reach one, only 0x7fffffff
    ;shr eax, 1 ; 1.31 >> 1 = 1.30

    ; Get ready to do Two minus iOoW*iW
    mov esi, (1 SHL 16)

    ;iWnOld = iWn0;
    mov edi, edx

    ; Result should be close to one so we want most of the
    ; precision in the low bits. Need to give more bits
    ; leeway since these are the bad cases.
    ; iWn1 = imul32h(pS->iOoW, iWn0); // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
    imul edx

    ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
    sub esi, edx

    ;while(iWn1.i < 0)
    ;{
SpecW`'d_WDIVcnt`'Loop2:
    test esi, esi
    jns SpecW`'d_WDIVcnt`'ExitLoop2 ; This jump should be predicted correctly most of the time.
    ;iWn1=(iWn1+(1L<<15))>>1; // iWn1 = (iWn1 + 1.0)/2
    add esi, (1 SHL 15)
    sar esi, 1
    jmp SpecW`'d_WDIVcnt`'Loop2
    ;}

SpecW`'d_WDIVcnt`'ExitLoop2:

    ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
    mov eax, edi

    shl eax, 5 ; 1.15.16 << 5 = 1.10.21 TBD Can I shift off upper bits??
    shl esi, 12 ; 4.15 << 12 = 4.27 ;

    ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
    ; Actually 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16. No need for post shift.
    mul esi

    ; Have to do (abs(iWnOld - iWn0) > 0x20) code here.
    sub edi, edx

    ; These four lines are abs code.
    mov eax, edi
    sar eax, 31
    xor edi, eax
    sub edi, eax

    cmp edi, 020h ;Assuming that loop will only happen once.
    jbe ExitSpecWLoop`'d_WDIVcnt`'

    ; Reload eax with iOoW.
    mov eax, XpS(iOoW)
    jmp SpecW`'d_WDIVcnt`'Loop1
    ;}
    ;else
    ;{
DontDoSpecialW`'d_WDIVcnt`':
    ; Everything should be positive in Non-SpecialW case.

    ;INT32 iWn1;
    mov esi, (1 SHL 16)
    mov edi, edx

    ; This should be close to one so Low bits are most important.
    ;iWn1 = (iOoW*iWn0)>>15; // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
    mul edx

    ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
    sub esi, edx

    ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
    shl esi, 15 ; 0.16.15 << 15 = 0.2.30

    mov eax, esi
    ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
    mul edi ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14

    shl edx, 2 ; 1.17.14 << 2 = 1.15.16
    ;}
    ;}

ExitSpecWLoop`'d_WDIVcnt`':

    ;pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
    ;pS->iW = iWn0;
    mov XpS(iW), edx
    mov esi, edx ; Save W for multiplying by UoW and VoW
    sub edx, LastW
    mov XpCtxSI(iDW), edx

    ;pCtx->SI.iSpecialW += 1; // this is supposed to wrap past 0x7fff sometimes
    inc word ptr XpCtxSI(iSpecialW)
')
|
|
dnl
|
|
|
|
dnl d_TexAddr($1, $2, $3, $4, $5)
dnl
dnl Expands to one public routine labelled _MMX_TexAddr_$2_$3_$4_$5.
dnl $1 only appears in comments inside the body; the texture index actually
dnl used by the generated code is iTex.
dnl
dnl Every variant first loads esi with the pTexture table entry for iTex.
dnl
dnl $4 = MaybeBilinear
dnl   The routine is a dispatcher only.  It combines the sign of iLOD with the
dnl   result of comparing uMagFilter against memD3DTFG_POINT and tail-jumps to
dnl   the Point or the Bilinear routine that has the same $2 $3 $5.
dnl
dnl $4 = Point or Bilinear
dnl   1. $5 = LOD only: iLOD >> 11 is clamped to the range 0..cLOD and kept in
dnl      eax and mm3.
dnl   2. The TexUV pair for iTex is shifted down to texel coordinates, keeping
dnl      6 fraction bits for Bilinear.
dnl   3. The coordinates are wrapped/mirrored ($2 = TexAddrWrapMirror) or
dnl      wrapped/mirrored/clamped ($2 = TexAddrAll).
dnl   4. pmaddwd with the (pitch, 1) word pairs in mm5 turns each V,U word pair
dnl      into one texel offset.
dnl   5. The pfnTexRead entry for iTex is called once (Point) or four times
dnl      (Bilinear).  At each call eax holds the offset and edi the pBits
dnl      pointer; the code after each call takes the texel from mm1.
dnl   6. The packed result is stored to the TexCol slot for iTex.
dnl   7. UoW/VoW are stepped.  When $5 = LOD the LOD is stepped for iTex = 0
dnl      only.  When $3 = Persp, OoW is stepped and W recomputed for iTex = 0
dnl      only (other textures reload W from iW) and TexUV is recomputed as
dnl      UoW*W, VoW*W.  Otherwise TexUV is the shifted UoW/VoW pair.
dnl   8. The routine ends with ret.
define(`d_TexAddr', `
|
|
;---------------------------------------------------------------------------
|
|
;void TexAddr_$2_$3_$4_$5(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
|
|
; PD3DI_RASTSPAN pS, INT32 iTex)
|
|
;{
|
|
PUBLIC _MMX_TexAddr_$2_$3_$4_$5
|
|
_MMX_TexAddr_$2_$3_$4_$5:
|
|
|
|
;PD3DI_SPANTEX pTex = &pCtx->pTexture[$1];
|
|
mov edx, iTex
|
|
mov esi, XpCtx(pTexture + edx * SIZEOF_PSPANTEX)
|
|
|
|
ifelse(`$4', `MaybeBilinear', `
|
|
; In maybe bilinear just jump to point or bi-linear depending on iLOD.
|
|
define(`d_MaybeBilinearcnt', eval(d_MaybeBilinearcnt+1))dnl
|
|
;if ((((UINT16)pS->iLOD) >> 15) ^ (INT16)(pTex->uMagFilter == D3DTFG_POINT))
|
|
;{
|
|
; TODO check to see if MMX really needed here.
|
|
movd mm1, XpTex(uMagFilter)
|
|
pcmpeqd mm1, mmword ptr memD3DTFG_POINT
|
|
|
|
movsx edx, word ptr XpS(iLOD)
|
|
movd eax, mm1
|
|
sar edx, 15 ; Generates mask based on sign.
|
|
xor edx, eax
|
|
|
|
jz DoPoint`'d_MaybeBilinearcnt
|
|
|
|
;// if magnify matches Mag filter, bilinear, else point
|
|
;C_TexAddr_$2_$3_Bilinear_$5(pCtx, pP, pS);
|
|
jmp _MMX_TexAddr_$2_$3_Bilinear_$5
|
|
DoPoint`'d_MaybeBilinearcnt`':
|
|
jmp _MMX_TexAddr_$2_$3_Point_$5
|
|
', `
|
|
ifelse(`$5', `LOD', `
|
|
;INT16 iLOD0 = min(max(pS->iLOD >> 11, 0), pTex->cLOD);
|
|
|
|
movsx eax, word ptr XpS(iLOD)
|
|
sar eax, 11
|
|
|
|
mov edx, eax
|
|
sar edx, 31
|
|
xor edx, 0ffffffffh
|
|
and eax, edx
|
|
define(`d_MaxCLODcnt', eval(d_MaxCLODcnt+1))dnl
|
|
cmp eax, XpTex(cLOD)
|
|
jb NotMax`'d_MaxCLODcnt`'
|
|
mov eax, XpTex(cLOD)
|
|
|
|
NotMax`'d_MaxCLODcnt`':
|
|
|
|
movd mm3, eax
|
|
')
|
|
; ----------------------------------------
|
|
; Doing UV calculation a little more accurate
|
|
; Exactly like C code.
|
|
|
|
; I shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
|
|
; (TEX_FINAL_SHIFT - iShiftU0 - 6). iShiftU0 = pTex->iShiftU - iLOD0
|
|
; (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
|
|
; (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)
|
|
|
|
; COMMENT1**
|
|
; If textures have a max of 1024 then shiftU0 would be at most 10 which would
|
|
; make (TEXT_FINAL_SHIFT - iShiftU - 6) at most zero. This is why I choose 6
|
|
; It will also give bi-linear 6 bits of precision I think it was said that
|
|
; only five was needed.
|
|
;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
|
|
;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
|
|
movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
|
|
ifelse(`$5', `NoLOD', `
|
|
;iLOD0 is zero so no subtraction needed and LOD doesnt need to be subtracted from U and V.
|
|
', `
|
|
punpcklwd mm3, mm3 ; Make two copys of iLOD to subtract U and V
|
|
')dnl
|
|
movd mm4, XpTex(iShiftU)
|
|
ifelse(`$5', `LOD', `
|
|
psubw mm4, mm3
|
|
')dnl
|
|
psubw mm5, mm4
|
|
movq mm4, mm5
|
|
pand mm5, MMWORD PTR Val0xffff
|
|
ifelse(`$5', `LOD', `
|
|
pand mm3, MMWORD PTR Val0xffff ; Make iLOD back to only one copy
|
|
')
|
|
psrld mm4, 16
|
|
|
|
mov edx, iTex
|
|
movd mm1, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION)
|
|
psrad mm1, mm5
|
|
movd mm2, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION + SIZEOF_INT32)
|
|
psrad mm2, mm4
|
|
|
|
punpckldq mm1, mm2
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
psubd mm1, MMWORD PTR Val0x0000002000000020
|
|
')
|
|
|
|
; Texture Pitch cannot be calculated so it must be looked up in the iShiftPitch table
|
|
; ----------------- Start of hack
|
|
; ATTENTION This is really hacked right now. Just to get it working
|
|
; Pitch would be better for me, instead of shift pitch.
|
|
; With actual pitch, this would be two moves and a shift.
|
|
;shl eax, 1
|
|
ifelse(`$5', `LOD', `
|
|
movzx edx, word ptr XpTex(iShiftPitch+eax*2)
|
|
', `
|
|
movzx edx, word ptr XpTex(iShiftPitch)
|
|
')dnl
|
|
add edx, 16
|
|
movd mm2, edx
|
|
movq mm5, MMWORD ptr Makelow16one
|
|
pslld mm5, mm2
|
|
|
|
;pslld mm5, 16 ;. Use this after hack.
|
|
; not needed in hacked version since i add to shifted value.
|
|
; ----------------- End of hack
|
|
|
|
por mm5, MMWORD ptr Makelow16one
|
|
; Make the low 16 bits of dword one
|
|
; This helps in calculating texture address.
|
|
|
|
|
|
|
|
; Gets U and V value into mm1 so that it can be mirrored, wrapped or
|
|
; clamped. This can be done for two values in the point case
|
|
; or four values in the bilinear case.
|
|
|
|
ifelse(`$4', `Point', `
|
|
;iU00 >>= 6;
|
|
;iV00 >>= 6;
|
|
psrad mm1, 6
|
|
packssdw mm1, mm1 ; Value needs to be packed since all wrap/mirror
|
|
; operations assume UV in low 32 bits.
|
|
', `
|
|
;INT32 iUFrac = iU00 & 0x03f;
|
|
;INT32 iVFrac = iV00 & 0x03f;
|
|
;iU00 >>= 6;
|
|
;iV00 >>= 6;
|
|
movq mm2, mm1
|
|
psrad mm1, 6
|
|
;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
|
|
pand mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f
|
|
|
|
; Going to use only 8 bits for bi-linear so that I can do a pmullw.
|
|
; Currently at 6 bits so shift up by 2.
|
|
psllw mm2, 2
|
|
|
|
movq mm0, mm2
|
|
; Replicate VFrac value for bilinear
|
|
punpckhwd mm2, mm2
|
|
punpcklwd mm2, mm2
|
|
|
|
; Replicate UFrac Value for bilinear
|
|
punpcklwd mm0, mm0
|
|
punpcklwd mm0, mm0
|
|
|
|
movq dword ptr VFrac, mm2
|
|
movq dword ptr UFrac, mm0
|
|
|
|
;INT32 iU01 = iU00 + 1;
|
|
;INT32 iV01 = iV00 + 1;
|
|
packssdw mm1, mm1 ; replicate U and V value to upper 16 bit locations
|
|
paddw mm1, dword ptr IncHighandLow16
|
|
; This will make texture values be (High word to low word):
|
|
; iV01, iU00, iV00, iU01
|
|
; Need to do this to make texture look up for bilinear easier.
|
|
; I have to combine to get all combinations anyway. It just
|
|
; happens to be better for me to have iV00, iU01 pair first.
|
|
')
|
|
;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
|
|
; put mask in mm3 and replicate to match location for wrap/mirror/clamp
|
|
movd mm0, XpTex(uMaskU) ; Load U and V mask
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
; replicate mask if doing bilinear
|
|
punpckldq mm0, mm0
|
|
')
|
|
|
|
ifelse(`$5', `NoLOD', `
|
|
; iLOD0 is zero so no shift needed.
|
|
' , `
|
|
; iLOD0 shift value left over from above. TBD. Put this in in mip case
|
|
; Could do this one before or after the unpack also.
|
|
psrlw mm0, mm3
|
|
')
|
|
|
|
ifelse(`$2', `TexAddrWrapMirror', `
|
|
;INT16 iFlip;
|
|
; MM1 should contain 16 bit iU and iV for both texture locations
|
|
; End Result is MM1 value wrapped or mirrored
|
|
; in Bilinear Case, four values can be done
|
|
; iU00, iV00, iU01, iV01
|
|
; This code really does alot for the bilinear case and is kinda wasteful
|
|
; in the normal mode.
|
|
|
|
;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
|
|
movq mm7, mm1
|
|
; Point doesnt need replication
|
|
movd mm4, XpTex(iFlipMaskU)
|
|
; if bilinear replicate values together, Point doesnt need this.
|
|
ifelse(`$4', `Bilinear', `
|
|
punpckldq mm4, mm4
|
|
')
|
|
ifelse(`$5', `NoLOD', `
|
|
; iLOD0 is zero so no shift needed.
|
|
' , `
|
|
psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
|
|
')
|
|
pand mm7, mm4
|
|
|
|
;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
|
|
pcmpeqw mm7, MMWORD PTR Zero
|
|
|
|
;iFlip1 = uMaskU0 & ~ iFlip1; ;iFlip2 = uMaskV0 & ~ iFlip2; ;iFlip3 = uMaskU0 & ~ iFlip3; ;iFlip4 = uMaskV0 & ~ iFlip4;
|
|
pandn mm7, mm0
|
|
|
|
;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
|
|
pand mm1, mm0
|
|
|
|
;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
|
|
pxor mm1, mm7
|
|
|
|
; Result in mm4 now since TexAddrAll ends up that way.
|
|
; Still need to look at register usage more.
|
|
movq mm4, mm1
|
|
') dnl
|
|
|
|
ifelse(`$2', `TexAddrAll', `
|
|
;INT16 iFlip, iClamp1, iClamp2, iClampMinT, iClampMaxT;
|
|
;INT16 iUoWAdj = (INT16)(pS->iUoW`'d_TexNum >> 12); // adjust to match W
|
|
;INT16 iVoWAdj = (INT16)(pS->iVoW`'d_TexNum >> 12);
|
|
;movq mm6, XpS(UVoW + iTex * SIZEOF_UV_UNION)
|
|
|
|
;movq mm6, MMWORD PTR Zero
|
|
pxor mm6, mm6
|
|
|
|
; TBD Data in SPANTEX needs to be rearange to make life simpler.
|
|
; I have rearranged some of it, but there still needs to be some
|
|
; fixes to it.
|
|
|
|
;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
|
|
movq mm7, mm1
|
|
movd mm4, XpTex(iFlipMaskU) ; This should copy U and V mask at the same time.
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
dnl Only replicate if U and V if doing bilinear
|
|
punpckldq mm4, mm4 ; copy UV
|
|
')
|
|
ifelse(`$5', `NoLOD', `
|
|
; iLOD0 is zero so no shift needed.
|
|
' , `
|
|
psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
|
|
')
|
|
pand mm7, mm4
|
|
|
|
;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
|
|
pcmpeqw mm7, MMWORD PTR Zero
|
|
|
|
;iFlip1 = uMaskU0 &~ iFlip1; ;iFlip2 = uMaskV0 &~ iFlip2; ;iFlip3 = uMaskU0 &~ iFlip3; ;iFlip4 = uMaskV0 &~ iFlip4;
|
|
pandn mm7, mm0
|
|
|
|
;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
|
|
pand mm1, mm0
|
|
|
|
;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
|
|
pxor mm1, mm7
|
|
|
|
;iClamp11 = MMX_cmpgtw(0, iUoWAdj); ;iClamp12 = MMX_cmpgtw(0, iVoWAdj);
|
|
mov edx, iTex
|
|
pcmpgtd mm6, XpS(UVoW + edx * SIZEOF_UV_UNION)
|
|
packssdw mm6, mm6
|
|
|
|
;iClamp21 = MMX_cmpgtw(iOoWAdj, iUoWAdj); ;iClamp22 = MMX_cmpgtw(iOoWAdj, iVoWAdj);
|
|
movd mm7, XpS(iOoW)
|
|
punpckldq mm7, mm7 ; Make a copy of OoW to compare both UoW and VoW.
|
|
psrld mm7, 11 ; Make OoWs Precision Match UoWs.
|
|
pcmpgtd mm7, XpS(UVoW + edx * SIZEOF_UV_UNION)
|
|
packssdw mm7, mm7
|
|
|
|
;iClampMinT1 = pTex->iClampMinU & iClamp11; ;iClampMinT2 = pTex->iClampMinV & iClamp12; ;iClampMinT3 = pTex->iClampMinU & iClamp13; ;iClampMinT4 = pTex->iClampMinV & iClamp14;
|
|
movd mm0, XpTex(iClampMinU)
|
|
ifelse(`$4', `Bilinear', `
|
|
punpckldq mm0, mm0
|
|
')
|
|
pand mm0, mm6
|
|
|
|
; Save clamp2 because pandn will destroy value.
|
|
movq mm4, mm7
|
|
|
|
;iClampMaxT1 = pTex->iClampMaxU &~ iClamp21; ;iClampMaxT2 = pTex->iClampMaxV &~ iClamp22; ;iClampMaxT3 = pTex->iClampMaxU &~ iClamp23; ;iClampMaxT4 = pTex->iClampMaxV &~ iClamp24;
|
|
movd mm2, XpTex(iClampMaxU)
|
|
ifelse(`$4', `Bilinear', `
|
|
punpckldq mm2, mm2
|
|
')
|
|
ifelse(`$5', `NoLOD', `
|
|
; iLOD0 is zero so no shift needed.
|
|
' , `
|
|
psraw mm2, mm3 ; Shifts clamp max to correct bit location
|
|
')
|
|
pandn mm7, mm2 ; Since iClamp2 is already negated, I can just do an AND.
|
|
|
|
;iClamp21 &= ~iClamp11; ;iClamp22 &= ~iClamp12; ;iClamp23 &= ~iClamp13; ;iClamp24 &= ~iClamp14;
|
|
pandn mm6, mm4
|
|
|
|
;iClamp21 = pTex->iClampEnU &~ iClamp21; ;iClamp22 = pTex->iClampEnU &~ iClamp22; ;iClamp23 = pTex->iClampEnU &~ iClamp23; ;iClamp24 = pTex->iClampEnU &~ iClamp24;
|
|
movd mm2, XpTex(iClampEnU)
|
|
ifelse(`$4', `Bilinear', `
|
|
punpckldq mm2, mm2
|
|
')
|
|
pandn mm6, mm2
|
|
|
|
;iU00 &= ~iClamp21; ;iV00 &= ~iClamp22; ;iU01 &= ~iClamp23; ;iV01 &= ~iClamp24;
|
|
pandn mm6, mm1
|
|
|
|
;iU00 |= iClampMinT1; ;iV00 |= iClampMinT2; ;iU01 |= iClampMinT3; ;iV01 |= iClampMinT4;
|
|
por mm6, mm0
|
|
|
|
;iU00 |= iClampMaxT1; ;iV00 |= iClampMaxT2; ;iU01 |= iClampMaxT3; ;iV01 |= iClampMaxT4;
|
|
por mm6, mm7
|
|
movq mm4, mm6
|
|
') dnl
|
|
|
|
; Making other two cases for texture addressing has to be simpler than
|
|
; this and not use so many registers. Puts U1 V0 U0 V1 into mm3.
|
|
; TBD Make this better.
|
|
; values are still stored as iV01, iU00, iV00, iU01
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
movq mm2, mm4
|
|
movq mm3, mm4
|
|
') dnl Bilinear
|
|
|
|
dnl ifelse(`$2', `TexAddrAll', `
|
|
movq mm0, mm4
|
|
dnl ') dnl border code
|
|
|
|
pmaddwd mm4, mm5 ; Throw in first address calculation.
|
|
; Just to get it started. Calculate
|
|
; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
|
|
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
; values are being changed to iV01, iU01, iV00, iU00
|
|
; seven instructions for this seems excessive.
|
|
pand mm2, MMWORD ptr MaskKeepUValues
|
|
pand mm3, MMWORD ptr MaskKeepVValues
|
|
movq mm1, mm2
|
|
psllq mm2, 32
|
|
psrlq mm1, 32
|
|
por mm3, mm2
|
|
por mm3, mm1
|
|
') dnl Bilinear
|
|
|
|
; From here until mov edi is code that is needed for border.
|
|
; all sign bits are stored in bytes so that border code can tell if uv went below zero.
|
|
dnl ifelse(`$2', `TexAddrAll', `
|
|
ifelse(`$4', `Point', `
|
|
; Point needs to be in same format as bilinear for border
|
|
packsswb mm0, mm0
|
|
') dnl point
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
; mm0 = iV01, iU00, iV00, iU01
|
|
; mm3 = iV01, iU01, iV00, iU00
|
|
; Need to rearrange values to be like so v1 u0 v1 u1 v0 u0 v0 u1 in bytes
|
|
; This is really bad. Just doing whatever to get it to work.
|
|
movq mm1, mm0
|
|
punpckldq mm1, mm3 ; This will make mm1 = v0 u0 v0 u1
|
|
movq mm2, mm3
|
|
punpckhdq mm2, mm0 ; This will make mm2 = v1 u0 v1 u1
|
|
packsswb mm1, mm2
|
|
movq mm0, mm1
|
|
') dnl Bilinear
|
|
dnl ') dnl TexAddrAll
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
pmaddwd mm3, mm5 ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
|
|
') dnl Bilinear
|
|
|
|
dnl ; Load pTex->pBits[iLOD0] into esi. It will be needed.
|
|
dnl ; Convient that eax is still around as iLOD0. TBD make sure eax positive.
|
|
|
|
ifelse(`$5', `NoLOD', `
|
|
mov edi, XpTex(pBits)
|
|
',`
|
|
mov edi, XpTex(pBits+eax*4)
|
|
')dnl
|
|
; was esi. Cant change to esi because it is the pointer to pTex
|
|
; which is used by Border and ColorKey. Use edi for now and
|
|
; call routines through memory. Figure out if this is bad.
|
|
|
|
; load the read texture routine address into a register early
|
|
;mov edi, XpCtx(pfnTexRead + $1*SIZEOF_PFNTEXREAD)
|
|
|
|
ifelse(`$4', `Bilinear', `
|
|
; iV0 iU1 address should be done by now.
|
|
movd eax, mm4
|
|
|
|
;UINT32 uTex00 = pCtx->pfnTexRead[$1](iU00, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
|
|
; Combine U and V values before making call.
|
|
;call edi
|
|
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
|
|
|
|
movd eax, mm3
|
|
movq mm7, mm1 ; Put TColor[iU0, uV0] in mm7
|
|
|
|
;UINT32 uTex10 = pCtx->pfnTexRead[$1](iU01, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
|
|
|
|
psrlq mm3, 32
|
|
psubw mm7, mm1
|
|
psllw mm1, 8
|
|
pmullw mm7, dword ptr UFrac
|
|
paddw mm7, mm1 ; Should I copy mm1 to another variable and do shift/add later?
|
|
movd eax, mm3
|
|
|
|
;UINT32 uTex01 = pCtx->pfnTexRead[$1](iU00, iV01, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
|
|
|
|
psrlq mm4, 32
|
|
movq mm6, mm1
|
|
movd eax, mm4
|
|
;UINT32 uTex11 = pCtx->pfnTexRead[$1](iU01, iV01, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
|
|
|
|
;TexFiltBilinear(&pCtx->SI.TexCol[$1], iUFrac, iVFrac, uTex00, uTex10, uTex01, uTex11);
|
|
; The amount of shifting instructions for this makes the other approach
|
|
; look pretty good.
|
|
psubw mm6, mm1
|
|
psllw mm1, 8
|
|
pmullw mm6, dword ptr UFrac ; TBD explain this code better.
|
|
movq mm4, mm7
|
|
paddw mm6, mm1
|
|
psrlw mm6, 8
|
|
psrlw mm7, 8
|
|
psubw mm6, mm7
|
|
pmullw mm6, dword ptr VFrac
|
|
paddw mm4, mm6
|
|
psrlw mm4, 8
|
|
|
|
; TBD shouldnt have to pack and then unpack later. Should keep in a register
|
|
packuswb mm4, mm4
|
|
mov edx, iTex
|
|
movd XpCtxSI(TexCol+edx*4), mm4
|
|
|
|
') dnl
|
|
|
|
ifelse(`$4', `Point', `
|
|
|
|
; iV0 iU1 address should be done by now.
|
|
movd eax, mm4
|
|
|
|
;pCtx->SI.TexCol[$1] = pCtx->pfnTexRead[$1](iU00, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
|
|
|
|
; TBD Currently have to pack and then unpack later. Should be able
|
|
; to leave the value in some register for a while. I would think.
|
|
packuswb mm1, mm1
|
|
mov edx, iTex
|
|
movd XpCtxSI(TexCol+edx*4), mm1
|
|
') dnl
|
|
|
|
dnl only do update code in non-monolithic case. Monolithic code updates are done
|
|
dnl by tstfail routine.
|
|
|
|
push edi
|
|
mov edi, iTex
|
|
d_UpdateUoWandVoW()
|
|
pop edi
|
|
|
|
ifelse(`$5', `LOD', `
|
|
cmp iTex, 0
|
|
jne SkipLOD$2$3$4$5
|
|
d_UpdateLOD()
|
|
SkipLOD$2$3$4$5:
|
|
')
|
|
|
|
ifelse(`$3', `Persp', `
|
|
|
|
cmp iTex, 0
|
|
jne TexStoreW$2$3$4$5
|
|
d_UpdateOoW()
|
|
|
|
;pS->iW = 0x00800000/(pS->iOoW>>16); // 9.23/1.15 = 8.8
|
|
d_WDivide()
|
|
jmp Tex$2$3$4$5
|
|
|
|
TexStoreW$2$3$4$5:
|
|
; In Texaddr1, W is calculated and result is in esi. I need to get the W value back into esi for the multiply.
|
|
mov esi, XpS(iW)
|
|
|
|
Tex$2$3$4$5:
|
|
push edi
|
|
mov edi, iTex
|
|
d_UoWVoWTimesW()
|
|
pop edi
|
|
', `
|
|
|
|
push edi
|
|
mov edi, iTex
|
|
d_UpdateNonPersp()
|
|
pop edi
|
|
|
|
')
|
|
; load the next bead address into a register early. Not early anymore
|
|
; since so much regular non-mmx code being done for WDIV
|
|
; mov eax, XpCtx(pfnTex`'d_TexNum`'AddrEnd)
|
|
|
|
; pCtx->pfnTex`'d_TexNum`'AddrEnd(pCtx, pP, pS);
|
|
; jmp eax
|
|
|
|
; We now need to return
|
|
ret
|
|
|
|
')')
|
|
dnl
|
|
dnl
|
|
dnl d_TexAddrHdr
|
|
dnl
|
|
dnl Generates headers with the same format as d_TexAddr
|
|
dnl
|
|
dnl Expands to the C prototype  void MMX_TexAddr_$2_$3_$4_$5(...);  for one
dnl generated routine.  It takes the same five arguments as d_TexAddr; $1 is
dnl accepted but not used in the expansion.
define(`d_TexAddrHdr', `
|
|
void MMX_TexAddr_$2_$3_$4_$5(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
|
|
PD3DI_RASTSPAN pS, INT32 iTex);')dnl
|
|
dnl
|