|
|
dnl-----------------------------------------------------------------------------
dnl
dnl This file contains the macro for generating texture addressing routines.
dnl
dnl-----------------------------------------------------------------------------
dnl
dnl
dnl
dnl d_TexAddr
dnl
dnl Generates all the differentiated texture address routines.
dnl
dnl It takes 5 parameters.
dnl
dnl $1 is one of 0 or 1.  0 is single texture, and 1 is the first multi-texture
dnl $2 is one of TexAddrWrapMirror TexAddrAll
dnl $3 is one of NoPersp Persp
dnl $4 is one of Point Bilinear MaybeBilinear
dnl $5 is one of NoLOD LOD
dnl
dnl Note that even when we are not mip mapping, we use iLOD to get to the nearest mip map
dnl (so iLOD must be 0 if the texture has no mip levels)
dnl
dnl Jump counts.  Each counter is bumped with eval just before the labels of
dnl one macro expansion are emitted and is pasted into those label names, so
dnl repeated expansions never produce the same label twice.
dnl   d_WDIVcnt          labels generated by d_WDivide
dnl   d_MaybeBilinearcnt DoPoint labels of the MaybeBilinear dispatch stub
dnl   d_MaxCLODcnt       NotMax labels of the LOD clamp in d_TexAddr
define(`d_WDIVcnt', 0)dnl
define(`d_MaybeBilinearcnt', 0)dnl
define(`d_MaxCLODcnt', 0)dnl
dnl dnl Variables needed for texturing dnl dnl
define(`texaddraVars', `
; Scratch locations that the texture address code writes and reads back.
EXTERN UFrac:MMWORD                     ; replicated U fraction for bilinear
EXTERN VFrac:MMWORD                     ; replicated V fraction for bilinear
EXTERN LastW:MMWORD                     ; previous iW kept across the W divide
EXTERN GiveUp:MMWORD                    ; iteration budget of the special W loop

; Masks that are only read by the texture address code.
EXTERN UFracVFracMask:MMWORD            ; 0x0000003f0000003f
EXTERN MaskKeepUValues:MMWORD
EXTERN MaskKeepVValues:MMWORD
EXTERN UV32to15Mask:MMWORD              ; not referenced by the macros of this file
EXTERN Val0xffff:MMWORD
EXTERN Val0x0000ffff0000ffff:MMWORD     ; only appears in a disabled instruction

; Constants that are only read by the texture address code.
EXTERN Zero:MMWORD
EXTERN Makelow16one:MMWORD              ; makes the low 16 bits of each dword one
EXTERN IncHighandLow16:MMWORD           ; added to the packed UV words for bilinear
EXTERN memD3DTFG_POINT:MMWORD           ; compared against uMagFilter
EXTERN Val0x000a000a:MMWORD             ; TEX_FINAL_SHIFT - 6 = 10 in both words
EXTERN Val0x0000002000000020:MMWORD     ; subtracted from U and V for bilinear
')
dnl
dnl d_UpdateUoWandVoW
dnl   Steps the packed UoW and VoW pair of one texture by its per pixel
dnl   delta.  Shared by several files.
dnl
dnl   In:   edi = texture index used to select the UV union
dnl   Out:  mm5 = updated UoW and VoW pair, also written back to the span
dnl
define(`d_UpdateUoWandVoW', `
    ; pS UoW += pP DUoWDX and pS VoW += pP DVoWDX, both dwords in one add
    movq    mm5, XpP(DUVoWDX + edi * SIZEOF_UV_UNION)   ; mm5 = deltas
    paddd   mm5, XpS(UVoW + edi * SIZEOF_UV_UNION)      ; mm5 = current + deltas
    movq    XpS(UVoW + edi * SIZEOF_UV_UNION), mm5      ; store updated pair
')
dnl
dnl d_UpdateLOD
dnl   Adds the 16 bit LOD delta to the 16 bit LOD of the span.
dnl   Leaves the new LOD in ax with the upper half of eax cleared.
dnl
define(`d_UpdateLOD', `
    ; pS iLOD += pS iDLOD
    ; Seems like this should be done with something else
    ; i.e. group 16 bit adds together.
    movzx   eax, word ptr XpS(iLOD)     ; eax = zero extended current LOD
    add     ax, XpS(iDLOD)              ; 16 bit add, wraps like the C INT16
    mov     XpS(iLOD), ax
')
dnl
dnl d_UpdateOoW
dnl   Steps the span one over W value by its per pixel delta.
dnl   Out: eax = updated iOoW.  d_WDivide expects to find it there.
dnl
define(`d_UpdateOoW', `
    ; pS iOoW += pP iDOoWDX
    mov     eax, XpP(iDOoWDX)           ; eax = delta
    add     eax, XpS(iOoW)              ; eax = delta + current value
    mov     XpS(iOoW), eax              ; store, result stays live in eax
')
dnl d_UoWVowTimesW is so that I have same code in several different locations.
dnl These four locations are texaddr1.mas, tstfail.mas, Setup Code (Monolithic and regular)
dnl
dnl mm5 is UoW and VoW for either texture 1 or texture two
dnl esi is W
dnl $1 is 1 or 2 depending on result is for texture one or texture 2
dnl
dnl
dnl Does integer W * U or V computation
dnl
dnl d_WTimesUVoW only produces text: it expands to imul32h_s20 applied to
dnl its two arguments.  In this file it is used inside assembly comments to
dnl spell out the C expression that the following instructions implement.
dnl
define(`d_WTimesUVoW', `imul32h_s20(($1), ($2))')dnl
dnl
dnl iW = 1.15.16 << 4 = 1.11.20
dnl UoW = 1.11.20 << 8 = 1.2.28
dnl
dnl 1.11.20 * 1.3.28 == 1.15.48 >> 32 == 1.15.16

dnl Reference C for imul32h_s20: the 64 bit product shifted right by 20.
dnl inline INT32 imul32h_s20(INT32 x, INT32 y)
dnl {
dnl #ifdef _X86_
dnl _asm
dnl {
dnl mov eax, x
dnl mov edx, y
dnl imul edx
dnl shr eax, 20
dnl shl edx, 12
dnl and eax, 000000fffh
dnl and edx, 0fffff000h
dnl or eax, edx
dnl }
dnl #else
dnl return (INT32)(((LONGLONG)x * y) >> 20);
dnl #endif
dnl }
dnl
dnl d_UoWVoWTimesW
dnl   Multiplies UoW and VoW by W and stores the resulting U and V.
dnl
dnl   In:   mm5 = VoW in the high dword and UoW in the low dword
dnl         esi = W
dnl         edi = texture index used to select the UV union
dnl   Out:  TexUV of that texture holds U then V
dnl   Overwrites eax, edx, mm5 and the flags.
dnl
define(`d_UoWVoWTimesW', `
    ; U = imul32h_s20(iW, iUoW), the 64 bit product shifted right by 20
    movd    eax, mm5                    ; eax = UoW
    imul    esi                         ; edx:eax = UoW * W
    shrd    eax, edx, 20
    mov     XpCtxSI(TexUV + edi * SIZEOF_UV_UNION), eax

    ; V = imul32h_s20(iW, iVoW)
    psrlq   mm5, 32                     ; bring VoW down to the low dword
    movd    eax, mm5                    ; eax = VoW
    imul    esi                         ; edx:eax = VoW * W
    shrd    eax, edx, 20
    mov     XpCtxSI(TexUV + edi * SIZEOF_UV_UNION + SIZEOF_INT32), eax
')
dnl
dnl d_UpdateNonPersp
dnl   Non perspective case: the stepped UoW and VoW values are the texture
dnl   coordinates themselves, they only need the final precision shift.
dnl
dnl   In:   mm5 = packed iUoW and iVoW
dnl         edi = texture index used to select the UV union
dnl   Out:  TexUV of that texture holds the shifted pair, mm5 is overwritten
dnl
define(`d_UpdateNonPersp', `
    ;pCtx->SI.iU`'$1`' = pS->iUoW`'$1`'>>TEX_TO_FINAL_SHIFT; // 1.11.20 >> 4 == 1.15.16
    ;pCtx->SI.iV`'$1`' = pS->iVoW`'$1`'>>TEX_TO_FINAL_SHIFT;
    ; mm5 still contains iUoW and iVoW which are the iU and iV values for
    ; non perspective correct.
    ; Arithmetic shift of both dwords at once keeps the sign of each value.
    psrad mm5, TEX_TO_FINAL_SHIFT
    movq XpCtxSI(TexUV + edi * SIZEOF_UV_UNION), mm5
')
dnl d_WDivide
dnl
dnl Does incremental W divide calculation
dnl
dnl   In:   eax = current pS->iOoW
dnl   Out:  esi = new W, also stored to pS->iW
dnl         SI.iDW = new W minus the previous W
dnl         SI.iSpecialW incremented as a 16 bit value
dnl   Overwrites eax, edx, esi, edi and the flags.
dnl   Uses LastW and GiveUp as scratch memory.
dnl
dnl   The first guess for W is iW + iDW.  While iSpecialW is negative the
dnl   guess is refined in a loop that runs until two successive results are
dnl   within 0x20 of each other or the iteration budget is used up.  In that
dnl   path a negative first guess is replaced by iW / 2.  Otherwise a single
dnl   refinement step is applied.
dnl
define(`d_WDivide', `
dnl Increment counter for jump address calc stuff.
define(`d_WDIVcnt', eval(d_WDIVcnt+1))dnl
dnl This was deemed too annoying
dnl #if DBG
dnl if (iOoW <= 0)
dnl {
dnl D3D_WARN(0, "WDivide, iOoW (%d) out of Range!", iOoW);
dnl DDASSERT(0);
dnl }
dnl #endif
dnl Note: iOoW comes in as eax.  So it is ready for first multiply
dnl iOoW is actual iOoW value in 1.31 form instead of 1.15 form. good.
dnl In SpecialW case I have to reload it at the end.

    ;INT32 iWn0 = pS->iW + pCtx->SI.iDW;     // 1.15.16
    ; TODO Could do this and OoW Add at same time with MMX.

    mov     edx, XpS(iW)
    mov     LastW, edx          ; Save iW to calc iDW for next time.
    add     edx, XpCtxSI(iDW)

    ;if (pCtx->SI.iSpecialW < 0)
    ;{
    xor     edi, edi
    cmp     di, word ptr XpCtxSI(iSpecialW)
    jle     DontDoSpecialW`'d_WDIVcnt`'     ; 0 <= iSpecialW, take the fast path

    ; A DoSpecialW label used to live here.  Nothing references it any more.

    ;if (iWn0 < 0)
    ;{
    ; edi is still zero here.  Only a negative guess is replaced, so the
    ; fallback is skipped whenever the guess is zero or positive.
    cmp     edx, edi
    jge     WOutOfRange`'d_WDIVcnt`'
    ;iWn0 = pS->iW >> 1;         // use iW/2 as a guess, instead
    mov     edx, LastW
    sar     edx, 1
    ;}
WOutOfRange`'d_WDIVcnt`':

    ;VAL32 iWn1;
    ;INT16 iWnOld = iWn0 + 0x100;  // make sure while fails first time
    ; Dont need to make sure it fails.  I do a post test which guarantees
    ; it will execute once.

    ;INT32 iGiveUp = 7;
    mov     GiveUp, 8           ; Pre decrementing instead of post decrementing.
    ;while((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
    ;{
SpecW`'d_WDIVcnt`'Loop1:

    ; Could move this to bottom of loop and combine results somehow.
    ; TBD look at it more.
    dec     GiveUp
    jz      ExitSpecWLoop`'d_WDIVcnt`'      ; budget used up, keep edx as W

    ; Shift iOoW by one since imul cannot have sign bit set
    ; OoW cannot reach one, only 0x7fffffff
    ;shr    eax, 1      ; 1.31 >> 1 = 1.30

    ; Get ready to do Two minus iOoW*iW
    mov     esi, (1 SHL 16)

    ;iWnOld = iWn0;
    mov     edi, edx

    ; Result should be close to one so we want most of the
    ; precision in the low bits.  Need to give more bits
    ; leaway since these are the bad cases.
    ; iWn1 = imul32h(pS->iOoW, iWn0);  // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
    imul    edx

    ;iWn1 = (1L<<16) - iWn1;     // 2.0 - iWn1
    sub     esi, edx

    ;while(iWn1.i < 0)
    ;{
SpecW`'d_WDIVcnt`'Loop2:
    test    esi, esi
    jns     SpecW`'d_WDIVcnt`'ExitLoop2     ; This jump should be predicted correctly most of the time.
    ;iWn1=(iWn1+(1L<<15))>>1;    // iWn1 = (iWn1 + 1.0)/2
    add     esi, (1 SHL 15)
    sar     esi, 1
    jmp     SpecW`'d_WDIVcnt`'Loop2
    ;}

SpecW`'d_WDIVcnt`'ExitLoop2:

    ;iWn1 <<= 15;                // 1.16.15 << 15 = 1.1.30
    mov     eax, edi

    shl     eax, 5              ; 1.15.16 << 5 = 1.10.21  TBD Can I shift off upper bits??
    shl     esi, 12             ; 4.15 << 12 = 4.27

    ;iWn0 = imul32h(iWn1, iWn0)<<2;  // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
    ; Actually 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16.  No need for post shift.
    mul     esi

    ; Have to do (abs(iWnOld - iWn0) > 0x20) code here.
    sub     edi, edx

    ; These four lines are abs code.
    mov     eax, edi
    sar     eax, 31
    xor     edi, eax
    sub     edi, eax

    cmp     edi, 020h           ; Assuming that loop will only happen once.
    jbe     ExitSpecWLoop`'d_WDIVcnt`'

    ; Reload eax with iOoW.  The multiplies above overwrote it.
    mov     eax, XpS(iOoW)
    jmp     SpecW`'d_WDIVcnt`'Loop1
    ;}
    ;else
    ;{
DontDoSpecialW`'d_WDIVcnt`':
    ; Everything should be positive in Non-SpecialW case.

    ;INT32 iWn1;
    mov     esi, (1 SHL 16)
    mov     edi, edx

    ; This should be close to one so Low bits are most important.
    ;iWn1 = (iOoW*iWn0)>>15;     // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
    mul     edx

    ;iWn1 = (1L<<16) - iWn1;     // 2.0 - iWn1
    sub     esi, edx

    ;iWn1 <<= 15;                // 1.16.15 << 15 = 1.1.30
    shl     esi, 15             ; 0.16.15 << 15 = 0.2.30

    mov     eax, esi
    ;iWn0 = imul32h(iWn1, iWn0)<<2;  // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
    mul     edi                 ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14

    shl     edx, 2              ; 1.17.14 << 2 = 1.15.16
    ;}
    ;}

ExitSpecWLoop`'d_WDIVcnt`':

    ;pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
    ;pS->iW = iWn0;
    mov     XpS(iW), edx
    mov     esi, edx            ; Save W for multiplying by UoW and VoW
    sub     edx, LastW
    mov     XpCtxSI(iDW), edx

    ;pCtx->SI.iSpecialW += 1;    // this is supposed to wrap past 0x7fff sometimes
    inc     word ptr XpCtxSI(iSpecialW)
')
dnl
define(`d_TexAddr', `
;---------------------------------------------------------------------------
;void TexAddr_$2_$3_$4_$5(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
;                PD3DI_RASTSPAN pS, INT32 iTex)
;{
; Register use that can be read off the code below:
;   esi = pTex, loaded from the pTexture table entry selected by iTex
;   edx = scratch, reloaded with iTex before each indexed access
;   eax = iLOD0 in LOD builds, later the texel address handed to pfnTexRead
;   edi = pBits pointer of the selected level
;   mm5 = address multiplier pair used by pmaddwd
; The texel colour is taken from mm1 after each pfnTexRead call.
; The MaybeBilinear flavour is only a stub that jumps to the Point or the
; Bilinear routine generated for the same other parameters.
PUBLIC _MMX_TexAddr_$2_$3_$4_$5
_MMX_TexAddr_$2_$3_$4_$5:

    ;PD3DI_SPANTEX pTex = &pCtx->pTexture[$1];
    mov edx, iTex
    mov esi, XpCtx(pTexture + edx * SIZEOF_PSPANTEX)

ifelse(`$4', `MaybeBilinear', `
    ; In maybe bilinear just jump to point or bi-linear depending on iLOD.
    define(`d_MaybeBilinearcnt', eval(d_MaybeBilinearcnt+1))dnl
    ;if ((((UINT16)pS->iLOD) >> 15) ^ (INT16)(pTex->uMagFilter == D3DTFG_POINT))
    ;{
    ; TODO check to see if MMX really needed here.
    movd mm1, XpTex(uMagFilter)
    pcmpeqd mm1, mmword ptr memD3DTFG_POINT

    movsx edx, word ptr XpS(iLOD)
    movd eax, mm1
    sar edx, 15 ; Generates mask based on sign.
    xor edx, eax

    ; Both masks equal means point sampling.
    jz DoPoint`'d_MaybeBilinearcnt

    ;// if magnify matches Mag filter, bilinear, else point
    ;C_TexAddr_$2_$3_Bilinear_$5(pCtx, pP, pS);
    jmp _MMX_TexAddr_$2_$3_Bilinear_$5
DoPoint`'d_MaybeBilinearcnt`':
    jmp _MMX_TexAddr_$2_$3_Point_$5
', `
ifelse(`$5', `LOD', `
    ;INT16 iLOD0 = min(max(pS->iLOD >> 11, 0), pTex->cLOD);

    movsx eax, word ptr XpS(iLOD)
    sar eax, 11

    ; max with zero: edx becomes all ones unless eax is negative
    mov edx, eax
    sar edx, 31
    xor edx, 0ffffffffh
    and eax, edx
define(`d_MaxCLODcnt', eval(d_MaxCLODcnt+1))dnl
    ; min with cLOD, eax is not negative any more so the unsigned test is fine
    cmp eax, XpTex(cLOD)
    jb NotMax`'d_MaxCLODcnt`'
    mov eax, XpTex(cLOD)

NotMax`'d_MaxCLODcnt`':

    movd mm3, eax
')
    ; ----------------------------------------
    ; Doing UV calculation a little more accurate
    ; Exactly like C code.

    ; I shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
    ; (TEX_FINAL_SHIFT - iShiftU0 - 6).  iShiftU0 = pTex->iShiftU - iLOD0
    ; (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
    ; (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)

    ; COMMENT1**
    ; If textures have a max of 1024 then shiftU0 would be at most 10 which would
    ; make (TEXT_FINAL_SHIFT - iShiftU - 6) at most zero.  This is why I choose 6
    ; It will also give bi-linear 6 bits of precision I think it was said that
    ; only five was needed.
    ;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
    ;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
    movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
ifelse(`$5', `NoLOD', `
    ;iLOD0 is zero so no subtraction needed and LOD doesnt need to be subtracted from U and V.
', `
    punpcklwd mm3, mm3 ; Make two copys of iLOD to subtract U and V
')dnl
    movd mm4, XpTex(iShiftU)
ifelse(`$5', `LOD', `
    psubw mm4, mm3
')dnl
    psubw mm5, mm4
    ; Split the two 16 bit shift counts: mm5 keeps the U count, mm4 the V count.
    movq mm4, mm5
    pand mm5, MMWORD PTR Val0xffff
ifelse(`$5', `LOD', `
    pand mm3, MMWORD PTR Val0xffff ; Make iLOD back to only one copy
')
    psrld mm4, 16

    mov edx, iTex
    movd mm1, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION)
    psrad mm1, mm5
    movd mm2, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION + SIZEOF_INT32)
    psrad mm2, mm4

    punpckldq mm1, mm2

ifelse(`$4', `Bilinear', `
    psubd mm1, MMWORD PTR Val0x0000002000000020
')

    ; Texture Pitch cannot be calculated so it must be looked up in the iShiftPitch table
    ; ----------------- Start of hack
    ; ATTENTION This is really hacked right now.  Just to get it working
    ; Pitch would be better for me, instead of shift pitch.
    ; With actual pitch, this would be two moves and a shift.
    ;shl eax, 1
ifelse(`$5', `LOD', `
    movzx edx, word ptr XpTex(iShiftPitch+eax*2)
', `
    movzx edx, word ptr XpTex(iShiftPitch)
')dnl
    add edx, 16
    movd mm2, edx
    movq mm5, MMWORD ptr Makelow16one
    pslld mm5, mm2

    ;pslld mm5, 16 ;. Use this after hack.
    ; not needed in hacked version since i add to shifted value.
    ; ----------------- End of hack

    por mm5, MMWORD ptr Makelow16one ; Make the low 16 bits of dword one
                                     ; This helps in calculating texture address.

    ; Gets U and V value into mm1 so that it can be mirrored, wrapped or
    ; clamped.  This can be done for two values in the point case
    ; or four values in the bilinear case.

ifelse(`$4', `Point', `
    ;iU00 >>= 6;
    ;iV00 >>= 6;
    psrad mm1, 6
    packssdw mm1, mm1 ; Value needs to be packed since all wrap/mirror
                      ; operations assume UV in low 32 bits.
', `
    ;INT32 iUFrac = iU00 & 0x03f;
    ;INT32 iVFrac = iV00 & 0x03f;
    ;iU00 >>= 6;
    ;iV00 >>= 6;
    movq mm2, mm1
    psrad mm1, 6
    ;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
    pand mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f

    ; Going to use only 8 bits for bi-linear so that I can do a pmullw.
    ; Currently at 6 bits so shift up by 2.
    psllw mm2, 2

    movq mm0, mm2
    ; Replicate VFrac value for bilinear
    punpckhwd mm2, mm2
    punpcklwd mm2, mm2

    ; Replicate UFrac Value for bilinear
    punpcklwd mm0, mm0
    punpcklwd mm0, mm0

    movq dword ptr VFrac, mm2
    movq dword ptr UFrac, mm0

    ;INT32 iU01 = iU00 + 1;
    ;INT32 iV01 = iV00 + 1;
    packssdw mm1, mm1 ; replicate U and V value to upper 16 bit locations
    paddw mm1, dword ptr IncHighandLow16
    ; This will make texture values be (High word to low word):
    ; iV01, iU00, iV00, iU01
    ; Need to do this to make texture look up for bilinear easier.
    ; I have to combine to get all combinations anyway.  It just
    ; happens to be better for me to have iV00, iU01 pair first.
')
    ;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
    ; put mask in mm0 and replicate to match location for wrap/mirror/clamp
    movd mm0, XpTex(uMaskU) ; Load U and V mask

ifelse(`$4', `Bilinear', `
    ; replicate mask if doing bilinear
    punpckldq mm0, mm0
')

ifelse(`$5', `NoLOD', `
    ; iLOD0 is zero so no shift needed.
' , `
    ; iLOD0 shift value left over from above.  TBD.  Put this in in mip case
    ; Could do this one before or after the unpack also.
    psrlw mm0, mm3
')

ifelse(`$2', `TexAddrWrapMirror', `
    ;INT16 iFlip;
    ; MM1 should contain 16 bit iU and iV for both texture locations
    ; End Result is MM1 value wrapped or mirrored
    ; in Bilinear Case, four values can be done
    ; iU00, iV00, iU01, iV01
    ; This code really does alot for the bilinear case and is kinda wasteful
    ; in the normal mode.

    ;iFlip1 = iU00 & pTex->iFlipMaskU;
    ;iFlip2 = iV00 & pTex->iFlipMaskV;
    ;iFlip3 = iU01 & pTex->iFlipMaskU;
    ;iFlip4 = iV01 & pTex->iFlipMaskV;
    movq mm7, mm1
    ; Point doesnt need replication
    movd mm4, XpTex(iFlipMaskU)
    ; if bilinear replicate values together, Point doesnt need this.
ifelse(`$4', `Bilinear', `
    punpckldq mm4, mm4
')
ifelse(`$5', `NoLOD', `
    ; iLOD0 is zero so no shift needed.
' , `
    psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
')
    pand mm7, mm4

    ;iFlip1 = MMX_cmpeqw(iFlip1, 0);
    ;iFlip2 = MMX_cmpeqw(iFlip2, 0);
    ;iFlip3 = MMX_cmpeqw(iFlip3, 0);
    ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
    pcmpeqw mm7, MMWORD PTR Zero

    ;iFlip1 = uMaskU0 & ~ iFlip1;
    ;iFlip2 = uMaskV0 & ~ iFlip2;
    ;iFlip3 = uMaskU0 & ~ iFlip3;
    ;iFlip4 = uMaskV0 & ~ iFlip4;
    pandn mm7, mm0

    ;iU00 &= uMaskU0;
    ;iV00 &= uMaskV0;
    ;iU01 &= uMaskU0;
    ;iV01 &= uMaskV0;
    pand mm1, mm0

    ;iU00 ^= iFlip1;
    ;iV00 ^= iFlip2;
    ;iU01 ^= iFlip3;
    ;iV01 ^= iFlip4;
    pxor mm1, mm7

    ; Result in mm4 now since TexAddrAll ends up that way.
    ; Still need to look at register useage more.
    movq mm4, mm1
') dnl

ifelse(`$2', `TexAddrAll', `
    ;INT16 iFlip, iClamp1, iClamp2, iClampMinT, iClampMaxT;
    ;INT16 iUoWAdj = (INT16)(pS->iUoW`'d_TexNum >> 12); // adjust to match W
    ;INT16 iVoWAdj = (INT16)(pS->iVoW`'d_TexNum >> 12);
    ;movq mm6, XpS(UVoW + iTex * SIZEOF_UV_UNION)

    ;movq mm6, MMWORD PTR Zero
    pxor mm6, mm6

    ; TBD Data in SPANTEX needs to be rearange to make life simpler.
    ; I have rearranged some of it, but there still needs to be some
    ; fixes to it.

    ;iFlip1 = iU00 & pTex->iFlipMaskU;
    ;iFlip2 = iV00 & pTex->iFlipMaskV;
    ;iFlip3 = iU01 & pTex->iFlipMaskU;
    ;iFlip4 = iV01 & pTex->iFlipMaskV;
    movq mm7, mm1
    movd mm4, XpTex(iFlipMaskU) ; This should copy U and V mask at the same time.

ifelse(`$4', `Bilinear', `
    dnl Only replicate if U and V if doing bilinear
    punpckldq mm4, mm4 ; copy UV
')
ifelse(`$5', `NoLOD', `
    ; iLOD0 is zero so no shift needed.
' , `
    psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
')
    pand mm7, mm4

    ;iFlip1 = MMX_cmpeqw(iFlip1, 0);
    ;iFlip2 = MMX_cmpeqw(iFlip2, 0);
    ;iFlip3 = MMX_cmpeqw(iFlip3, 0);
    ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
    pcmpeqw mm7, MMWORD PTR Zero

    ;iFlip1 = uMaskU0 &~ iFlip1;
    ;iFlip2 = uMaskV0 &~ iFlip2;
    ;iFlip3 = uMaskU0 &~ iFlip3;
    ;iFlip4 = uMaskV0 &~ iFlip4;
    pandn mm7, mm0

    ;iU00 &= uMaskU0;
    ;iV00 &= uMaskV0;
    ;iU01 &= uMaskU0;
    ;iV01 &= uMaskV0;
    pand mm1, mm0

    ;iU00 ^= iFlip1;
    ;iV00 ^= iFlip2;
    ;iU01 ^= iFlip3;
    ;iV01 ^= iFlip4;
    pxor mm1, mm7

    ;iClamp11 = MMX_cmpgtw(0, iUoWAdj);
    ;iClamp12 = MMX_cmpgtw(0, iVoWAdj);
    ; mm6 is zero here, so each dword becomes all ones where UoW or VoW is negative.
    mov edx, iTex
    pcmpgtd mm6, XpS(UVoW + edx * SIZEOF_UV_UNION)
    packssdw mm6, mm6

    ;iClamp21 = MMX_cmpgtw(iOoWAdj, iUoWAdj);
    ;iClamp22 = MMX_cmpgtw(iOoWAdj, iVoWAdj);
    movd mm7, XpS(iOoW)
    punpckldq mm7, mm7 ; Make a copy of OoW to compare both UoW and VoW.
    psrld mm7, 11 ; Make OoWs Precision Match UoWs.
    pcmpgtd mm7, XpS(UVoW + edx * SIZEOF_UV_UNION)
    packssdw mm7, mm7

    ;iClampMinT1 = pTex->iClampMinU & iClamp11;
    ;iClampMinT2 = pTex->iClampMinV & iClamp12;
    ;iClampMinT3 = pTex->iClampMinU & iClamp13;
    ;iClampMinT4 = pTex->iClampMinV & iClamp14;
    movd mm0, XpTex(iClampMinU)
ifelse(`$4', `Bilinear', `
    punpckldq mm0, mm0
')
    pand mm0, mm6

    ; Save clamp2 because pandn will destroy value.
    movq mm4, mm7

    ;iClampMaxT1 = pTex->iClampMaxU &~ iClamp21;
    ;iClampMaxT2 = pTex->iClampMaxV &~ iClamp22;
    ;iClampMaxT3 = pTex->iClampMaxU &~ iClamp23;
    ;iClampMaxT4 = pTex->iClampMaxV &~ iClamp24;
    movd mm2, XpTex(iClampMaxU)
ifelse(`$4', `Bilinear', `
    punpckldq mm2, mm2
')
ifelse(`$5', `NoLOD', `
    ; iLOD0 is zero so no shift needed.
' , `
    psraw mm2, mm3 ; Shifts clamp max to correct bit location
')
    pandn mm7, mm2 ; Since iClamp2 is already negated, I can just do an AND.

    ;iClamp21 &= ~iClamp11;
    ;iClamp22 &= ~iClamp12;
    ;iClamp23 &= ~iClamp13;
    ;iClamp24 &= ~iClamp14;
    pandn mm6, mm4

    ;iClamp21 = pTex->iClampEnU &~ iClamp21;
    ;iClamp22 = pTex->iClampEnU &~ iClamp22;
    ;iClamp23 = pTex->iClampEnU &~ iClamp23;
    ;iClamp24 = pTex->iClampEnU &~ iClamp24;
    movd mm2, XpTex(iClampEnU)
ifelse(`$4', `Bilinear', `
    punpckldq mm2, mm2
')
    pandn mm6, mm2

    ;iU00 &= ~iClamp21;
    ;iV00 &= ~iClamp22;
    ;iU01 &= ~iClamp23;
    ;iV01 &= ~iClamp24;
    pandn mm6, mm1

    ;iU00 |= iClampMinT1;
    ;iV00 |= iClampMinT2;
    ;iU01 |= iClampMinT3;
    ;iV01 |= iClampMinT4;
    por mm6, mm0

    ;iU00 |= iClampMaxT1;
    ;iV00 |= iClampMaxT2;
    ;iU01 |= iClampMaxT3;
    ;iV01 |= iClampMaxT4;
    por mm6, mm7
    movq mm4, mm6
') dnl

    ; Making other two cases for texture addressing has to be simplier than
    ; this and not use so many registers.  Puts U1 V0 U0 V1 into mm3.
    ; TBD Make this better.
    ; values are still stored as iV01, iU00, iV00, iU01

ifelse(`$4', `Bilinear', `
    movq mm2, mm4
    movq mm3, mm4
') dnl Bilinear

dnl ifelse(`$2', `TexAddrAll', `
    movq mm0, mm4
dnl ') dnl border code

    pmaddwd mm4, mm5 ; Throw in first address calculation.
                     ; Just to get it started.  Calculate
                     ; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0

ifelse(`$4', `Bilinear', `
    ; values are being changed to iV01, iU01, iV00, iU00
    ; seven instructions for this seems excessive.
    pand mm2, MMWORD ptr MaskKeepUValues
    pand mm3, MMWORD ptr MaskKeepVValues
    movq mm1, mm2
    psllq mm2, 32
    psrlq mm1, 32
    por mm3, mm2
    por mm3, mm1
') dnl Bilinear

    ; From here until mov edi is code that is needed for border.
    ; all sign bits are stored in bytes so that border code can tell if uv went below zero.
dnl ifelse(`$2', `TexAddrAll', `
ifelse(`$4', `Point', `
    ; Point needs to be in same format as bilinear for border
    packsswb mm0, mm0
') dnl point

ifelse(`$4', `Bilinear', `
    ; mm0 = iV01, iU00, iV00, iU01
    ; mm3 = iV01, iU01, iV00, iU00
    ; Need to rearrange values to be like so v1 u0 v1 u1 v0 u0 v0 u1 in bytes
    ; This is really bad.  Just doing whatever to get it to work.
    movq mm1, mm0
    punpckldq mm1, mm3 ; This will make mm1 = v0 u0 v0 u1
    movq mm2, mm3
    punpckhdq mm2, mm0 ; This will make mm2 = v1 u0 v1 u1
    packsswb mm1, mm2
    movq mm0, mm1
') dnl Bilinear
dnl ') dnl TexAddrAll

ifelse(`$4', `Bilinear', `
    pmaddwd mm3, mm5 ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
') dnl Bilinear

dnl ; Load pTex->pBits[iLOD0] into esi.  It will be needed.
dnl ; Convient that eax is still around as iLOD0.  TBD make sure eax positive.

ifelse(`$5', `NoLOD', `
    mov edi, XpTex(pBits)
',`
    mov edi, XpTex(pBits+eax*4)
')dnl
    ; was esi.  Cant change to esi because it is the pointer to pTex
    ; which is used by Border and ColorKey.  Use edi for now and
    ; call routines through memory.  Figure out if this is bad.

    ; load the read texture routine address into a register early
    ;mov edi, XpCtx(pfnTexRead + $1*SIZEOF_PFNTEXREAD)

ifelse(`$4', `Bilinear', `
    ; iV0 iU1 address should be done by now.
    movd eax, mm4

    ;UINT32 uTex00 = pCtx->pfnTexRead[$1](iU00, iV00, pTex->iShiftU,
    ;           pTex->pBits[iLOD0], &pCtx->Texture[$1]);
    ; Combine U and V values before making call.
    ;call edi

    mov edx, iTex
    call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)

    movd eax, mm3
    movq mm7, mm1 ; Put TColor[iU0, uV0] in mm7

    ;UINT32 uTex10 = pCtx->pfnTexRead[$1](iU01, iV00, pTex->iShiftU,
    ;           pTex->pBits[iLOD0], &pCtx->Texture[$1]);
    ;call edi
    mov edx, iTex
    call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)

    ; First row: mm7 = second texel * 256 + (first - second) * UFrac
    psrlq mm3, 32
    psubw mm7, mm1
    psllw mm1, 8
    pmullw mm7, dword ptr UFrac
    paddw mm7, mm1 ; Should I copy mm1 to another variable and do shift/add later?
    movd eax, mm3

    ;UINT32 uTex01 = pCtx->pfnTexRead[$1](iU00, iV01, pTex->iShiftU,
    ;           pTex->pBits[iLOD0], &pCtx->Texture[$1]);
    ;call edi
    mov edx, iTex
    call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)

    psrlq mm4, 32
    movq mm6, mm1
    movd eax, mm4
    ;UINT32 uTex11 = pCtx->pfnTexRead[$1](iU01, iV01, pTex->iShiftU,
    ;           pTex->pBits[iLOD0], &pCtx->Texture[$1]);
    ;call edi
    mov edx, iTex
    call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)

    ;TexFiltBilinear(&pCtx->SI.TexCol[$1], iUFrac, iVFrac, uTex00, uTex10, uTex01, uTex11);
    ; The amount of shifting instructions for this makes the other approach
    ; look pretty good.
    ; Second row is blended the same way into mm6, then the two rows are
    ; blended with VFrac into mm4.
    psubw mm6, mm1
    psllw mm1, 8
    pmullw mm6, dword ptr UFrac
    ; TBD explain this code better.
    movq mm4, mm7
    paddw mm6, mm1
    psrlw mm6, 8
    psrlw mm7, 8
    psubw mm6, mm7
    pmullw mm6, dword ptr VFrac
    paddw mm4, mm6
    psrlw mm4, 8

    ; TBD shouldnt have to pack and then unpack later.  Should keep in a register
    packuswb mm4, mm4
    mov edx, iTex
    movd XpCtxSI(TexCol+edx*4), mm4

') dnl

ifelse(`$4', `Point', `

    ; iV0 iU1 address should be done by now.
    movd eax, mm4

    ;pCtx->SI.TexCol[$1] = pCtx->pfnTexRead[$1](iU00, iV00, pTex->iShiftU,
    ;           pTex->pBits[iLOD0], &pCtx->Texture[$1]);
    ;call edi
    mov edx, iTex
    call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)

    ; TBD Currently have to pack and then unpack later.  Should be able
    ; to leave the value in some register for a while.  I would think.
    packuswb mm1, mm1
    mov edx, iTex
    movd XpCtxSI(TexCol+edx*4), mm1
') dnl

dnl only do update code in non-monolithic case.  Monolithic code updates are done
dnl by tstfail routine.

    ; The update macros index by edi, so it temporarily holds iTex.
    push edi
    mov edi, iTex
    d_UpdateUoWandVoW()
    pop edi

ifelse(`$5', `LOD', `
    ; LOD is stepped only when iTex is zero.
    cmp iTex, 0
    jne SkipLOD$2$3$4$5
    d_UpdateLOD()
SkipLOD$2$3$4$5:
')

ifelse(`$3', `Persp', `

    ; OoW and W are stepped only when iTex is zero.
    ; Any other texture reloads the stored W instead.
    cmp iTex, 0
    jne TexStoreW$2$3$4$5
    d_UpdateOoW()

    ;pS->iW = 0x00800000/(pS->iOoW>>16); // 9.23/1.15 = 8.8
    d_WDivide()
    jmp Tex$2$3$4$5

TexStoreW$2$3$4$5:
    ; In Texaddr1, W is calculated and result is in esi. I need to get the W value back into esi for the multiply.
    mov esi, XpS(iW)

Tex$2$3$4$5:
    push edi
    mov edi, iTex
    d_UoWVoWTimesW()
    pop edi
', `

    push edi
    mov edi, iTex
    d_UpdateNonPersp()
    pop edi

')
    ; load the next bead address into a register early.  Not early anymore
    ; since so much regular non-mmx code being done for WDIV
    ; mov eax, XpCtx(pfnTex`'d_TexNum`'AddrEnd)

    ; pCtx->pfnTex`'d_TexNum`'AddrEnd(pCtx, pP, pS);
    ; jmp eax

    ; We now need to return
    ret

')')
dnl
dnl
dnl d_TexAddrHdr
dnl
dnl Generates headers with the same format as d_TexAddr
dnl
define(`d_TexAddrHdr', `
void MMX_TexAddr_$2_$3_$4_$5(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
                 PD3DI_RASTSPAN pS, INT32 iTex);')dnl
dnl
|