You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1148 lines
39 KiB
1148 lines
39 KiB
;
|
|
; WARNING WARNING WARNING
|
|
; This asm file generated from mas file.
|
|
; EDIT THE MAS FILE.
|
|
; I warned you.
|
|
; WARNING WARNING WARNING
|
|
;
|
|
;-----------------------------------------------------------------------------
|
|
|
|
include(`m4hdr.mh')dnl
|
|
include(`cvars.mh')dnl
|
|
|
|
INCLUDE iammx.inc
|
|
INCLUDE offs_acp.inc
|
|
|
|
.586
|
|
.model flat
|
|
.data

;-----------------------------------------------------------------------------
; Module data for the MMX texture address and filter code below.
; Word layouts in these notes are listed from the high word to the low word.
; Exports are declared up front; storage order is unchanged.
;-----------------------------------------------------------------------------
PUBLIC IncHighandLow16, UFracVFracMask, UV32to15Mask, Makelow16one
PUBLIC MaskKeepUValues, MaskKeepVValues, UFrac, VFrac
PUBLIC Val0x000a000a, Val0xffff, Val0x0000002000000020
PUBLIC Val0x0000ffff0000ffff, Zero, memD3DTFG_POINT
PUBLIC GiveUp, LastW

; Current Texture. Module private, read back by the code below whenever the
; texture number is needed again.
iTex                    dd ?

; One in the top and bottom words. A paddw with this turns packed V U V U
; into V+1 U V U+1 for the bilinear taps.
IncHighandLow16         dq 0001000000000001h

; Keeps the low 6 bits of each dword.
; Used to be 00000fff00000fffh. Change to 6 bits.
UFracVFracMask          dq 0000003f0000003fh

; Original author note: ffff or 7fff???? dunno.
UV32to15Mask            dq 0000ffff0000ffffh

; One in the low word of each dword. Seed for the pmaddwd multiplier: a copy
; is shifted left per dword by the pitch count plus 16 and this is ORed back.
Makelow16one            dq 0000000100000001h

; Word masks: the U mask keeps words 2 and 0, the V mask keeps words 3 and 1.
MaskKeepUValues         dq 00000ffff0000ffffh
MaskKeepVValues         dq 0ffff0000ffff0000h

; Scratch slots. The bilinear path stores each fraction replicated to four
; words here and reads it back as a pmullw operand.
UFrac                   dq ?
VFrac                   dq ?

; Ten in words 1 and 0. The code that loads it notes TEX_FINAL_SHIFT - 6 = 10.
Val0x000a000a           dq 000000000000a000ah

; Keeps only the lowest word of a qword.
Val0xffff               dq 0ffffh

; 20h in each dword, subtracted from the shifted U and V before the 6 bit
; fraction is split off in the bilinear path.
Val0x0000002000000020   dq 0000002000000020h

; Keeps the low word of each dword.
Val0x0000ffff0000ffff   dq 0000ffff0000ffffh

; All zero qword, used as a compare operand.
Zero                    dq 0

; Memory copy of the point filter constant.
memD3DTFG_POINT         dq D3DTFG_POINT

; Used as counter inside the SpecialW loop (original author note).
GiveUp                  dd ?
LastW                   dd ?
|
|
|
|
|
|
.code
|
|
|
|
include(`texaddra.mh')dnl
|
|
; Instantiates the generic address bead from texaddra.mh over four
; placeholder axes. As written below, AA takes TexAddrWrapMirror and
; TexAddrAll, BB takes NoPersp and Persp, and DD takes NoLOD and LOD.
; CC takes Point and Bilinear, with MaybeBilinear added by the conditional
; on the CC line when DD is not NoLOD.
; NOTE(review): the repeat macro is defined outside this file. Presumably it
; emits its first argument once per listed value with the placeholder
; substituted, and the CC conditional is evaluated after the DD
; substitution - confirm against the m4 header.
; Keep m4 macro names out of these notes; m4 expands them even in comments.
d_RepStr(`d_RepStr(`d_RepStr(`d_RepStr(`d_TexAddr(0, AA, BB, CC, DD, NotMonolithic)',
|
|
`AA', `TexAddrWrapMirror', `TexAddrAll')',
|
|
`BB', `NoPersp', `Persp')',
|
|
`CC', ifelse(DD, NoLOD, `Point, Bilinear', `Point, Bilinear, MaybeBilinear'))',
|
|
`DD', `NoLOD', `LOD')
|
|
|
|
;// All singing all dancing mip mapping address calculation and filtering.
|
|
;// No texture filtering code need be called after this bead.
|
|
;void Tex1AddrFilt_All_Mip(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
|
|
; PD3DI_RASTSPAN pS, INT32 iTex)
|
|
;{
|
|
PUBLIC _MMX_TexAddr_Filt_All_Mip
|
|
_MMX_TexAddr_Filt_All_Mip:
|
|
|
|
;PD3DI_SPANTEX pTex = &pCtx->pTexture[0];
|
|
mov esi, XpCtx(pTexture + eax*SIZEOF_PSPANTEX)
|
|
|
|
;INT16 iLOD0 = min(max(pS->iLOD >> 11, 0), pTex->cLOD);
|
|
;INT32 iU00 = pCtx->SI.iU`'d_TexNum<<(pTex->iShiftU - iLOD0);
|
|
;INT32 iV00 = pCtx->SI.iV`'d_TexNum<<(pTex->iShiftV - iLOD0);
|
|
movq mm1, XpCtxSI(TexUV + eax * SIZEOF_UV_UNION)
|
|
movsx eax, word ptr XpS(iLOD)
|
|
sar eax, 11
|
|
|
|
mov edx, eax
|
|
sar edx, 31
|
|
not edx
|
|
;xor edx, 0ffffffffh
|
|
and eax, edx
|
|
define(`d_MaxCLODcnt', eval(d_MaxCLODcnt+1))dnl
|
|
cmp eax, XpTex(cLOD)
|
|
jb NotMax`'d_MaxCLODcnt`'
|
|
mov eax, XpTex(cLOD)
|
|
|
|
NotMax`'d_MaxCLODcnt`':
|
|
|
|
; eax is used below, so we will keep iLOD0 in mm3 and put it into eax later.
|
|
movd mm3, eax
|
|
|
|
; ----------------------------------------
|
|
; Doing the UV calculation a little more accurately.
|
|
; Exactly like C code.
|
|
|
|
; I shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
|
|
; (TEX_FINAL_SHIFT - iShiftU0 - 6). iShiftU0 = pTex->iShiftU - iLOD0
|
|
; (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
|
|
; (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)
|
|
|
|
; COMMENT1**
|
|
; If textures have a max of 1024 then shiftU0 would be at most 10 which would
|
|
; make (TEX_FINAL_SHIFT - iShiftU - 6) at least zero. This is why I chose 6.
|
|
; It will also give bi-linear 6 bits of precision I think it was said that
|
|
; only five was needed.
|
|
;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
|
|
;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
|
|
movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
|
|
punpcklwd mm3, mm3 ; Make two copys of iLOD to subtract U and V
|
|
movd mm4, XpTex(iShiftU)
|
|
psubw mm4, mm3
|
|
psubw mm5, mm4
|
|
movq mm4, mm5
|
|
pand mm5, MMWORD PTR Val0xffff
|
|
pand mm3, MMWORD PTR Val0xffff ; Make iLOD back to only one copy
|
|
psrld mm4, 16
|
|
|
|
mov eax, iTex
|
|
movd mm1, XpCtxSI(TexUV + eax * SIZEOF_UV_UNION)
|
|
psrad mm1, mm5
|
|
movd mm2, XpCtxSI(TexUV + eax * SIZEOF_UV_UNION + 4)
|
|
psrad mm2, mm4
|
|
|
|
punpckldq mm1, mm2
|
|
|
|
;// select filter based on whether we are minifying or magnifying
|
|
;D3DTEXTUREMINFILTER uFilter;
|
|
;if (pS->iLOD < 0)
|
|
;{
|
|
; // depends on the first two entries (POINT and LINEAR)
|
|
; // being the same for min and mag
|
|
; uFilter = (D3DTEXTUREMINFILTER)pTex->uMagFilter;
|
|
;}
|
|
;else
|
|
;{
|
|
; uFilter = pTex->uMinFilter;
|
|
;}
|
|
; Use edx mask from above to determine if iLOD is less than 0.
|
|
mov eax, XpTex(uMinFilter)
|
|
and eax, edx
|
|
not edx
|
|
and edx, XpTex(uMagFilter)
|
|
or eax, edx
|
|
|
|
;if (uFilter == D3DTFG_LINEAR)
|
|
;{
|
|
cmp eax, D3DTFG_LINEAR
|
|
jne NotLinear
|
|
|
|
; Get LOD from mm3 and put in eax.
|
|
movd eax, mm3
|
|
|
|
; Save this off because there is no way to keep it in a register until next time.
|
|
mov edx, iTex
|
|
movd XpCtxSI(TexCol+edx*4), mm3
|
|
|
|
; This helps in calculating texture address.
|
|
movzx edx, word ptr XpTex(iShiftPitch+eax*2)
|
|
add edx, 16
|
|
movd mm2, edx
|
|
movq mm5, MMWORD ptr Makelow16one
|
|
pslld mm5, mm2
|
|
|
|
por mm5, MMWORD ptr Makelow16one
|
|
|
|
;INT32 iHalf = 1<<(TEX_FINAL_SHIFT - iShiftU0 - 1);
|
|
;INT32 iUAlign = pCtx->SI.iU1 - iHalf;
|
|
;iHalf = 1<<(TEX_FINAL_SHIFT - iShiftV0 - 1);
|
|
;INT32 iVAlign = pCtx->SI.iV1 - iHalf;
|
|
;iU00 = iUAlign >> (TEX_FINAL_SHIFT - iShiftU0);
|
|
;iV00 = iVAlign >> (TEX_FINAL_SHIFT - iShiftV0);
|
|
;iUFrac0 = (iUAlign<<iShiftU0) & TEX_FINAL_FRAC_MASK;
|
|
;iVFrac0 = (iVAlign<<iShiftV0) & TEX_FINAL_FRAC_MASK;
|
|
psubd mm1, MMWORD PTR Val0x0000002000000020
|
|
|
|
;INT32 iUFrac = iU00 & 0x03f;
|
|
;INT32 iVFrac = iV00 & 0x03f;
|
|
;iU00 >>= 6;
|
|
;iV00 >>= 6;
|
|
movq mm2, mm1
|
|
psrad mm1, 6
|
|
;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
|
|
pand mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f
|
|
|
|
; Going to use only 8 bits for bi-linear so that I can do a pmullw.
|
|
; Currently at 6 bits so shift up by 2.
|
|
psllw mm2, 2
|
|
|
|
movq mm0, mm2
|
|
; Replicate VFrac value for bilinear
|
|
punpckhwd mm2, mm2
|
|
punpcklwd mm2, mm2
|
|
|
|
; Replicate UFrac Value for bilinear
|
|
punpcklwd mm0, mm0
|
|
punpcklwd mm0, mm0
|
|
|
|
movq dword ptr VFrac, mm2
|
|
movq dword ptr UFrac, mm0
|
|
|
|
;INT32 iU01 = iU00 + 1;
|
|
;INT32 iV01 = iV00 + 1;
|
|
packssdw mm1, mm1 ; replicate U and V value to upper 16 bit locations
|
|
paddw mm1, dword ptr IncHighandLow16
|
|
; This will make texture values be (High word to low word):
|
|
; iV01, iU00, iV00, iU01
|
|
; Need to do this to make texture look up for bilinear easier.
|
|
; I have to combine to get all combinations anyway. It just
|
|
; happens to be better for me to have iV00, iU01 pair first.
|
|
|
|
|
|
;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
|
|
; put mask in mm0 (shifted right by the LOD held in mm3) and replicate to match location for wrap/mirror/clamp
|
|
movd mm0, XpTex(uMaskU) ; Load U and V mask
|
|
|
|
; replicate mask if doing bilinear
|
|
punpckldq mm0, mm0
|
|
|
|
psrlw mm0, mm3
|
|
|
|
|
|
;INT16 iFlip, iClamp1, iClamp2, iClampMinT, iClampMaxT;
|
|
;INT16 iUoWAdj = (INT16)(pS->iUoW`'d_TexNum >> 12); // adjust to match W
|
|
;INT16 iVoWAdj = (INT16)(pS->iVoW`'d_TexNum >> 12);
|
|
;movq mm6, XpS(iUoW`'d_TexNum)
|
|
|
|
;movq mm6, MMWORD PTR Zero
|
|
pxor mm6, mm6
|
|
|
|
; TBD Data in SPANTEX needs to be rearange to make life simpler.
|
|
; I have rearranged some of it, but there still needs to be some
|
|
; fixes to it.
|
|
|
|
;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
|
|
movq mm7, mm1
|
|
movd mm4, XpTex(iFlipMaskU) ; This should copy U and V mask at the same time.
|
|
punpckldq mm4, mm4 ; copy UV
|
|
psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
|
|
|
|
pand mm7, mm4
|
|
|
|
;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
|
|
pcmpeqw mm7, MMWORD PTR Zero
|
|
|
|
;iFlip1 = uMaskU0 &~ iFlip1; ;iFlip2 = uMaskV0 &~ iFlip2; ;iFlip3 = uMaskU0 &~ iFlip3; ;iFlip4 = uMaskV0 &~ iFlip4;
|
|
pandn mm7, mm0
|
|
|
|
;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
|
|
pand mm1, mm0
|
|
|
|
;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
|
|
pxor mm1, mm7
|
|
|
|
;iClamp11 = MMX_cmpgtw(0, iUoWAdj); ;iClamp12 = MMX_cmpgtw(0, iVoWAdj);
|
|
mov edi, iTex
|
|
pcmpgtd mm6, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm6, mm6
|
|
|
|
;iClamp21 = MMX_cmpgtw(iOoWAdj, iUoWAdj); ;iClamp22 = MMX_cmpgtw(iOoWAdj, iVoWAdj);
|
|
movd mm7, XpS(iOoW)
|
|
punpckldq mm7, mm7 ; Make a copy of OoW to compare both UoW and VoW.
|
|
psrld mm7, 11 ; Make OoWs Precision Match UoWs.
|
|
pcmpgtd mm7, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm7, mm7
|
|
|
|
;iClampMinT1 = pTex->iClampMinU & iClamp11; ;iClampMinT2 = pTex->iClampMinV & iClamp12; ;iClampMinT3 = pTex->iClampMinU & iClamp13; ;iClampMinT4 = pTex->iClampMinV & iClamp14;
|
|
movd mm0, XpTex(iClampMinU)
|
|
punpckldq mm0, mm0
|
|
pand mm0, mm6
|
|
|
|
; Save clamp2 because pandn will destroy the value.
|
|
movq mm4, mm7
|
|
|
|
;iClampMaxT1 = pTex->iClampMaxU &~ iClamp21; ;iClampMaxT2 = pTex->iClampMaxV &~ iClamp22; ;iClampMaxT3 = pTex->iClampMaxU &~ iClamp23; ;iClampMaxT4 = pTex->iClampMaxV &~ iClamp24;
|
|
movd mm2, XpTex(iClampMaxU)
|
|
|
|
punpckldq mm2, mm2
|
|
psraw mm2, mm3 ; Shifts clamp max to correct bit location
|
|
|
|
pandn mm7, mm2 ; Since iClamp2 is already negated, I can just do an AND.
|
|
|
|
;iClamp21 &= ~iClamp11; ;iClamp22 &= ~iClamp12; ;iClamp23 &= ~iClamp13; ;iClamp24 &= ~iClamp14;
|
|
pandn mm6, mm4
|
|
|
|
;iClamp21 = pTex->iClampEnU &~ iClamp21; ;iClamp22 = pTex->iClampEnU &~ iClamp22; ;iClamp23 = pTex->iClampEnU &~ iClamp23; ;iClamp24 = pTex->iClampEnU &~ iClamp24;
|
|
movd mm2, XpTex(iClampEnU)
|
|
|
|
punpckldq mm2, mm2
|
|
|
|
pandn mm6, mm2
|
|
|
|
;iU00 &= ~iClamp21; ;iV00 &= ~iClamp22; ;iU01 &= ~iClamp23; ;iV01 &= ~iClamp24;
|
|
pandn mm6, mm1
|
|
|
|
;iU00 |= iClampMinT1; ;iV00 |= iClampMinT2; ;iU01 |= iClampMinT3; ;iV01 |= iClampMinT4;
|
|
por mm6, mm0
|
|
|
|
;iU00 |= iClampMaxT1; ;iV00 |= iClampMaxT2; ;iU01 |= iClampMaxT3; ;iV01 |= iClampMaxT4;
|
|
por mm6, mm7
|
|
movq mm4, mm6
|
|
|
|
|
|
; Making other two cases for texture addressing has to be simpler than
|
|
; this and not use so many registers. Puts U1 V0 U0 V1 into mm3.
|
|
; TBD Make this better.
|
|
; values are still stored as iV01, iU00, iV00, iU01
|
|
movq mm2, mm4
|
|
movq mm3, mm4
|
|
|
|
movq mm0, mm4
|
|
|
|
|
|
pmaddwd mm4, mm5 ; Throw in first address calculation.
|
|
; Just to get it started. Calculate
|
|
; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
|
|
|
|
; values are being changed to iV01, iU01, iV00, iU00
|
|
; seven instructions for this seems excessive.
|
|
pand mm2, MMWORD ptr MaskKeepUValues
|
|
pand mm3, MMWORD ptr MaskKeepVValues
|
|
movq mm1, mm2
|
|
psllq mm2, 32
|
|
psrlq mm1, 32
|
|
por mm3, mm2
|
|
por mm3, mm1
|
|
|
|
; From here until mov edi is code that is needed for border.
|
|
; all sign bits are stored in bytes so that border code can tell if uv went below zero.
|
|
|
|
; mm0 = iV01, iU00, iV00, iU01
|
|
; mm3 = iV01, iU01, iV00, iU00
|
|
; Need to rearrange values to be like so v1 u0 v1 u1 v0 u0 v0 u1 in bytes
|
|
; This is really bad. Just doing whatever to get it to work.
|
|
movq mm1, mm0
|
|
punpckldq mm1, mm3 ; This will make mm1 = v0 u0 v0 u1
|
|
movq mm2, mm3
|
|
punpckhdq mm2, mm0 ; This will make mm2 = v1 u0 v1 u1
|
|
packsswb mm1, mm2
|
|
movq mm0, mm1
|
|
|
|
pmaddwd mm3, mm5 ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
|
|
|
|
mov edi, XpTex(pBits+eax*4)
|
|
|
|
; was esi. Cant change to esi because it is the pointer to pTex
|
|
; which is used by Border and ColorKey. Use edi for now and
|
|
; call routines through memory. Figure out if this is bad.
|
|
|
|
; load the read texture routine address into a register early
|
|
;mov edi, XpCtx(pfnTexRead)
|
|
|
|
; iV0 iU1 address should be done by now.
|
|
movd eax, mm4
|
|
|
|
;UINT32 uTex00 = pCtx->pfnTexRead[0](iU00, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
; Combine U and V values before making call.
|
|
;call edi
|
|
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead+edx*4)
|
|
|
|
movd eax, mm3
|
|
movq mm7, mm1 ; Put TColor[iU0, uV0] in mm7
|
|
|
|
;UINT32 uTex10 = pCtx->pfnTexRead[0](iU01, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead+edx*4)
|
|
|
|
psrlq mm3, 32
|
|
psubw mm7, mm1
|
|
psllw mm1, 8
|
|
pmullw mm7, dword ptr UFrac
|
|
paddw mm7, mm1 ; Should I copy mm1 to another variable and do shift/add later?
|
|
movd eax, mm3
|
|
|
|
;UINT32 uTex01 = pCtx->pfnTexRead[0](iU00, iV01, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead+edx*4)
|
|
|
|
psrlq mm4, 32
|
|
movq mm6, mm1
|
|
movd eax, mm4
|
|
;UINT32 uTex11 = pCtx->pfnTexRead[0](iU01, iV01, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead+edx*4)
|
|
|
|
;TexFiltBilinear(&pCtx->SI.TexCol[0], iUFrac, iVFrac, uTex00, uTex10, uTex01, uTex11);
|
|
; The amount of shifting instructions for this makes the other approach
|
|
; look pretty good.
|
|
psubw mm6, mm1
|
|
psllw mm1, 8
|
|
pmullw mm6, dword ptr UFrac ; TBD explain this code better.
|
|
movq mm4, mm7
|
|
paddw mm6, mm1
|
|
psrlw mm6, 8
|
|
psrlw mm7, 8
|
|
psubw mm6, mm7
|
|
pmullw mm6, dword ptr VFrac
|
|
paddw mm4, mm6
|
|
psrlw mm4, 8
|
|
|
|
; TBD shouldnt have to pack and then unpack later. Should keep in a register
|
|
;packuswb mm4, mm4
|
|
;movd XpCtxSI(TexCol+edi*4), mm4
|
|
movq MMWORD PTR XpCtxSI(uBB), mm4
|
|
|
|
;----Calc second mip level pixel------------------------------------------------------------------------------
|
|
;INT16 iLOD1 = (INT16)(min(iLOD0+(pS->iLOD > 0), pTex->cLOD));
|
|
|
|
;****** Need to save iLOD0 from above somehow.
|
|
; Saving it in second texture color for now.
|
|
mov edx, iTex
|
|
movd mm3, XpCtxSI(TexCol+edx * 4)
|
|
|
|
pxor mm5, mm5
|
|
movd mm2, XpS(iLOD)
|
|
pcmpgtw mm2, mm5
|
|
psubw mm3, mm2
|
|
|
|
movd mm1, XpTex(cLOD)
|
|
movq mm2, mm3
|
|
pcmpgtw mm3, mm1
|
|
|
|
pand mm1, mm3
|
|
pandn mm3, mm2
|
|
por mm3, mm1
|
|
|
|
pand mm3, MMWORD PTR Val0xffff ; Get rid of any data in the high word.
|
|
|
|
; Get LOD from mm3 and put in eax.
|
|
movd eax, mm3
|
|
|
|
mov edx, iTex
|
|
movq mm1, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION)
|
|
|
|
;INT16 iShiftU1 = pTex->iShiftU - iLOD1;
|
|
;INT16 iShiftV1 = pTex->iShiftV - iLOD1;
|
|
movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
|
|
punpcklwd mm3, mm3 ; Make two copys of iLOD to subtract U and V
|
|
movd mm4, XpTex(iShiftU)
|
|
psubw mm4, mm3
|
|
psubw mm5, mm4
|
|
movq mm4, mm5
|
|
pand mm5, MMWORD PTR Val0xffff
|
|
pand mm3, MMWORD PTR Val0xffff ; Make iLOD back to only one copy
|
|
psrld mm4, 16
|
|
|
|
movd mm1, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION)
|
|
psrad mm1, mm5
|
|
movd mm2, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION + 4)
|
|
psrad mm2, mm4
|
|
|
|
punpckldq mm1, mm2
|
|
|
|
|
|
; This helps in calculating texture address.
|
|
movzx edx, word ptr XpTex(iShiftPitch+eax*2)
|
|
add edx, 16
|
|
movd mm2, edx
|
|
movq mm5, MMWORD ptr Makelow16one
|
|
pslld mm5, mm2
|
|
|
|
por mm5, MMWORD ptr Makelow16one
|
|
|
|
;INT32 iHalf = 1<<(TEX_FINAL_SHIFT - iShiftU1 - 1);
|
|
;INT32 iUAlign = pCtx->SI.iU1 - iHalf;
|
|
;iHalf = 1<<(TEX_FINAL_SHIFT - iShiftV1 - 1);
|
|
;INT32 iVAlign = pCtx->SI.iV1 - iHalf;
|
|
;iU10 = iUAlign >> (TEX_FINAL_SHIFT - iShiftU0);
|
|
;iV10 = iVAlign >> (TEX_FINAL_SHIFT - iShiftV0);
|
|
;iUFrac0 = (iUAlign<<iShiftU0) & TEX_FINAL_FRAC_MASK;
|
|
;iVFrac0 = (iVAlign<<iShiftV0) & TEX_FINAL_FRAC_MASK;
|
|
psubd mm1, MMWORD PTR Val0x0000002000000020
|
|
|
|
;INT32 iUFrac = iU00 & 0x03f;
|
|
;INT32 iVFrac = iV00 & 0x03f;
|
|
;iU00 >>= 6;
|
|
;iV00 >>= 6;
|
|
movq mm2, mm1
|
|
psrad mm1, 6
|
|
;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
|
|
pand mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f
|
|
|
|
; Going to use only 8 bits for bi-linear so that I can do a pmullw.
|
|
; Currently at 6 bits so shift up by 2.
|
|
psllw mm2, 2
|
|
|
|
movq mm0, mm2
|
|
; Replicate VFrac value for bilinear
|
|
punpckhwd mm2, mm2
|
|
punpcklwd mm2, mm2
|
|
|
|
; Replicate UFrac Value for bilinear
|
|
punpcklwd mm0, mm0
|
|
punpcklwd mm0, mm0
|
|
|
|
movq dword ptr VFrac, mm2
|
|
movq dword ptr UFrac, mm0
|
|
|
|
;INT32 iU01 = iU00 + 1;
|
|
;INT32 iV01 = iV00 + 1;
|
|
packssdw mm1, mm1 ; replicate U and V value to upper 16 bit locations
|
|
paddw mm1, dword ptr IncHighandLow16
|
|
; This will make texture values be (High word to low word):
|
|
; iV01, iU00, iV00, iU01
|
|
; Need to do this to make texture look up for bilinear easier.
|
|
; I have to combine to get all combinations anyway. It just
|
|
; happens to be better for me to have iV00, iU01 pair first.
|
|
|
|
|
|
;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
|
|
; put mask in mm0 (shifted right by the LOD held in mm3) and replicate to match location for wrap/mirror/clamp
|
|
movd mm0, XpTex(uMaskU) ; Load U and V mask
|
|
|
|
; replicate mask if doing bilinear
|
|
punpckldq mm0, mm0
|
|
|
|
psrlw mm0, mm3
|
|
|
|
|
|
;INT16 iFlip, iClamp1, iClamp2, iClampMinT, iClampMaxT;
|
|
;INT16 iUoWAdj = (INT16)(pS->iUoW`'d_TexNum >> 12); // adjust to match W
|
|
;INT16 iVoWAdj = (INT16)(pS->iVoW`'d_TexNum >> 12);
|
|
;movq mm6, XpS(iUoW`'d_TexNum)
|
|
|
|
;movq mm6, MMWORD PTR Zero
|
|
pxor mm6, mm6
|
|
|
|
; TBD Data in SPANTEX needs to be rearange to make life simpler.
|
|
; I have rearranged some of it, but there still needs to be some
|
|
; fixes to it.
|
|
|
|
;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
|
|
movq mm7, mm1
|
|
movd mm4, XpTex(iFlipMaskU) ; This should copy U and V mask at the same time.
|
|
punpckldq mm4, mm4 ; copy UV
|
|
psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
|
|
|
|
pand mm7, mm4
|
|
|
|
;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
|
|
pcmpeqw mm7, MMWORD PTR Zero
|
|
|
|
;iFlip1 = uMaskU0 &~ iFlip1; ;iFlip2 = uMaskV0 &~ iFlip2; ;iFlip3 = uMaskU0 &~ iFlip3; ;iFlip4 = uMaskV0 &~ iFlip4;
|
|
pandn mm7, mm0
|
|
|
|
;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
|
|
pand mm1, mm0
|
|
|
|
;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
|
|
pxor mm1, mm7
|
|
|
|
;iClamp11 = MMX_cmpgtw(0, iUoWAdj); ;iClamp12 = MMX_cmpgtw(0, iVoWAdj);
|
|
mov edi, iTex
|
|
pcmpgtd mm6, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm6, mm6
|
|
|
|
;iClamp21 = MMX_cmpgtw(iOoWAdj, iUoWAdj); ;iClamp22 = MMX_cmpgtw(iOoWAdj, iVoWAdj);
|
|
movd mm7, XpS(iOoW)
|
|
punpckldq mm7, mm7 ; Make a copy of OoW to compare both UoW and VoW.
|
|
psrld mm7, 11 ; Make OoWs Precision Match UoWs.
|
|
pcmpgtd mm7, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm7, mm7
|
|
|
|
;iClampMinT1 = pTex->iClampMinU & iClamp11; ;iClampMinT2 = pTex->iClampMinV & iClamp12; ;iClampMinT3 = pTex->iClampMinU & iClamp13; ;iClampMinT4 = pTex->iClampMinV & iClamp14;
|
|
movd mm0, XpTex(iClampMinU)
|
|
|
|
punpckldq mm0, mm0
|
|
|
|
pand mm0, mm6
|
|
|
|
; Save clamp2 because pandn will destroy the value.
|
|
movq mm4, mm7
|
|
|
|
;iClampMaxT1 = pTex->iClampMaxU &~ iClamp21; ;iClampMaxT2 = pTex->iClampMaxV &~ iClamp22; ;iClampMaxT3 = pTex->iClampMaxU &~ iClamp23; ;iClampMaxT4 = pTex->iClampMaxV &~ iClamp24;
|
|
movd mm2, XpTex(iClampMaxU)
|
|
|
|
punpckldq mm2, mm2
|
|
psraw mm2, mm3 ; Shifts clamp max to correct bit location
|
|
|
|
pandn mm7, mm2 ; Since iClamp2 is already negated, I can just do an AND.
|
|
|
|
;iClamp21 &= ~iClamp11; ;iClamp22 &= ~iClamp12; ;iClamp23 &= ~iClamp13; ;iClamp24 &= ~iClamp14;
|
|
pandn mm6, mm4
|
|
|
|
;iClamp21 = pTex->iClampEnU &~ iClamp21; ;iClamp22 = pTex->iClampEnU &~ iClamp22; ;iClamp23 = pTex->iClampEnU &~ iClamp23; ;iClamp24 = pTex->iClampEnU &~ iClamp24;
|
|
movd mm2, XpTex(iClampEnU)
|
|
|
|
punpckldq mm2, mm2
|
|
|
|
pandn mm6, mm2
|
|
|
|
;iU00 &= ~iClamp21; ;iV00 &= ~iClamp22; ;iU01 &= ~iClamp23; ;iV01 &= ~iClamp24;
|
|
pandn mm6, mm1
|
|
|
|
;iU00 |= iClampMinT1; ;iV00 |= iClampMinT2; ;iU01 |= iClampMinT3; ;iV01 |= iClampMinT4;
|
|
por mm6, mm0
|
|
|
|
;iU00 |= iClampMaxT1; ;iV00 |= iClampMaxT2; ;iU01 |= iClampMaxT3; ;iV01 |= iClampMaxT4;
|
|
por mm6, mm7
|
|
movq mm4, mm6
|
|
|
|
; Making other two cases for texture addressing has to be simpler than
|
|
; this and not use so many registers. Puts U1 V0 U0 V1 into mm3.
|
|
; TBD Make this better.
|
|
; values are still stored as iV01, iU00, iV00, iU01
|
|
movq mm2, mm4
|
|
movq mm3, mm4
|
|
|
|
movq mm0, mm4
|
|
|
|
|
|
pmaddwd mm4, mm5 ; Throw in first address calculation.
|
|
; Just to get it started. Calculate
|
|
; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
|
|
|
|
; values are being changed to iV01, iU01, iV00, iU00
|
|
; seven instructions for this seems excessive.
|
|
pand mm2, MMWORD ptr MaskKeepUValues
|
|
pand mm3, MMWORD ptr MaskKeepVValues
|
|
movq mm1, mm2
|
|
psllq mm2, 32
|
|
psrlq mm1, 32
|
|
por mm3, mm2
|
|
por mm3, mm1
|
|
|
|
; From here until mov edi is code that is needed for border.
|
|
; all sign bits are stored in bytes so that border code can tell if uv went below zero.
|
|
|
|
; mm0 = iV01, iU00, iV00, iU01
|
|
; mm3 = iV01, iU01, iV00, iU00
|
|
; Need to rearrange values to be like so v1 u0 v1 u1 v0 u0 v0 u1 in bytes
|
|
; This is really bad. Just doing whatever to get it to work.
|
|
movq mm1, mm0
|
|
punpckldq mm1, mm3 ; This will make mm1 = v0 u0 v0 u1
|
|
movq mm2, mm3
|
|
punpckhdq mm2, mm0 ; This will make mm2 = v1 u0 v1 u1
|
|
packsswb mm1, mm2
|
|
movq mm0, mm1
|
|
|
|
pmaddwd mm3, mm5 ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
|
|
|
|
mov edi, XpTex(pBits+eax*4)
|
|
|
|
; was esi. Cant change to esi because it is the pointer to pTex
|
|
; which is used by Border and ColorKey. Use edi for now and
|
|
; call routines through memory. Figure out if this is bad.
|
|
|
|
; load the read texture routine address into a register early
|
|
;mov edi, XpCtx(pfnTexRead)
|
|
|
|
; iV0 iU1 address should be done by now.
|
|
movd eax, mm4
|
|
|
|
;UINT32 uTex00 = pCtx->pfnTexRead[0](iU00, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
; Combine U and V values before making call.
|
|
;call edi
|
|
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*4)
|
|
|
|
movd eax, mm3
|
|
movq mm7, mm1 ; Put TColor[iU0, uV0] in mm7
|
|
|
|
;UINT32 uTex10 = pCtx->pfnTexRead[0](iU01, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*4)
|
|
|
|
psrlq mm3, 32
|
|
psubw mm7, mm1
|
|
psllw mm1, 8
|
|
pmullw mm7, dword ptr UFrac
|
|
paddw mm7, mm1 ; Should I copy mm1 to another variable and do shift/add later?
|
|
movd eax, mm3
|
|
|
|
;UINT32 uTex01 = pCtx->pfnTexRead[0](iU00, iV01, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*4)
|
|
|
|
psrlq mm4, 32
|
|
movq mm6, mm1
|
|
movd eax, mm4
|
|
;UINT32 uTex11 = pCtx->pfnTexRead[0](iU01, iV01, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
;call edi
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*4)
|
|
|
|
;TexFiltBilinear(&pCtx->SI.TexCol[0], iUFrac, iVFrac, uTex00, uTex10, uTex01, uTex11);
|
|
; The amount of shifting instructions for this makes the other approach
|
|
; look pretty good.
|
|
psubw mm6, mm1
|
|
psllw mm1, 8
|
|
pmullw mm6, dword ptr UFrac ; TBD explain this code better.
|
|
movq mm4, mm7
|
|
paddw mm6, mm1
|
|
psrlw mm6, 8
|
|
psrlw mm7, 8
|
|
psubw mm6, mm7
|
|
pmullw mm6, dword ptr VFrac
|
|
paddw mm4, mm6
|
|
psrlw mm4, 8
|
|
|
|
; TBD shouldnt have to pack and then unpack later. Should keep in a register
|
|
;packuswb mm4, mm4
|
|
;movd XpCtxSI(TexCol), mm4
|
|
|
|
jmp mipinterp
|
|
NotLinear:
|
|
|
|
; Get LOD from mm3 and put in eax.
|
|
movd eax, mm3
|
|
|
|
; This helps in calculating texture address.
|
|
movzx edx, word ptr XpTex(iShiftPitch+eax*2)
|
|
add edx, 16
|
|
movd mm2, edx
|
|
movq mm5, MMWORD ptr Makelow16one
|
|
pslld mm5, mm2
|
|
|
|
por mm5, MMWORD ptr Makelow16one
|
|
|
|
;iU00 >>= 6;
|
|
;iV00 >>= 6;
|
|
psrad mm1, 6
|
|
packssdw mm1, mm1 ; Value needs to be packed since all wrap/mirror
|
|
; operations assume UV in low 32 bits.
|
|
|
|
;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
|
|
; put mask in mm0; it is shifted right by the LOD count held in mm3
|
|
movd mm0, XpTex(uMaskU) ; Load U and V mask
|
|
|
|
; iLOD0 shift value left over from above. TBD. Put this in in mip case
|
|
; Could do this one before or after the unpack also.
|
|
psrlw mm0, mm3
|
|
|
|
|
|
;INT16 iFlip, iClamp1, iClamp2, iClampMinT, iClampMaxT;
|
|
;INT16 iUoWAdj = (INT16)(pS->iUoW`'d_TexNum >> 12); // adjust to match W
|
|
;INT16 iVoWAdj = (INT16)(pS->iVoW`'d_TexNum >> 12);
|
|
;movq mm6, XpS(iUoW`'d_TexNum)
|
|
|
|
;movq mm6, MMWORD PTR Zero
|
|
pxor mm6, mm6
|
|
|
|
; TBD Data in SPANTEX needs to be rearange to make life simpler.
|
|
; I have rearranged some of it, but there still needs to be some
|
|
; fixes to it.
|
|
|
|
;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
|
|
movq mm7, mm1
|
|
movd mm4, XpTex(iFlipMaskU) ; This should copy U and V mask at the same time.
|
|
psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
|
|
|
|
pand mm7, mm4
|
|
|
|
;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
|
|
pcmpeqw mm7, MMWORD PTR Zero
|
|
|
|
;iFlip1 = uMaskU0 &~ iFlip1; ;iFlip2 = uMaskV0 &~ iFlip2; ;iFlip3 = uMaskU0 &~ iFlip3; ;iFlip4 = uMaskV0 &~ iFlip4;
|
|
pandn mm7, mm0
|
|
|
|
;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
|
|
pand mm1, mm0
|
|
|
|
;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
|
|
pxor mm1, mm7
|
|
|
|
;iClamp11 = MMX_cmpgtw(0, iUoWAdj); ;iClamp12 = MMX_cmpgtw(0, iVoWAdj);
|
|
mov edi, iTex
|
|
pcmpgtd mm6, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm6, mm6
|
|
|
|
;iClamp21 = MMX_cmpgtw(iOoWAdj, iUoWAdj); ;iClamp22 = MMX_cmpgtw(iOoWAdj, iVoWAdj);
|
|
movd mm7, XpS(iOoW)
|
|
punpckldq mm7, mm7 ; Make a copy of OoW to compare both UoW and VoW.
|
|
psrld mm7, 11 ; Make OoWs Precision Match UoWs.
|
|
pcmpgtd mm7, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm7, mm7
|
|
|
|
;iClampMinT1 = pTex->iClampMinU & iClamp11; ;iClampMinT2 = pTex->iClampMinV & iClamp12; ;iClampMinT3 = pTex->iClampMinU & iClamp13; ;iClampMinT4 = pTex->iClampMinV & iClamp14;
|
|
movd mm0, XpTex(iClampMinU)
|
|
|
|
pand mm0, mm6
|
|
|
|
; Save clamp2 because pandn will destroy the value.
|
|
movq mm4, mm7
|
|
|
|
;iClampMaxT1 = pTex->iClampMaxU &~ iClamp21; ;iClampMaxT2 = pTex->iClampMaxV &~ iClamp22; ;iClampMaxT3 = pTex->iClampMaxU &~ iClamp23; ;iClampMaxT4 = pTex->iClampMaxV &~ iClamp24;
|
|
movd mm2, XpTex(iClampMaxU)
|
|
psraw mm2, mm3 ; Shifts clamp max to correct bit location
|
|
|
|
pandn mm7, mm2 ; Since iClamp2 is already negated, I can just do an AND.
|
|
|
|
;iClamp21 &= ~iClamp11; ;iClamp22 &= ~iClamp12; ;iClamp23 &= ~iClamp13; ;iClamp24 &= ~iClamp14;
|
|
pandn mm6, mm4
|
|
|
|
;iClamp21 = pTex->iClampEnU &~ iClamp21; ;iClamp22 = pTex->iClampEnU &~ iClamp22; ;iClamp23 = pTex->iClampEnU &~ iClamp23; ;iClamp24 = pTex->iClampEnU &~ iClamp24;
|
|
movd mm2, XpTex(iClampEnU)
|
|
|
|
pandn mm6, mm2
|
|
|
|
;iU00 &= ~iClamp21; ;iV00 &= ~iClamp22; ;iU01 &= ~iClamp23; ;iV01 &= ~iClamp24;
|
|
pandn mm6, mm1
|
|
|
|
;iU00 |= iClampMinT1; ;iV00 |= iClampMinT2; ;iU01 |= iClampMinT3; ;iV01 |= iClampMinT4;
|
|
por mm6, mm0
|
|
|
|
;iU00 |= iClampMaxT1; ;iV00 |= iClampMaxT2; ;iU01 |= iClampMaxT3; ;iV01 |= iClampMaxT4;
|
|
por mm6, mm7
|
|
movq mm4, mm6
|
|
|
|
movq mm0, mm4
|
|
|
|
pmaddwd mm4, mm5 ; Throw in first address calculation.
|
|
; Just to get it started. Calculate
|
|
; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
|
|
|
|
; Point needs to be in same format as bilinear for border
|
|
packsswb mm0, mm0
|
|
|
|
mov edi, XpTex(pBits+eax*4)
|
|
|
|
; iV0 iU1 address should be done by now.
|
|
movd eax, mm4
|
|
|
|
;pCtx->SI.TexCol[0] = pCtx->pfnTexRead[0](iU00, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*4)
|
|
|
|
; TBD Currently have to pack and then unpack later. Should be able
|
|
; to leave the value in some register for a while. I would think.
|
|
;packuswb mm1, mm1
|
|
movq XpCtxSI(uBB), mm1
|
|
|
|
;----Calc second mip level pixel------------------------------------------------------------------------------
|
|
;****** iLOD0 was saved in mm3 from above.
|
|
|
|
;INT16 iLOD1 = (INT16)(min(iLOD0+(pS->iLOD > 0), pTex->cLOD));
|
|
pxor mm5, mm5
|
|
movd mm2, XpS(iLOD)
|
|
pcmpgtw mm2, mm5
|
|
psubw mm3, mm2
|
|
|
|
movd mm1, XpTex(cLOD)
|
|
movq mm2, mm3
|
|
pcmpgtw mm3, mm1
|
|
|
|
pand mm1, mm3
|
|
pandn mm3, mm2
|
|
por mm3, mm1
|
|
|
|
pand mm3, MMWORD PTR Val0xffff ; Get rid of any data in the high word.
|
|
|
|
|
|
; Get LOD from mm3 and put in eax.
|
|
movd eax, mm3
|
|
|
|
mov edx, iTex
|
|
movq mm1, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION)
|
|
|
|
;INT16 iShiftU1 = pTex->iShiftU - iLOD1;
|
|
;INT16 iShiftV1 = pTex->iShiftV - iLOD1;
|
|
movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
|
|
punpcklwd mm3, mm3 ; Make two copys of iLOD to subtract U and V
|
|
movd mm4, XpTex(iShiftU)
|
|
psubw mm4, mm3
|
|
psubw mm5, mm4
|
|
movq mm4, mm5
|
|
pand mm5, MMWORD PTR Val0xffff
|
|
pand mm3, MMWORD PTR Val0xffff ; Make iLOD back to only one copy
|
|
psrld mm4, 16
|
|
|
|
movd mm1, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION)
|
|
psrad mm1, mm5
|
|
movd mm2, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION + 4)
|
|
psrad mm2, mm4
|
|
|
|
punpckldq mm1, mm2
|
|
|
|
; This helps in calculating texture address.
|
|
movzx edx, word ptr XpTex(iShiftPitch+eax*2)
|
|
add edx, 16
|
|
movd mm2, edx
|
|
movq mm5, MMWORD ptr Makelow16one
|
|
pslld mm5, mm2
|
|
|
|
por mm5, MMWORD ptr Makelow16one
|
|
|
|
;iU00 >>= 6;
|
|
;iV00 >>= 6;
|
|
psrad mm1, 6
|
|
packssdw mm1, mm1 ; Value needs to be packed since all wrap/mirror
|
|
; operations assume UV in low 32 bits.
|
|
|
|
;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
|
|
; put mask in mm0; it is shifted right by the LOD count held in mm3
|
|
movd mm0, XpTex(uMaskU) ; Load U and V mask
|
|
|
|
; iLOD0 shift value left over from above. TBD. Put this in in mip case
|
|
; Could do this one before or after the unpack also.
|
|
psrlw mm0, mm3
|
|
|
|
|
|
;INT16 iFlip, iClamp1, iClamp2, iClampMinT, iClampMaxT;
|
|
;INT16 iUoWAdj = (INT16)(pS->iUoW`'d_TexNum >> 12); // adjust to match W
|
|
;INT16 iVoWAdj = (INT16)(pS->iVoW`'d_TexNum >> 12);
|
|
;movq mm6, XpS(iUoW`'d_TexNum)
|
|
|
|
;movq mm6, MMWORD PTR Zero
|
|
pxor mm6, mm6
|
|
|
|
; TBD Data in SPANTEX needs to be rearange to make life simpler.
|
|
; I have rearranged some of it, but there still needs to be some
|
|
; fixes to it.
|
|
|
|
;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
|
|
movq mm7, mm1
|
|
movd mm4, XpTex(iFlipMaskU) ; This should copy U and V mask at the same time.
|
|
psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
|
|
|
|
pand mm7, mm4
|
|
|
|
;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
|
|
pcmpeqw mm7, MMWORD PTR Zero
|
|
|
|
;iFlip1 = uMaskU0 &~ iFlip1; ;iFlip2 = uMaskV0 &~ iFlip2; ;iFlip3 = uMaskU0 &~ iFlip3; ;iFlip4 = uMaskV0 &~ iFlip4;
|
|
pandn mm7, mm0
|
|
|
|
;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
|
|
pand mm1, mm0
|
|
|
|
;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
|
|
pxor mm1, mm7
|
|
|
|
;iClamp11 = MMX_cmpgtw(0, iUoWAdj); ;iClamp12 = MMX_cmpgtw(0, iVoWAdj);
|
|
mov edi, iTex
|
|
pcmpgtd mm6, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm6, mm6
|
|
|
|
;iClamp21 = MMX_cmpgtw(iOoWAdj, iUoWAdj); ;iClamp22 = MMX_cmpgtw(iOoWAdj, iVoWAdj);
|
|
movd mm7, XpS(iOoW)
|
|
punpckldq mm7, mm7 ; Make a copy of OoW to compare both UoW and VoW.
|
|
psrld mm7, 11 ; Make OoWs Precision Match UoWs.
|
|
pcmpgtd mm7, XpS(UVoW + edi * SIZEOF_UV_UNION)
|
|
packssdw mm7, mm7
|
|
|
|
;iClampMinT1 = pTex->iClampMinU & iClamp11; ;iClampMinT2 = pTex->iClampMinV & iClamp12; ;iClampMinT3 = pTex->iClampMinU & iClamp13; ;iClampMinT4 = pTex->iClampMinV & iClamp14;
|
|
movd mm0, XpTex(iClampMinU)
|
|
|
|
pand mm0, mm6
|
|
|
|
; Save clamp2 because pandn will destroy the value.
|
|
movq mm4, mm7
|
|
|
|
;iClampMaxT1 = pTex->iClampMaxU &~ iClamp21; ;iClampMaxT2 = pTex->iClampMaxV &~ iClamp22; ;iClampMaxT3 = pTex->iClampMaxU &~ iClamp23; ;iClampMaxT4 = pTex->iClampMaxV &~ iClamp24;
|
|
movd mm2, XpTex(iClampMaxU)
|
|
psraw mm2, mm3 ; Shifts clamp max to correct bit location
|
|
|
|
pandn mm7, mm2 ; Since iClamp2 is already negated, I can just do an AND.
|
|
|
|
;iClamp21 &= ~iClamp11; ;iClamp22 &= ~iClamp12; ;iClamp23 &= ~iClamp13; ;iClamp24 &= ~iClamp14;
|
|
pandn mm6, mm4
|
|
|
|
;iClamp21 = pTex->iClampEnU &~ iClamp21; ;iClamp22 = pTex->iClampEnU &~ iClamp22; ;iClamp23 = pTex->iClampEnU &~ iClamp23; ;iClamp24 = pTex->iClampEnU &~ iClamp24;
|
|
movd mm2, XpTex(iClampEnU)
|
|
|
|
pandn mm6, mm2
|
|
|
|
;iU00 &= ~iClamp21; ;iV00 &= ~iClamp22; ;iU01 &= ~iClamp23; ;iV01 &= ~iClamp24;
|
|
pandn mm6, mm1
|
|
|
|
;iU00 |= iClampMinT1; ;iV00 |= iClampMinT2; ;iU01 |= iClampMinT3; ;iV01 |= iClampMinT4;
|
|
por mm6, mm0
|
|
|
|
;iU00 |= iClampMaxT1; ;iV00 |= iClampMaxT2; ;iU01 |= iClampMaxT3; ;iV01 |= iClampMaxT4;
|
|
por mm6, mm7
|
|
movq mm4, mm6
|
|
|
|
movq mm0, mm4
|
|
|
|
pmaddwd mm4, mm5 ; Throw in first address calculation.
|
|
; Just to get it started. Calculate
|
|
; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
|
|
|
|
; Point needs to be in same format as bilinear for border
|
|
packsswb mm0, mm0
|
|
|
|
mov edi, XpTex(pBits+eax*4)
|
|
|
|
; iV0 iU1 address should be done by now.
|
|
movd eax, mm4
|
|
|
|
;pCtx->SI.TexCol[0] = pCtx->pfnTexRead[0](iU00, iV00, pTex->iShiftU,
|
|
; pTex->pBits[iLOD0], &pCtx->Texture[0]);
|
|
mov edx, iTex
|
|
call dword ptr XpCtx(pfnTexRead + edx*4)
|
|
|
|
; TBD Currently have to pack and then unpack later. Should be able
|
|
; to leave the value in some register for a while. I would think.
|
|
;packuswb mm1, mm1
|
|
;movd XpCtxSI(TexCol), mm1
|
|
movq mm4, mm1
|
|
|
|
mipinterp:
|
|
|
|
;INT32 r0, r1;
|
|
;INT32 g0, g1;
|
|
;INT32 b0, b1;
|
|
;INT32 a0, a1;
|
|
|
|
;r0 = RGBA_GETRED(uTex0);
|
|
;r1 = RGBA_GETRED(uTex1);
|
|
|
|
;g0 = RGBA_GETGREEN(uTex0);
|
|
;g1 = RGBA_GETGREEN(uTex1);
|
|
|
|
;b0 = RGBA_GETBLUE(uTex0);
|
|
;b1 = RGBA_GETBLUE(uTex1);
|
|
|
|
;a0 = RGBA_GETALPHA(uTex0);
|
|
;a1 = RGBA_GETALPHA(uTex1);
|
|
dnl d_bcom()
|
|
;Tex1 in mm4, tex0 will be in mm1
|
|
|
|
movq mm1, XpCtxSI(uBB)
|
|
movq mm2, mm1
|
|
psubw mm4, mm1
|
|
psllw mm2, 8
|
|
|
|
;INT32 t = pS->iLOD & 0x7ff;
|
|
|
|
mov eax, XpS(iLOD)
|
|
shr eax, 3
|
|
and eax, 0ffh
|
|
movd mm3, eax
|
|
|
|
; Replicate
|
|
punpcklwd mm3, mm3
|
|
punpckldq mm3, mm3
|
|
|
|
;INT32 mt = 0x7ff - t;
|
|
;r0 = (mt*r0 + t*r1)>>11;
|
|
;g0 = (mt*g0 + t*g1)>>11;
|
|
;b0 = (mt*b0 + t*b1)>>11;
|
|
;a0 = (mt*a0 + t*a1)>>11;
|
|
pmullw mm4, mm3
|
|
paddw mm4, mm2
|
|
dnl d_ecom()
|
|
;movq mm4, XpCtxSI(uBB)
|
|
psrlw mm4, 8
|
|
|
|
packuswb mm4, mm4
|
|
|
|
push edi
|
|
mov edi, iTex
|
|
|
|
movd XpCtxSI(TexCol + edi * 4), mm4
|
|
|
|
d_UpdateUoWandVoW()
|
|
pop edi
|
|
|
|
d_UpdateLOD()
|
|
|
|
|
|
d_UpdateOoW()
|
|
|
|
;pS->iW = 0x00800000/(pS->iOoW>>16); // 9.23/1.15 = 8.8
|
|
d_WDivide()
|
|
|
|
|
|
push edi
|
|
mov edi, iTex
|
|
d_UoWVoWTimesW()
|
|
pop edi
|
|
|
|
; load the next bead address.
|
|
;mov eax, XpCtx(pfnTex1AddrEnd)
|
|
|
|
; pCtx->pfnTex1AddrEnd(pCtx, pP, pS);
|
|
;jmp eax
|
|
|
|
; We now need to return here
|
|
ret
|
|
|
|
;}
|
|
|
|
;-----------------------------------------------------------------------------
; _MMX_TexAddr_Wrapper
;
; Reference C:
;
;void TexAddr_Wrapper(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
;                     PD3DI_RASTSPAN pS)
;{
;    for (INT32 i = 0; i < (INT32)pCtx->cActTex; i++)
;    {
;        pCtx->pfnTexAddr[i](pCtx, pP, pS, i);
;    }
;    pCtx->pfnTexAddrEnd(pCtx, pP, pS);
;}
;
; Calls the texture address routine of each active texture stage in turn
; and then jumps (no call, so no extra return address is pushed) to the
; bead stored in pfnTexAddrEnd.
;
; The stage counter lives in the iTex memory variable, not in a register.
; It is stored before each call and reloaded into eax afterwards, so the
; called routine may overwrite eax. The address routine earlier in this
; file reads its stage number back from iTex.
;
; Loop exit uses a signed greater-or-equal test so that it follows the
; signed bound of the reference C. A negative cActTex therefore ends the
; loop at once, where an equality-only test would keep calling through
; pfnTexAddr slots beyond the valid ones.
;
; Writes: eax, iTex, flags, plus whatever the called routines write.
;-----------------------------------------------------------------------------
PUBLIC _MMX_TexAddr_Wrapper
_MMX_TexAddr_Wrapper:
        xor     eax, eax                        ; i = 0
        mov     iTex, eax                       ; publish stage number for the callees

LOOP_TEXTURES:
        ; eax holds the current stage number i here.
        cmp     eax, dword ptr XpCtx(cActTex)
        jge     DONE_LOOP_TEXTURES              ; signed: stop once i >= cActTex

        ; pCtx->pfnTexAddr[i](pCtx, pP, pS, i);
        call    dword ptr XpCtx(pfnTexAddr + eax * 4)

        mov     eax, iTex                       ; reload i, eax is not preserved by the callee
        inc     eax
        mov     iTex, eax                       ; i++ made visible to the next callee
        jmp     LOOP_TEXTURES

DONE_LOOP_TEXTURES:
        ; load the next bead address.
        mov     eax, XpCtx(pfnTexAddrEnd)

        ; pCtx->pfnTexAddrEnd(pCtx, pP, pS);
        jmp     eax
|
|
END
|