You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
914 lines
27 KiB
914 lines
27 KiB
;-----------------------------------------------------------------------------
;
; Monolith 7.  Perspective Correct Bi-linear gouraud modulated
;              565 input texture 16 bit Z buffered (LE or GT)
;              565 output.
;
;
;  Globals
;
;  StackPos     - stack pos holder
;  uSpans       - Number of spans to process
;  iSurfaceStep - what to add to screen pointer
;  iZStep       - what to add to Z buffer pointer
;  uPix         - Pixel Count
;
;  Changes from general MMX assembly.
;  1) Registers renamed to remove additional moves.
;  2) Since there are 4 texels used in bi-linear and
;     the 565 - 888 color conversion can convert 2
;     texels at a time, two texels are loaded, combined
;     and then converted at once, then moved into separate
;     registers.
;  3) Most register renaming was done in the bi-linear calculation
;     since the original code always read into mm1, which
;     caused a lot of additional moves.
;  4) Texcolor is not written to since it is just loaded
;     and then written.
;
;-----------------------------------------------------------------------------
|
|
|
|
INCLUDE iammx.inc
|
|
INCLUDE offs_acp.inc
|
|
|
|
|
|
; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
|
|
; at the LSB, then six bits of green, then five bits of red.
|
|
|
|
|
|
;TBD check to see if this value is correct.
|
|
COLOR_SHIFT equ 8
|
|
|
|
|
|
.586
|
|
.model flat
|
|
|
|
; Big separating lines split the code into span code
; and loop code.  If span and loop are not going to
; end up being combined then it will be easy to
; separate the code.
|
|
|
|
|
|
.data
|
|
|
|
; Need externs for all of the variables that are needed for various beads
|
|
|
|
EXTERN IncHighandLow16:MMWORD
|
|
EXTERN UFracVFracMask:MMWORD
|
|
EXTERN UV32to15Mask:MMWORD
|
|
EXTERN Makelow16one:MMWORD
|
|
EXTERN MaskKeepUValues:MMWORD
|
|
EXTERN MaskKeepVValues:MMWORD
|
|
EXTERN UFrac:MMWORD
|
|
EXTERN VFrac:MMWORD
|
|
EXTERN Zero:MMWORD
|
|
EXTERN memD3DTFG_POINT:MMWORD
|
|
EXTERN GiveUp:MMWORD
|
|
EXTERN LastW:MMWORD
|
|
EXTERN Val0x000a000a:MMWORD
|
|
EXTERN Val0xffff:MMWORD
|
|
EXTERN Val0x0000002000000020:MMWORD
|
|
EXTERN Val0x0000ffff0000ffff:MMWORD
|
|
|
|
|
|
EXTERN MaskRed565to888:MMWORD
|
|
EXTERN MaskGreen565to888:MMWORD
|
|
EXTERN MaskBlue565to888:MMWORD
|
|
|
|
EXTERN MaskRed555to888:MMWORD
|
|
EXTERN MaskGreen555to888:MMWORD
|
|
EXTERN MaskBlue555to888:MMWORD
|
|
|
|
EXTERN MaskAlpha1555to8888:MMWORD
|
|
EXTERN MaskRed1555to8888:MMWORD
|
|
EXTERN MaskGreen1555to8888:MMWORD
|
|
EXTERN MaskBlue1555to8888:MMWORD
|
|
|
|
; TBD.  I think that I want to use 0xffff instead of 0xff.  This will
; have to be checked.  There is a value very similar to this in
; buf write.
|
|
EXTERN SetAlphato0xffff:MMWORD
|
|
EXTERN SetAlphato0xff:MMWORD
|
|
|
|
; TODO: These equates are identical to the ones in texread.mas.  Maybe they should be in a common .inc file.
|
|
RedShift565to888 equ 8
|
|
GreenShift565to888 equ 5
|
|
BlueShift565to888 equ 3
|
|
|
|
RedShift555to888 equ 9
|
|
GreenShift555to888 equ 6
|
|
BlueShift555to888 equ 3
|
|
|
|
AlphaShift1555to8888 equ 16
|
|
RedShift1555to8888 equ 9
|
|
GreenShift1555to8888 equ 6
|
|
BlueShift1555to8888 equ 3
|
|
|
|
EXTERN BilinearMaskRed565to888:MMWORD
|
|
EXTERN BilinearMaskGreen565to888:MMWORD
|
|
EXTERN BilinearMaskBlue565to888:MMWORD
|
|
EXTERN BilinearShiftRed565to888:MMWORD
|
|
EXTERN BilinearShiftGreen565to888:MMWORD
|
|
EXTERN BilinearShiftBlue565to888:MMWORD
|
|
|
|
EXTERN Zero:MMWORD
|
|
EXTERN DW_One_One:MMWORD
|
|
EXTERN MaskOffAlpha:MMWORD
|
|
EXTERN ShiftTA:MMWORD
|
|
EXTERN Val0x00ff00ff00ff00ff:MMWORD
|
|
EXTERN Val0x000000ff00ff00ff:MMWORD
|
|
EXTERN Val0X0000000001000000:MMWORD
|
|
EXTERN AlphaVal128:MMWORD
|
|
EXTERN RGBVal128:MMWORD
|
|
|
|
|
|
EXTERN g_uDitherValue:MMWORD
|
|
EXTERN SetAlphato0xff:MMWORD
|
|
EXTERN u888to565RedBlueMask:MMWORD
|
|
EXTERN u888to565GreenMask:MMWORD
|
|
EXTERN u888to565Multiplier:MMWORD
|
|
EXTERN uVal0x000007ff03ff07ff:MMWORD
|
|
EXTERN uVal0x0000078003c00780:MMWORD
|
|
EXTERN u888to555RedBlueMask:MMWORD
|
|
EXTERN u888to555GreenMask:MMWORD
|
|
EXTERN u888to555Multiplier:MMWORD
|
|
EXTERN uVal0x000007ff07ff07ff:MMWORD
|
|
EXTERN uVal0x0000078007800780:MMWORD
|
|
|
|
|
|
;-----------------------------------------------------------------------------
; Span variables (file-scope scratch, written by _MMXMLRast_7)
;
;   StackPos - ESP as captured right after the entry "push ebp"; reloaded
;              into ESP just before the final "pop ebp".
;   uSpans   - spans still to be processed in the current primitive.
;
; NOTE(review): because these live in .data rather than on the stack, the
; routine looks non-reentrant / not safe to run on two threads at once.
; Confirm against callers.
;-----------------------------------------------------------------------------
StackPos        dd      ?
uSpans          dd      ?

;-----------------------------------------------------------------------------
; Loop variables (file-scope scratch, written once per span)
;
;   iSurfaceStep - signed amount added to pS->pSurface after each pixel.
;   iZStep       - signed amount added to pS->pZ after each pixel.
;   uPix         - pixels still to be processed in the current span.
;-----------------------------------------------------------------------------
iSurfaceStep    dd      ?
iZStep          dd      ?
uPix            dd      ?

;-----------------------------------------------------------------------------

|
|
|
|
.code
|
|
|
|
;-----------------------------------------------------------------------------
; _MMXMLRast_7
;
; Entry point of the monolithic rasterizer.  Walks the primitive list that
; starts at pCtx->pPrim; for each primitive walks the spans that follow it
; in memory, and for each span runs the per-pixel loop.
;
; In:    first stack argument ([esp+4] at entry) = pCtx
; Out:   eax = 0 (the epilogue's "return S_OK")
;
; Register roles kept for the whole routine:
;   ebx = pCtx      ecx = pP (current primitive)      ebp = pS (current span)
;
; ebx, esi, edi and ebp are saved here and restored in the epilogue.  ESP is
; parked in StackPos because ebp is reused as a data pointer.
;-----------------------------------------------------------------------------
PUBLIC  _MMXMLRast_7
_MMXMLRast_7:
        push    ebp
        mov     eax, esp                        ; eax -> saved ebp; [eax+4] = return address
        mov     StackPos, esp
        sub     esp, 0Ch                        ; This will need to change if stack frame size changes.
        push    ebx
        push    esi
        push    edi

        ; Put pCtx into ebx
        mov     ebx, [eax+8]                    ; first argument

        ;PD3DI_RASTPRIM pP = pCtx->pPrim;
        mov     ecx, [ebx+RASTCTX_pPrim]

        ;while (pP)
        ;{
PrimLoop:
        test    ecx, ecx
        jz      ExitPrimLoop

        ;UINT16 uSpans = pP->uSpans;
        movzx   eax, word ptr [ecx+RASTPRIM_uSpans]
        mov     uSpans, eax

        ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
        mov     ebp, ecx
        add     ebp, SIZEOF_RASTPRIM            ; spans start right after the primitive header

;while (uSpans-- > 0)
|
|
;{
|
|
SpanLoop:
|
|
mov edx, uSpans
|
|
mov eax, edx
|
|
dec eax
|
|
mov uSpans, eax
|
|
test edx, edx
|
|
jle ExitSpanLoop
|
|
|
|
;pCtx->pfnBegin(pCtx, pP, pS);

;-----------------------------------------------------------------------------
; LoopAny code inserted here.  This is to get rid of an extra
; jump.
;-----------------------------------------------------------------------------

; Setup Code begins
;
; Per-span setup.  On entry: ebx = pCtx, ecx = pP, ebp = pS.
; Writes uPix, iZStep, iSurfaceStep and the SPANITER fields iDW, iU1, iV1
; and iSpecialW.  Clobbers eax, edx, esi, mm5.

        ;uPix = pS->uPix;
        movzx   eax, word ptr [ebp+RASTSPAN_uPix]
        mov     uPix, eax

        ;pCtx->SI.iDW = 0x0;
        mov     dword ptr [ebx+RASTCTX_SI+SPANITER_iDW], 0

        ;pCtx->SI.iUd_TexNum = d_WTimesUVoW(pS->iW,pS->iUoW1);
        ;pCtx->SI.iVd_TexNum = d_WTimesUVoW(pS->iW,pS->iVoW1);
        ; Each product is the high 32 bits of (iW << 4) * (i?oW1 << 8).
        mov     esi, [ebp+RASTSPAN_iW]
        movq    mm5, MMWORD PTR [ebp+RASTSPAN_iUoW1]    ; low dword = iUoW1, high dword = iVoW1
        shl     esi, 4
        pslld   mm5, 8
        movd    eax, mm5
        psrlq   mm5, 32                         ; bring iVoW1 << 8 down to the low dword
        imul    esi
        mov     [ebx+RASTCTX_SI+SPANITER_iU1], edx
        movd    eax, mm5
        imul    esi
        mov     [ebx+RASTCTX_SI+SPANITER_iV1], edx

        ;if (pP->iDOoWDX > 0)
        ;{
        ; The branch sense follows the C reference: a positive iDOoWDX takes
        ; the "first 3 pixels" case.  (The earlier "jg" sent positive values
        ; to the else body.)
        cmp     dword ptr [ecx+RASTPRIM_iDOoWDX], 0
        jle     SpecialWLastMonTest
        ;// iSpecialW should be negative for the first 3 pixels of span
        ;pCtx->SI.iSpecialW = -3;
        mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], -3
        jmp     DoneSpecialWifMonTest
        ;}
        ;else
        ;{
SpecialWLastMonTest:
        ;// iSpecialW should be negative for the last 3 pixels of span
        ;pCtx->SI.iSpecialW = 0x7fff - uPix;
        mov     eax, 07fffh
        sub     eax, uPix
        ;pCtx->SI.iSpecialW += 5;   // this may wrap, but it should
        add     eax, 5
        ; iSpecialW is accessed as a 16-bit field everywhere else in this
        ; routine, so store only ax and leave the following bytes alone.
        mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], ax
        ;}
DoneSpecialWifMonTest:

        ;if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
        ;{
        ;    iZStep = -pCtx->iZStep;
        ;    iSurfaceStep = -pCtx->iSurfaceStep;
        ;}
        ;else
        ;{
        ;    iZStep = pCtx->iZStep;
        ;    iSurfaceStep = pCtx->iSurfaceStep;
        ;}
        mov     eax, [ebx+RASTCTX_iZStep]
        mov     edx, [ebx+RASTCTX_iSurfaceStep]
        test    dword ptr [ecx+RASTPRIM_uFlags], D3DI_RASTPRIM_X_DEC
        jz      LeftToRightSpan
        neg     eax                             ; X_DEC set: walk both buffers backwards
        neg     edx
LeftToRightSpan:
        mov     iZStep, eax
        mov     iSurfaceStep, edx
DoneSpanDirif:

; Setup Code Ends

; ----------------------------------------------------------------------------------------------------------------
; Loop Code Begins

        ;//while (1)
        ;//{
PixelLoop:

        ; Ztestcode -- 16 bit unsigned Z buffer format
        ;   edx is uZ, eax is uZB, esi is pS->pZ
        ;UINT16 uZ  = (UINT16)(pS->uZ>>15);
        ;UINT16 uZB = *((UINT16*)pS->pZ);
        mov     edx, [ebp+RASTSPAN_uZ]
        mov     esi, [ebp+RASTSPAN_pZ]

        ;pS->uZ += pP->iDZDX;
        ; Z is stepped here, before the test, so it advances on pass and fail alike.
        movd    mm4, edx
        paddd   mm4, [ecx+RASTPRIM_iDZDX]
        movd    [ebp+RASTSPAN_uZ], mm4          ; only the low dword is written back

        shr     edx, 15
        movzx   eax, word ptr [esi]

        ;if ((pCtx->iZXorMask)^(uZ > uZB))
        ;   !(uZ > uZB) <==>
        ;    (uZ <= uZB) <==>
        ;    (uZ <  uZB+1) <==>
        ; The pixel fails when bit 31 of ((uZB - uZ) xor iZXorMask) is set.
        ; xor already leaves that bit in SF, so no separate test is needed.
        sub     eax, edx
        xor     eax, [ebx+RASTCTX_iZXorMask]
        js      FailLabel

        mov     word ptr [esi], dx              ; Z test passed: store the new depth

; texturecode

;---------------------------------------------------------------------------
;void Tex1Addr_TexAddrWrapMirror_Persp_Bilinear_NoMip(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
;                                 PD3DI_RASTSPAN pS)
;{
;PD3DI_SPANTEX pTex = &pCtx->Texture[0];
;
; Produces:  mm1 = (V << 32) | U, with 6 fraction bits, biased by -0x20
;            mm5 = per-dword multiplier used later by pmaddwd
;            esi = pTex
; Clobbers edx, mm2, mm4.

        mov     esi, [ebx+RASTCTX_pTexture]

        ; Doing UV calculation a little more accurately, exactly like the C code.
        ;
        ; iU and iV are shifted right not by (TEX_FINAL_SHIFT - iShiftU0) but by
        ; (TEX_FINAL_SHIFT - iShiftU0 - 6).  iShiftU0 = pTex->iShiftU - iLOD0
        ;   (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
        ;   (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)
        ;
        ; COMMENT1**
        ; If textures have a max of 1024 then iShiftU0 would be at most 10, which
        ; would make (TEX_FINAL_SHIFT - iShiftU - 6) at most zero.  This is why 6
        ; was chosen.  It also gives bi-linear 6 bits of precision; it was said
        ; that only five were needed.
        ;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
        ;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
        ; iLOD0 is zero in the monolithic case, so no subtraction is needed.
        movd    mm4, [esi+SPANTEX_iShiftU]      ; one dword load picks up iShiftU (low word) and the word after it
        movq    mm5, MMWORD PTR Val0x000a000a   ; This is TEX_FINAL_SHIFT - 6 = 10.
        psubw   mm5, mm4                        ; per word: 10 - shift
        movq    mm4, mm5
        pand    mm5, MMWORD PTR Val0xffff       ; mm5 = shift count for U (low word only)
        psrld   mm4, 16                         ; mm4 = shift count for V (was the second word)

        movd    mm1, [ebx+RASTCTX_SI+SPANITER_iU1]
        movd    mm2, [ebx+RASTCTX_SI+SPANITER_iV1]
        psrad   mm1, mm5
        psrad   mm2, mm4

        punpckldq mm1, mm2                      ; mm1 = V : U
        psubd   mm1, MMWORD PTR Val0x0000002000000020   ; subtract 0x20 from each of U and V

        ; Texture pitch cannot be calculated, so it must be derived from iShiftPitch.
        ; ----------------- Start of hack
        ; ATTENTION  This is really hacked right now, just to get it working.
        ; Pitch would be better here, instead of shift pitch.
        ; With the actual pitch, this would be two moves and a shift.
        movzx   edx, word ptr [esi+SPANTEX_iShiftPitch]
        add     edx, 16                         ; aim the shifted 1 at the HIGH word of each dword
        movd    mm2, edx
        movq    mm5, MMWORD PTR Makelow16one
        pslld   mm5, mm2
        ;pslld  mm5, 16         ;. Use this after hack.
        ; not needed in hacked version since 16 is added to the shift value.
        ; ----------------- End of hack

        por     mm5, MMWORD PTR Makelow16one
                                                ; Make the low 16 bits of each dword one.
                                                ; This helps in calculating the texture address:
                                                ; pmaddwd with mm5 yields low_word*1 + high_word*(1 << iShiftPitch).

; Gets the U and V values into mm1 so that they can be mirrored, wrapped or
; clamped.  This can be done for two values in the point case
; or four values in the bilinear case.
;
; In:   mm1 = V : U (dwords, 6 fraction bits)
; Out:  mm1 = four 16-bit integer coordinates (see order below)
;       UFrac / VFrac (memory) = 8-bit fraction replicated into all four words
; Clobbers mm0, mm2.

        ;INT32 iUFrac = iU00 & 0x03f;
        ;INT32 iVFrac = iV00 & 0x03f;
        ;iU00 >>= 6;
        ;iV00 >>= 6;
        movq    mm2, mm1
        pand    mm2, MMWORD PTR UFracVFracMask  ; UFracVFracMask = 0x0000003f0000003f
        psrad   mm1, 6
        ;pand   mm1, MMWORD PTR Val0x0000ffff0000ffff

        ; Going to use only 8 bits for bi-linear so that a pmullw can be used.
        ; Currently at 6 bits, so shift up by 2.
        psllw   mm2, 2

        movq    mm0, mm2

        ; Replicate VFrac value for bilinear (high dword -> all four words)
        punpckhwd mm2, mm2
        punpcklwd mm2, mm2
        movq    MMWORD PTR VFrac, mm2

        ; Replicate UFrac value for bilinear (low dword -> all four words)
        punpcklwd mm0, mm0
        punpcklwd mm0, mm0
        movq    MMWORD PTR UFrac, mm0

        ;INT32 iU01 = iU00 + 1;
        ;INT32 iV01 = iV00 + 1;
        packssdw mm1, mm1                       ; replicate U and V value to upper 16 bit locations
        paddw   mm1, MMWORD PTR IncHighandLow16
                                                ; This will make texture values be (high word to low word):
                                                ;   iV01, iU00, iV00, iU01
                                                ; Needed to make the texture lookup for bilinear easier.
                                                ; All combinations have to be formed anyway; it just
                                                ; happens to be better to have the iV00, iU01 pair first.

;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0;     UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
;
; Wrap / mirror of all four packed coordinates at once.
; In:   mm1 = iV01, iU00, iV00, iU01 (16-bit words), esi = pTex
; Out:  mm1 = the same four words, masked and conditionally flipped
; Clobbers mm0, mm4, mm7.
; Monolith cases assume that iLOD0 is zero, so the masks are used unshifted.

        ; Flip masks: one dword load covers iFlipMaskU and the word after it;
        ; duplicate into both dwords to line up with the four coordinates.
        ; (Point sampling would not need the replication.)
        movd    mm4, [esi+SPANTEX_iFlipMaskU]
        punpckldq mm4, mm4

        ; Coordinate masks, loaded and replicated the same way.
        movd    mm0, [esi+SPANTEX_uMaskU]       ; Load U and V mask
        punpckldq mm0, mm0

        ;INT16 iFlip;
        ;iFlip1 = iU00 & pTex->iFlipMaskU;  ;iFlip2 = iV00 & pTex->iFlipMaskV;
        ;iFlip3 = iU01 & pTex->iFlipMaskU;  ;iFlip4 = iV01 & pTex->iFlipMaskV;
        movq    mm7, mm1
        pand    mm7, mm4

        ;iFlipN = MMX_cmpeqw(iFlipN, 0);
        pcmpeqw mm7, MMWORD PTR Zero            ; 0xffff in lanes whose flip bit is clear

        ;iFlip1 = uMaskU0 & ~iFlip1;  ;iFlip2 = uMaskV0 & ~iFlip2;  (likewise 3 and 4)
        pandn   mm7, mm0                        ; lanes with the flip bit set receive the coordinate mask

        ;iU00 &= uMaskU0;  ;iV00 &= uMaskV0;  ;iU01 &= uMaskU0;  ;iV01 &= uMaskV0;
        pand    mm1, mm0

        ;iU00 ^= iFlip1;  ;iV00 ^= iFlip2;  ;iU01 ^= iFlip3;  ;iV01 ^= iFlip4;
        pxor    mm1, mm7                        ; invert the masked bits where flipping applies

; Result is in mm1 now, since TexAddrAll ends up that way.
; Making the other two cases for texture addressing has to be simpler than
; this and not use so many registers.
; TBD Make this better.
;
; In:   mm1 = iV01, iU00, iV00, iU01 (high word to low word)
;       mm5 = per-dword multiplier { high word = 1 << iShiftPitch, low word = 1 }
;       esi = pTex
; Out:  mm1 = two texel offsets: low  dword = iU01 + (iV00 << iShiftPitch)
;                                high dword = iU00 + (iV01 << iShiftPitch)
;       mm3 = two texel offsets: low  dword = iU00 + (iV00 << iShiftPitch)
;                                high dword = iU01 + (iV01 << iShiftPitch)
;       edi = pTex->pBits
; The mm3 pairing assumes MaskKeepUValues / MaskKeepVValues select the U and
; V words respectively, as their names say -- TODO confirm where they are defined.
; Clobbers mm2, mm4.

        movq    mm2, mm1
        movq    mm3, mm1

        ; Calculate 1st and 3rd texel addresses
        pmaddwd mm1, mm5

        ; Values are being changed to iV01, iU01, iV00, iU00:
        ; keep the V words in place and swap the two U words between dwords.
        ; Seven instructions for this seems excessive.
        pand    mm2, MMWORD PTR MaskKeepUValues
        pand    mm3, MMWORD PTR MaskKeepVValues
        movq    mm4, mm2
        psllq   mm2, 32                         ; low-dword U moves up
        psrlq   mm4, 32                         ; high-dword U moves down
        por     mm3, mm2
        por     mm3, mm4

        ; From here until "mov edi" is where border code would go:
        ; all sign bits are stored in bytes so that border code can tell if uv went below zero.

        ; Calculate 2nd and 4th texel addresses
        pmaddwd mm3, mm5

        mov     edi, [esi+SPANTEX_pBits]
                                                ; was esi.  Can't change to esi because it is the pointer to pTex,
                                                ; which is used by Border and ColorKey.  Use edi for now and
                                                ; call routines through memory.  Figure out if this is bad.

; load the read texture routine address into a register early
;mov    edi, [ebx+RASTCTX_pfnTexRead]

;UINT32 uTex00 = pCtx->pfnTexRead(iU00, iV00, pTex->iShiftU,
;                               pTex->pBits[iLOD0], &pCtx->Texture[0]);
; Combine U and V values before making call.
;call   edi

; -------------------- In Monolithic version calls are inlined.

;D3DCOLOR TexRead_B5G6R5_NoBorder(INT32 iU, INT32 iV, INT32 iShiftU, PUINT8 pBits, PD3DI_SPANTEX pTex)
;{
;
; Fetch, convert and horizontally blend the first texel pair.
; In:   edi = texture bits, mm1 / mm3 = texel offsets (low dwords used here)
; Out:  mm7 = 2nd + (1st - 2nd) * UFrac, four words in 8.8
;       mm2 = 0, mm3 = 4th texel offset in its low dword
; Clobbers eax, mm4, mm5.

        ; Color convert 2 pixels at a time
        pxor    mm2, mm2                        ; zero source for the byte -> word unpacks

        ; 1st (mm1) and 2nd (mm3) texel; texels are 16 bits, hence the *2
        movd    eax, mm1                        ; load 1st texel address
        movzx   eax, word ptr [edi+2*eax]
        movd    mm7, eax                        ; mm7 = 1st texel

        movd    eax, mm3                        ; load 2nd texel address
        movzx   eax, word ptr [edi+2*eax]
        movd    mm4, eax                        ; mm4 = 2nd texel

        ; mm7 = 2nd texel (high 32 bits), 1st texel (low 32 bits)
        punpckldq mm7, mm4

        ; 565 -> 888 on both texels at once: isolate each field, then shift it into its byte
        movq    mm5, mm7
        movq    mm4, mm7
        pand    mm5, MMWORD PTR BilinearMaskRed565to888     ; = 0x0000f8000000f800
        pslld   mm5, MMWORD PTR BilinearShiftRed565to888    ; = 8
        pand    mm7, MMWORD PTR BilinearMaskGreen565to888   ; = 0x000007e0000007e0
        pslld   mm7, MMWORD PTR BilinearShiftGreen565to888  ; = 5
        pand    mm4, MMWORD PTR BilinearMaskBlue565to888    ; = 0x0000001f0000001f
        pslld   mm4, MMWORD PTR BilinearShiftBlue565to888   ; = 3

        por     mm7, mm5                        ; combine R+G
        por     mm7, mm4                        ; combine (R+G) + B
        movq    mm4, mm7                        ; copy 1st and 2nd texels

        ; Widen each 8-bit component to 16 bits (high byte zero):
        ; mm4 <- 2nd texel (high dword), mm7 <- 1st texel (low dword)
        punpckhbw mm4, mm2
        punpcklbw mm7, mm2

        psrlq   mm3, 32                         ; shift 4th texel address into low 32 bits

        ; mm7 = final calc on 1st and 2nd texel
        psubw   mm7, mm4                        ; 1st - 2nd
        psllw   mm4, 8                          ; 2nd as 8.8
        pmullw  mm7, MMWORD PTR UFrac
        paddw   mm7, mm4

; 3rd (mm1) and 4th (mm3) texel
;
; Fetch, convert and horizontally blend the second texel pair, then blend
; the two rows vertically.
; In:   edi = texture bits, mm1 high dword = 3rd texel offset,
;       mm3 low dword = 4th texel offset, mm2 = 0, mm7 = first-row blend (8.8)
; Out:  mm1 = filtered texel, four 16-bit words each holding an 8-bit value
; Clobbers eax, mm4, mm5, mm6, mm7.

        psrlq   mm1, 32                         ; shift 3rd texel address into low 32 bits
        movd    eax, mm3                        ; load 4th texel address
        movzx   eax, word ptr [edi+2*eax]
        movd    mm6, eax                        ; mm6 = 4th texel

        movd    eax, mm1                        ; load 3rd texel address
        movzx   eax, word ptr [edi+2*eax]
        movd    mm4, eax                        ; mm4 = 3rd texel

        ; mm6 = 3rd texel (high 32 bits), 4th texel (low 32 bits)
        punpckldq mm6, mm4

        ; 565 -> 888 on both texels at once
        movq    mm5, mm6
        movq    mm4, mm6
        pand    mm5, MMWORD PTR BilinearMaskRed565to888     ; = 0x0000f8000000f800
        pslld   mm5, MMWORD PTR BilinearShiftRed565to888    ; = 8
        pand    mm6, MMWORD PTR BilinearMaskGreen565to888   ; = 0x000007e0000007e0
        pslld   mm6, MMWORD PTR BilinearShiftGreen565to888  ; = 5
        pand    mm4, MMWORD PTR BilinearMaskBlue565to888    ; = 0x0000001f0000001f
        pslld   mm4, MMWORD PTR BilinearShiftBlue565to888   ; = 3

        por     mm6, mm5                        ; combine R+G
        por     mm6, mm4                        ; combine (R+G) + B
        movq    mm4, mm6                        ; copy 3rd and 4th texels

        ; Widen each 8-bit component to 16 bits (high byte zero):
        ; mm4 <- 3rd texel (high dword), mm6 <- 4th texel (low dword)
        punpckhbw mm4, mm2
        punpcklbw mm6, mm2

        ; mm6 = final calc on 3rd and 4th texel
        psubw   mm6, mm4                        ; 4th - 3rd
        psllw   mm4, 8                          ; 3rd as 8.8
        pmullw  mm6, MMWORD PTR UFrac
        paddw   mm6, mm4

        ; mm1 = final calc on 1st+2nd texel and 3rd+4th texel
        movq    mm1, mm7                        ; keep the first row at 8.8
        psrlw   mm6, 8                          ; both rows down to 8 bits for the subtract
        psrlw   mm7, 8
        psubw   mm6, mm7                        ; second row - first row
        pmullw  mm6, MMWORD PTR VFrac
        paddw   mm1, mm6

        psrlw   mm1, 8

;modulate
;
; Multiply the filtered texel by the span's iterated color, convert to 565
; and store one pixel.
; In:   mm1 = texel (four words, 8 bits each), ebp = pS
; Clobbers edx, edi, mm3, mm4.

; ATTENTION shouldn't have to move to and from memory in monolithic case.  Use registers.

        ;UINT16 uB = pS->uB>>COLOR_SHIFT;
        ;UINT16 uG = pS->uG>>COLOR_SHIFT;
        ;UINT16 uR = pS->uR>>COLOR_SHIFT;
        movq    mm4, [ebp+RASTSPAN_uB]          ; four consecutive 16-bit channels starting at uB
        psrlw   mm4, COLOR_SHIFT                ; COLOR_SHIFT is set to 8.

        ;UINT16 uTB = (UINT16)(RGBA_GETBLUE(pCtx->SI.TexCol[0]));
        ;UINT16 uTG = (UINT16)(RGBA_GETGREEN(pCtx->SI.TexCol[0]));
        ;UINT16 uTR = (UINT16)(RGBA_GETRED(pCtx->SI.TexCol[0]));
        ;UINT16 uTA = (UINT16)(RGBA_GETALPHA(pCtx->SI.TexCol[0]));

        ; this is a PMULLW, which works on unsigned 16 bit quantities
        ;pCtx->SI.uBB = uB*uTB;
        ;pCtx->SI.uBG = uG*uTG;
        ;pCtx->SI.uBR = uR*uTR;
        ;pCtx->SI.uBA = uTA<<COLOR_SHIFT;
        pmullw  mm4, mm1

        ; write
        ;*(PUINT16)pS->pSurface =
        ;    ((pCtx->SI.uBR >>  0) & 0xf800) |
        ;    ((pCtx->SI.uBG >>  5) & 0x07e0) |
        ;    ((pCtx->SI.uBB >> 11) & 0x001f);

        psrlw   mm4, 8                          ; Convert color from 8.8 to 0.8
        packuswb mm4, mm7                       ; pack one color into the low dword; the high
                                                ; dword (from mm7) never reaches the store below
        mov     edi, [ebp+RASTSPAN_pSurface]

        movq    mm3, mm4
        pand    mm3, MMWORD PTR u888to565GreenMask
        pand    mm4, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm4, MMWORD PTR u888to565Multiplier
        por     mm4, mm3
        psrld   mm4, 5

        movd    edx, mm4
        mov     [edi], dx                       ; low 16 bits are the 565 pixel

FailLabel:
; Reached after a pixel has been written and also directly from a failed
; Z test.  Counts the pixel, then steps the iterated color, U/W, V/W and 1/W.
; Out (when the span continues):  mm5 = updated iVoW1 : iUoW1
;                                 eax = updated iOoW
; Both are consumed by the code that follows.  Clobbers mm1.

        ;if (--uPix <= 0)
        ;    break;
        dec     uPix
        jle     ExitPixelLoop

        ; Doing update code after span length test so that an extra update is not done.

        ;void TestFail_Gouraud_PerspTex1_NoSpecularFog(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
        ;                                PD3DI_RASTSPAN pS)
        ;{

        ;pS->iUoW1 += pP->iDUoW1DX;
        ;pS->iVoW1 += pP->iDVoW1DX;
        movq    mm5, [ebp+RASTSPAN_iUoW1]
        paddd   mm5, [ecx+RASTPRIM_iDUoW1DX]    ; two dword adds in one instruction
        movq    [ebp+RASTSPAN_iUoW1], mm5

        ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
        ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
        movq    mm1, [ebp+RASTSPAN_uB]
        paddw   mm1, [ecx+RASTPRIM_iDBDX]       ; four word adds in one instruction
        movq    [ebp+RASTSPAN_uB], mm1

        ;pS->iOoW += pP->iDOoWDX;
        mov     eax, [ebp+RASTSPAN_iOoW]
        add     eax, [ecx+RASTPRIM_iDOoWDX]
        mov     [ebp+RASTSPAN_iOoW], eax

;INT32 iWn0 = pS->iW + pCtx->SI.iDW;    // 1.15.16
;
; Computes the next W by refining a guess against the stepped 1/W
; (a "guess * (2 - iOoW * guess)" step).
; In:   eax = pS->iOoW (already stepped for this pixel), ebx = pCtx, ebp = pS
; Out:  pS->iW = new W, pCtx->SI.iDW = new W - previous W, esi = new W
; Clobbers eax, edx, edi; writes the LastW and GiveUp scratch variables.
; No MMX register is touched here, so mm5 is preserved for the code that follows.
; TODO Could do this and the OoW add at the same time with MMX.

        mov     edx, [ebp+RASTSPAN_iW]
        mov     LastW, edx                      ; Save iW to calc iDW for next time.
        add     edx, [ebx+RASTCTX_SI+SPANITER_iDW]      ; edx = iWn0, the first guess

        ;if (pCtx->SI.iSpecialW < 0)
        ;{
        xor     edi, edi
        cmp     di, word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
        jle     DontDoSpecialW1                 ; 0 <= iSpecialW: take the single-step path

        ;if (iWn0 < 0)
        ;{
        ;    iWn0 = pS->iW >> 1;                // use iW/2 as a guess, instead
        ;}
        ; Branch sense follows the C reference: only a NEGATIVE guess is
        ; replaced by iW/2.  (The earlier "jl" skipped the replacement for
        ; negative guesses and applied it to non-negative ones.)
        cmp     edx, edi
        jge     WOutOfRange1
        mov     edx, LastW
        sar     edx, 1
WOutOfRange1:

        ;VAL32 iWn1;
        ;INT16 iWnOld = iWn0 + 0x100;           // make sure while fails first time
        ; Not needed here: the convergence test is done after the step, which
        ; guarantees the body executes once.

        ;INT32 iGiveUp = 7;
        mov     GiveUp, 8                       ; Pre decrementing instead of post decrementing.

        ;while((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
        ;{
SpecW1Loop1:
        ; Could move this to bottom of loop and combine results somehow.
        ; TBD look at it more.
        dec     GiveUp
        jz      ExitSpecWLoop1                  ; iteration budget used up: keep the current edx

        ; Get ready to do Two minus iOoW*iW
        mov     esi, (1 SHL 16)

        ;iWnOld = iWn0;
        mov     edi, edx

        ; Result should be close to one so we want most of the
        ; precision in the low bits.  Need to give more bits of
        ; leeway since these are the bad cases.
        ; iWn1 = imul32h(pS->iOoW, iWn0);      // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
        imul    edx                             ; edx:eax = iOoW * iWn0; only edx is used

        ;iWn1 = (1L<<16) - iWn1;                // 2.0 - iWn1
        sub     esi, edx

        ;while(iWn1.i < 0)
        ;{
SpecW1Loop2:
        test    esi, esi
        jns     SpecW1ExitLoop2                 ; This jump should be predicted correctly most of the time.
        ;iWn1=(iWn1+(1L<<15))>>1;               // iWn1 = (iWn1 + 1.0)/2
        add     esi, (1 SHL 15)
        sar     esi, 1
        jmp     SpecW1Loop2
        ;}

SpecW1ExitLoop2:
        ;iWn1 <<= 15;                           // 1.16.15 << 15 = 1.1.30
        mov     eax, edi
        shl     eax, 5                          ; 1.15.16 << 5 = 1.10.21   TBD Can I shift off upper bits??
        shl     esi, 12                         ; 4.15 << 12 = 4.27

        ;iWn0 = imul32h(iWn1, iWn0)<<2;         // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
        ; Actually 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16.  No need for post shift.
        mul     esi                             ; edx = refined iWn0

        ; (abs(iWnOld - iWn0) > 0x20) test
        sub     edi, edx
        mov     eax, edi                        ; branch-free abs: eax = 0 or -1
        sar     eax, 31
        xor     edi, eax
        sub     edi, eax

        cmp     edi, 020h                       ; Assuming that loop will only happen once.
        jbe     ExitSpecWLoop1

        ; Reload eax with iOoW for the next pass.
        mov     eax, [ebp+RASTSPAN_iOoW]
        jmp     SpecW1Loop1
        ;}
        ;}
        ;else
        ;{
DontDoSpecialW1:
        ; Everything should be positive in the Non-SpecialW case, so the
        ; unsigned multiply is used.

        ;INT32 iWn1;
        mov     esi, (1 SHL 16)
        mov     edi, edx                        ; edi = iWn0

        ; This should be close to one so low bits are most important.
        ;iWn1 = (iOoW*iWn0)>>15;                // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
        mul     edx

        ;iWn1 = (1L<<16) - iWn1;                // 2.0 - iWn1
        sub     esi, edx

        ;iWn1 <<= 15;                           // 1.16.15 << 15 = 1.1.30
        shl     esi, 15                         ; 0.16.15 << 15 = 0.2.30

        mov     eax, esi
        ;iWn0 = imul32h(iWn1, iWn0)<<2;         // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
        mul     edi                             ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14

        shl     edx, 2                          ; 1.17.14 << 2 = 1.15.16
        ;}

ExitSpecWLoop1:

        ;pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
        ;pS->iW = iWn0;
        mov     [ebp+RASTSPAN_iW], edx
        mov     esi, edx                        ; Save W for multiplying by UoW and VoW
        sub     edx, LastW
        mov     [ebx+RASTCTX_SI+SPANITER_iDW], edx

;pCtx->SI.iSpecialW += 1; // this is supposed to wrap past 0x7fff sometimes
;
; Finish the per-pixel update and loop.
; In:   esi = new pS->iW, mm5 = iVoW1 : iUoW1 (already stepped)
; Clobbers eax, edx, esi, mm5.
        inc     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]

        ;pCtx->SI.iU1 = d_WTimesUVoW(pS->iW,pS->iUoW1);
        ;pCtx->SI.iV1 = d_WTimesUVoW(pS->iW,pS->iVoW1);
        ; Each product is the high 32 bits of (iW << 4) * (i?oW1 << 8).
        shl     esi, 4
        pslld   mm5, 8
        movd    eax, mm5
        psrlq   mm5, 32                         ; bring iVoW1 << 8 down to the low dword
        imul    esi
        mov     [ebx+RASTCTX_SI+SPANITER_iU1], edx
        movd    eax, mm5
        imul    esi
        mov     [ebx+RASTCTX_SI+SPANITER_iV1], edx

        ;//pS->pZ += iZStep;
        ;//pS->pSurface += iSurfaceStep;
        mov     eax, iZStep
        mov     edx, iSurfaceStep
        add     dword ptr [ebp+RASTSPAN_pZ], eax
        add     dword ptr [ebp+RASTSPAN_pSurface], edx

        ;#ifdef DBG
        ;// handy for debug to see where we are
        ;//pS->uX += (INT16)pCtx->SI.iXStep;
        ;#endif
        ;// } // while
        jmp     PixelLoop

ExitPixelLoop:
; Loop code ends

;-----------------------------------------------------------------------------
; LoopAny code ends here
;-----------------------------------------------------------------------------

        ;pS++;
        add     ebp, SIZEOF_RASTSPAN            ; spans are laid out back to back
        ;}
        jmp     SpanLoop

ExitSpanLoop:
        ;pP = pP->pNext;
        mov     ecx, [ecx+RASTPRIM_pNext]
        ;}
        jmp     PrimLoop

ExitPrimLoop:
        ; Leave MMX state before returning to the caller.
        ;_asm{
        emms
        ;}

        ;return S_OK;
        xor     eax, eax
        ;}

        ; Undo the prologue: registers in reverse push order, then the ESP
        ; value saved in StackPos (this also discards the 0Ch bytes of locals).
        pop     edi
        pop     esi
        pop     ebx
        mov     esp, StackPos
        pop     ebp
        ret

END
|