;-----------------------------------------------------------------------------
;
; Monolith 2.  Non-perspective 16 bit Z buffered 565
;
; Globals (ATTENTION.  Need to move all globals to stack.)
;
;   uSpans       - Count containing the number of spans.
;   StackPos     - Saves stack position.
;   uPix         - Pixel count
;   iSurfaceStep - What to add to screen pointer
;   iZStep       - What to add to Z buffer pointer
;
;   (The globals below hold the texture information that used to be
;   reached through esi as the pTex pointer.  esi is now used for the
;   Z buffer pointer.)
;   uMaskU
;   iShiftU
;   iShiftPitch
;   pBits
;
; Register Usage
;
;   ebx - pCtx (rasterizer context)
;   ecx - pP   (current primitive)
;   ebp - pS   (current span)
;   esi - Z buffer pointer
;   edi - Screen buffer pointer
;
;   eax, edx and the MMX registers are temporaries.
;
;
; The only differences between this monolith and the regular
; MMX assembly code are:
;
; 1) Uses LE/GR Z compare code that all other monoliths use.
; 2) All texture info is stored in Globals to free up esi.
; 3) This code does Wrap only for texture addressing
; 4) Since there is no modulation or bi-linear and the source
;    and destination color formats are the same, there is
;    no need to convert to and from the internal color format.
; 5) esi is reserved for the Z buffer.
; 6) edi is reserved for the screen buffer.
; 7) Texture read does not use edi as pBits so that edi can be
;    preserved for screen buffer.
;
;-----------------------------------------------------------------------------

INCLUDE iammx.inc
INCLUDE offs_acp.inc

; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
; at the LSB, then six bits of green, then five bits of red.

; TBD check to see if this value is correct.
COLOR_SHIFT     equ 8

.586
.model flat

; Big separating lines separate the code into span code
; and loop code.  If span and loop are not going to
; end up being combined then it will be easy to
; separate the code.

.data

; Need externs for all of the variables that are needed for various beads
EXTERN IncHighandLow16:MMWORD
EXTERN UFracVFracMask:MMWORD
EXTERN UV32to15Mask:MMWORD
EXTERN Makelow16one:MMWORD
EXTERN MaskKeepUValues:MMWORD
EXTERN MaskKeepVValues:MMWORD
EXTERN UFrac:MMWORD
EXTERN VFrac:MMWORD
EXTERN Zero:MMWORD
EXTERN memD3DTFG_POINT:MMWORD
EXTERN GiveUp:MMWORD
EXTERN LastW:MMWORD
EXTERN Val0x000a000a:MMWORD
EXTERN Val0xffff:MMWORD
EXTERN Val0x0000002000000020:MMWORD
EXTERN Val0x0000ffff0000ffff:MMWORD

EXTERN MaskRed565to888:MMWORD
EXTERN MaskGreen565to888:MMWORD
EXTERN MaskBlue565to888:MMWORD

EXTERN MaskRed555to888:MMWORD
EXTERN MaskGreen555to888:MMWORD
EXTERN MaskBlue555to888:MMWORD

EXTERN MaskAlpha1555to8888:MMWORD
EXTERN MaskRed1555to8888:MMWORD
EXTERN MaskGreen1555to8888:MMWORD
EXTERN MaskBlue1555to8888:MMWORD

; TBD. I think that I want to do 0xffff instead of 0xff.  This will
; have to be checked.  There is a value very similar to this in
; buf write.
EXTERN SetAlphato0xffff:MMWORD
EXTERN SetAlphato0xff:MMWORD

; TODO The equates that follow are identical to the ones in texread.mas.
; Maybe they should be in a common .inc file.
; Left-shift counts used when expanding 16 bit colors to 8 bits per channel.
RedShift565to888        equ 8
GreenShift565to888      equ 5
BlueShift565to888       equ 3

RedShift555to888        equ 9
GreenShift555to888      equ 6
BlueShift555to888       equ 3

AlphaShift1555to8888    equ 16
RedShift1555to8888      equ 9
GreenShift1555to8888    equ 6
BlueShift1555to8888     equ 3

EXTERN Zero:MMWORD
EXTERN DW_One_One:MMWORD
EXTERN MaskOffAlpha:MMWORD
EXTERN ShiftTA:MMWORD
EXTERN Val0x00ff00ff00ff00ff:MMWORD
EXTERN Val0x000000ff00ff00ff:MMWORD
EXTERN Val0X0000000001000000:MMWORD
EXTERN AlphaVal128:MMWORD
EXTERN RGBVal128:MMWORD
EXTERN g_uDitherValue:MMWORD
EXTERN SetAlphato0xff:MMWORD
EXTERN u888to565RedBlueMask:MMWORD
EXTERN u888to565GreenMask:MMWORD
EXTERN u888to565Multiplier:MMWORD
EXTERN uVal0x000007ff03ff07ff:MMWORD
EXTERN uVal0x0000078003c00780:MMWORD
EXTERN u888to555RedBlueMask:MMWORD
EXTERN u888to555GreenMask:MMWORD
EXTERN u888to555Multiplier:MMWORD
EXTERN uVal0x000007ff07ff07ff:MMWORD
EXTERN uVal0x0000078007800780:MMWORD

;-----------------------------------------------------------------------------
; Span Variables
;   Per-span copies of the texture description, kept in memory so that esi
;   is free for the Z buffer pointer, plus the saved stack pointer and the
;   remaining-span counter.
uMaskU          dq ?            ; low dword loaded from SPANTEX_uMaskU, high dword zero
StackPos        dd ?            ; esp right after "push ebp"; restored on exit
uSpans          dd ?            ; spans left in the current primitive
iShiftU         dd ?            ; dword read at SPANTEX_iShiftU
iShiftPitch     dd ?            ; zero-extended word read at SPANTEX_iShiftPitch
pBits           dd ?            ; texture bits pointer (SPANTEX_pBits)
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; Loop Variables
iSurfaceStep    dd ?            ; signed per-pixel step for edi
iZStep          dd ?            ; signed per-pixel step for esi
uPix            dd ?            ; pixels left in the current span
;-----------------------------------------------------------------------------

.code

;-----------------------------------------------------------------------------
; _MMXMLRast_2
;
; Walks every primitive in pCtx->pPrim, every span of each primitive and
; every pixel of each span.  Per pixel it does a 16 bit Z test and, when
; the test passes, writes Z and copies one point-sampled, wrap-addressed
; 16 bit texel to the surface.
;
; In:    [esp+4] = pCtx.  The routine ends with a plain "ret", so the
;        caller removes the argument.
; Out:   eax = 0 (S_OK)
; Saves: ebx, esi, edi, ebp
; Clobb: ecx, edx, mm0-mm5, flags.  "emms" is executed before returning.
;
; Registers while running:
;   ebx = pCtx      ecx = pP (primitive)     ebp = pS (span)
;   esi = Z buffer pointer                   edi = surface pointer
;
; NOTE: all working state lives in module-level variables (including the
; saved stack pointer), so the routine is not reentrant.
;-----------------------------------------------------------------------------
PUBLIC _MMXMLRast_2
_MMXMLRast_2:
        push    ebp
        mov     StackPos, esp           ; remembered so the epilogue can unwind
        mov     eax, esp                ; eax -> saved ebp; [eax+4] = return address
        sub     esp, 0Ch                ; This will need to change if stack frame size changes.

        push    ebx
        push    esi
        push    edi

        ; Put pCtx into ebx
        mov     ebx, [eax+8]

        ;PD3DI_RASTPRIM pP = pCtx->pPrim;
        mov     ecx, [ebx+RASTCTX_pPrim]

        ;while (pP)
        ;{
PrimLoop:
        test    ecx, ecx                ; NULL primitive pointer ends the list
        jz      ExitPrimLoop

        ;UINT16 uSpans = pP->uSpans;
        movzx   eax, word ptr [ecx+RASTPRIM_uSpans]
        mov     uSpans, eax

        ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
        mov     ebp, ecx
        add     ebp, SIZEOF_RASTPRIM

SpanLoop:
        ; Post-decrement the span counter: test the value it had on entry.
        mov     edx, uSpans
        mov     eax, edx
        dec     eax
        mov     uSpans, eax
        test    edx, edx
        jle     ExitSpanLoop

        ;pCtx->pfnBegin(pCtx, pP, pS);

;-----------------------------------------------------------------------------
; LoopAny code inserted here.  This is to get rid of an extra
; jump.
;-----------------------------------------------------------------------------

        ; Setup Code begins - get values to iterate
        movzx   eax, word ptr [ebp+RASTSPAN_uPix]
        mov     uPix, eax

        ; Non perspective correct: the iterated U/W, V/W pair is used
        ; directly as U, V after a fixed shift.  The movq store covers the
        ; 8 bytes starting at SPANITER_iU1.
        movq    mm5, [ebp+RASTSPAN_iUoW1]
        psrad   mm5, TEX_TO_FINAL_SHIFT
        movq    [ebx+RASTCTX_SI+SPANITER_iU1], mm5

        mov     dword ptr [ebx+RASTCTX_SI+SPANITER_iDW], 0
        mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], 0

        ; Choose the stepping direction.  AND sets ZF itself, so the branch
        ; follows it directly.
        mov     eax, [ecx+RASTPRIM_uFlags]
        and     eax, D3DI_RASTPRIM_X_DEC
        jz      LeftToRightSpan

        ; X decreasing: negate both steps.
        mov     eax, [ebx+RASTCTX_iZStep]
        neg     eax
        mov     iZStep, eax
        mov     eax, [ebx+RASTCTX_iSurfaceStep]
        neg     eax
        mov     iSurfaceStep, eax
        jmp     DoneSpanDirif

LeftToRightSpan:
        mov     eax, [ebx+RASTCTX_iZStep]
        mov     iZStep, eax
        mov     eax, [ebx+RASTCTX_iSurfaceStep]
        mov     iSurfaceStep, eax

DoneSpanDirif:

        ;******************************************
        ; Extra Globals are used here.
        ; esi is the texture pointer only for this copy; it becomes the
        ; Z buffer pointer at the end of the block.
        mov     esi, [ebx+RASTCTX_pTexture]
        mov     edx, [esi + SPANTEX_iShiftU]
        mov     iShiftU, edx
        movzx   edx, word ptr [esi + SPANTEX_iShiftPitch]
        mov     iShiftPitch, edx

        movd    mm0, dword ptr [esi+SPANTEX_uMaskU]     ; Load U and V mask
        movq    MMWORD PTR uMaskU, mm0                  ; movd cleared the upper dword

        mov     edx, [esi+SPANTEX_pBits]
        mov     pBits, edx

        mov     edi, [ebp+RASTSPAN_pSurface]
        mov     esi, [ebp+RASTSPAN_pZ]
        ;******************************************

PixelLoop:

        ; Ztestcode
        ; edx is uZ
        ; eax is uZB

        ; 16 bit unsigned format
        ;UINT16 uZ  = (UINT16)(pS->uZ>>15);
        ;UINT16 uZB = *((UINT16*)pS->pZ);
        mov     edx, [ebp+RASTSPAN_uZ]
        movd    mm4, edx                ; keep the unshifted Z for the increment
        shr     edx, 15
        movzx   eax, word ptr [esi]

        ;pS->uZ += pP->iDZDX;
        ;if ((pCtx->iZXorMask)^(uZ > uZB))
        sub     eax, edx                ; eax = uZB - uZ
        paddd   mm4, [ecx+RASTPRIM_iDZDX]
        movd    [ebp+RASTSPAN_uZ], mm4  ; Z is stepped whether or not the test passes

        ; The XOR mask selects the sense of the compare; XOR sets SF itself,
        ; so the branch follows it directly.  A negative result fails.
        xor     eax, [ebx+RASTCTX_iZXorMask]
        js      FailLabel

        mov     word ptr [esi], dx      ; Z test passed: store the new 16 bit Z

        ; texturecode

        ; Doing UV calculation a little more accurate
        ; Exactly like C code.
        ; Shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
        ; (TEX_FINAL_SHIFT - iShiftU0 - 6).  iShiftU0 = pTex->iShiftU - iLOD0
        ;   (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
        ;   (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)

        ; COMMENT1**
        ; If textures have a max of 1024 then shiftU0 would be at most 10 which would
        ; make (TEX_FINAL_SHIFT - iShiftU - 6) at most zero.  This is why I choose 6.
        ; It will also give bi-linear 6 bits of precision; I think it was said that
        ; only five was needed.

        ; NOTE(review): the shift counts and the pitch multiplier computed
        ; below depend only on per-span values (iShiftU, iShiftPitch) but
        ; are recomputed for every pixel.

        ;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
        ;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
        movq    mm5, MMWORD PTR Val0x000a000a   ; This is TEX_FINAL_SHIFT - 6 = 10.
        movd    mm4, iShiftU            ; low word = U shift; high word is used as the
                                        ; V shift (presumably iShiftV follows iShiftU
                                        ; in the texture structure - confirm)
        psubw   mm5, mm4                ; per-word: 10 - shift
        movq    mm4, mm5
        pand    mm5, MMWORD PTR Val0xffff       ; mm5 = shift count for U
        psrld   mm4, 16                         ; mm4 = shift count for V

        movd    mm1, [ebx+RASTCTX_SI+SPANITER_iU1]
        psrad   mm1, mm5
        movd    mm2, [ebx+RASTCTX_SI+SPANITER_iV1]
        psrad   mm2, mm4
        punpckldq mm1, mm2              ; mm1 = V:U as two dwords

        ; Texture Pitch cannot be calculated so it must be looked up in the
        ; iShiftPitch table.
        mov     edx, iShiftPitch
        add     edx, 16
        movd    mm2, edx
        movq    mm5, MMWORD ptr Makelow16one
        pslld   mm5, mm2                ; move each dword's value up by iShiftPitch + 16
        por     mm5, MMWORD ptr Makelow16one    ; Make the low 16 bits of dword one
                                                ; This helps in calculating texture address.

        ; Gets U and V value into mm1 so that it can be mirrored, wrapped or
        ; clamped.  This can be done for two values in the point case
        ; or four values in the bilinear case.
        ;iU00 >>= 6;
        ;iV00 >>= 6;
        psrad   mm1, 6
        packssdw mm1, mm1               ; Value needs to be packed since all wrap/mirror
                                        ; operations assume UV in low 32 bits.

        ;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0;
        ;UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
        ; General purpose wrap/mirror code is replaced with specific wrap
        ; code: a single AND with the saved mask.
        pand    mm1, MMWORD PTR uMaskU

        movq    mm4, mm1

        ; Making the other two cases for texture addressing has to be simpler
        ; than this and not use so many registers.
        ; TBD Make this better.
        pmaddwd mm4, mm5                ; low dword = U*(low word of mm5) + V*(next word):
                                        ; the texel index, given the multiplier built above

        ; Scale the texel index by two bytes per texel and add the base.
        movd    eax, mm4
        shl     eax, 1
        add     eax, pBits

        ;pCtx->SI.TexCol[0] = pCtx->pfnTexRead(iU00, iV00, pTex->iShiftU,
        ;                                      pTex->pBits[iLOD0], &pCtx->Texture[0]);
        mov     dx, word ptr [eax]

        ; Write Texture.
        mov     [edi], dx

FailLabel:
        dec     uPix
        jle     ExitPixelLoop

        ; Doing update code after span length test so that an extra update is not done.
        movq    mm5, [ebp+RASTSPAN_iUoW1]
        paddd   mm5, [ecx+RASTPRIM_iDUoW1DX]
        movq    [ebp+RASTSPAN_iUoW1], mm5

        ; mm5 still contains iUoW and iVoW which are the iU and iV values for
        ; non perspective correct.
        psrad   mm5, TEX_TO_FINAL_SHIFT
        movq    [ebx+RASTCTX_SI+SPANITER_iU1], mm5

        add     esi, iZStep
        add     edi, iSurfaceStep
        jmp     PixelLoop

ExitPixelLoop:
        ; Loop code ends

;-----------------------------------------------------------------------------
; LoopAny code ends here
;-----------------------------------------------------------------------------

        ;pS++;
        add     ebp, SIZEOF_RASTSPAN
        ;}
        jmp     SpanLoop

ExitSpanLoop:
        ;pP = pP->pNext;
        mov     ecx, [ecx+RASTPRIM_pNext]
        ;}
        jmp     PrimLoop

ExitPrimLoop:
        ;_asm{
        emms
        ;}

        ;return S_OK;
        xor     eax, eax
        ;}
        pop     edi
        pop     esi
        pop     ebx
        mov     esp, StackPos           ; drops the 0Ch bytes reserved in the prologue
        pop     ebp
        ret

END