Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

708 lines
23 KiB

;-----------------------------------------------------------------------------
;
; Monolith 8. Gouraud No Z buffer 565
;
; Globals(ATTENTION Darn multiprocessing.)
;
; StackPos - saves stack position
; uSpans - number of spans
;
;
; This monolith tries to process 4 colors at once.
; This allows writing of qwords. Since it writes
; qwords, it is most beneficial if they are aligned.
; The beginingpixels loop writes pixels until the screen
; memory is aligned. Then, if there are at least four pixels left, the
; deltacolor values are checked to make sure that they don't overflow
; when they get multiplied by four. Next, color,
; color+deltacolor, color+2*deltacolor, and color+3*deltacolor
; are calculated (Generate four starting pixels). To make color
; conversion easier, the red, green and blue are all separated
; into their own registers (Seperate colors). Deltas for each component
; are also separated (Seperate delta colors). Since all of the
; components are separated, three additions are needed to
; update the color.
;
; This allows 565 pixels to be calculated just like it would
; be done in C except it will generate 4 at a time.
;
; int16 red, green, blue
; color = (red & 0xf800) | ((green >> 5) & 0x07e0) | (blue >> 11);
;
; So in 16 instructions, four pixels are written to the screen,
; the color is updated, four pixels are converted from internal
; to 565 for next pass and dest and count are updated.
;
; Register Usage for FourPixelLoop
; edi - Dest screen pointer
; mm3 - result of four consecutive color converted 565 colors
; mm4 - four consecutive red values
; mm5 - four consecutive green values
; mm6 - four consecutive blue values
; mm0 - four delta red values (4 times actual delta red)
; mm1 - four delta green values (4 times actual delta green)
; mm2 - four delta blue values (4 times actual delta blue)
; mm7 - temp used to convert to 565
;
; This is the FourPixelLoop
;
; movq MMWORD PTR [edi], mm3 ; Write four 565 pixels at once.
; sub edi, 8
;
; paddw mm5, mm1 ; 4 greens plus 4 delta greens
; paddw mm4, mm0 ; 4 reds plus 4 delta reds
;
; movq mm3, mm5 ; Copy green
; paddw mm6, mm2 ; 4 blues plus 4 delta blues
;
; psrlw mm3, 5 ; Shift green to correct location
; movq mm7, mm4 ; Copy Red
;
; pand mm3, MMWORD PTR Val0x07E007E007E007E0 ; Mask off green 6 bits.
; pand mm7, MMWORD PTR Val0xf800f800f800f800 ; Mask off reds 5 upper bits
;
; por mm7, mm3 ; Combine red and green
; movq mm3, mm6 ; Copy Blue
;
; psrlw mm3, 11 ; Move blues 5 upper bits to the bottom.
; por mm3, mm7
;
; dec edx
; jnz FourPixelLoopRtoL
;
; If there are any pixels to write after the four pixel loop,
; they have already been calculated and eax will determine
; what needs to be written by its two lower bits.
;
; WriteIndividualPixels is called when there are less than
; four pixels to write after alignment or the deltacolor would
; have had an overflow.
;
;-----------------------------------------------------------------------------
INCLUDE iammx.inc
INCLUDE offs_acp.inc
; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
; at the LSB, then six bits of green, then five bits of red.
;TBD check to see if this value is correct.
COLOR_SHIFT equ 8
.586
.model flat
; Big separating lines separate code into span code
; and loop code. If span and loop are not going to
; end up being combined then it will be easy to
; separate the code.
.data
; Constant qword masks used by the MMX span code.

; Top three bits of words 0..2; word 3 is left clear so it never matches.
Val0xe000e000e000       dq      0000E000E000E000h

; Top five bits of each of the four words.
Val0xf800f800f800f800   dq      0F800F800F800F800h

; Bits 5..10 of each of the four words.
Val0x07e007e007e007e0   dq      07E007E007E007E0h
; Need externs for all of the variables that are needed for various beads
EXTERN MaskRed565to888:MMWORD
EXTERN MaskGreen565to888:MMWORD
EXTERN MaskBlue565to888:MMWORD
EXTERN MaskRed555to888:MMWORD
EXTERN MaskGreen555to888:MMWORD
EXTERN MaskBlue555to888:MMWORD
EXTERN MaskAlpha1555to8888:MMWORD
EXTERN MaskRed1555to8888:MMWORD
EXTERN MaskGreen1555to8888:MMWORD
EXTERN MaskBlue1555to8888:MMWORD
; TBD. I think that I want to do 0xffff instead of 0xff. This will
; have to be checked. There is a value very similiar to this in
; buf write.
EXTERN SetAlphato0xffff:MMWORD
EXTERN SetAlphato0xff:MMWORD
; TODO These equates are identical to the ones in texread.mas. Maybe they should be in a common .inc file.
RedShift565to888 equ 8
GreenShift565to888 equ 5
BlueShift565to888 equ 3
RedShift555to888 equ 9
GreenShift555to888 equ 6
BlueShift555to888 equ 3
AlphaShift1555to8888 equ 16
RedShift1555to8888 equ 9
GreenShift1555to8888 equ 6
BlueShift1555to8888 equ 3
EXTERN Zero:MMWORD
EXTERN SetAlphato0xff:MMWORD
EXTERN u888to565RedBlueMask:MMWORD
EXTERN u888to565GreenMask:MMWORD
EXTERN u888to565Multiplier:MMWORD
EXTERN uVal0x000007ff03ff07ff:MMWORD
EXTERN uVal0x0000078003c00780:MMWORD
EXTERN u888to555RedBlueMask:MMWORD
EXTERN u888to555GreenMask:MMWORD
EXTERN u888to555Multiplier:MMWORD
EXTERN uVal0x000007ff07ff07ff:MMWORD
EXTERN uVal0x0000078007800780:MMWORD
;-----------------------------------------------------------------------------
; Span Variables
; Both are single uninitialized dwords in the data segment, so there is
; exactly one copy of each no matter how many callers are active (this is
; the "Darn multiprocessing" caveat in the file header).
StackPos dd ? ; slot for a saved stack position
uSpans dd ? ; slot for a span count
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; Loop Variables
;-----------------------------------------------------------------------------
.code
PUBLIC _MMXMLRast_8
;-----------------------------------------------------------------------------
; _MMXMLRast_8
; Entry, primitive loop head and span loop head.
;
; In:  one dword stack argument, the context pointer (pCtx), read from
;      [entry esp + 4].
;
; Register roles set up here for the span code that follows:
;   ecx - current primitive (pP)
;   ebp - current span (pS); ebp is NOT a frame pointer in this routine
;   edi - destination pointer loaded from the span (pS->pSurface)
;   eax - pixel count loaded from the span (pS->uPix), zero extended
;
; Stack layout after the prologue (esp is not moved again until exit):
;   [esp+00h] saved edi   [esp+04h] saved esi   [esp+08h] saved ebx
;   [esp+0Ch] spans still to process for the current primitive
;   [esp+10h], [esp+14h] unused
;   [esp+18h] saved ebp   [esp+1Ch] return address   [esp+20h] pCtx
;
; The span counter is kept in the frame rather than in a data-segment
; variable so that concurrent callers cannot overwrite each other's count.
;-----------------------------------------------------------------------------
_MMXMLRast_8:
push ebp
mov StackPos, esp ; record the stack position for code that reloads esp from it
mov eax, esp ; eax -> saved ebp; the argument is at [eax+8]
sub esp, 0Ch ; This will need to change if stack frame size changes.
push ebx
push esi
push edi
; Put pCtx into ebx
mov ebx, [eax+8]
;PD3DI_RASTPRIM pP = pCtx->pPrim;
mov ecx, [ebx+RASTCTX_pPrim]
;while (pP)
;{
PrimLoop:
test ecx, ecx
jz ExitPrimLoop
;UINT16 uSpans = pP->uSpans;
movzx eax, word ptr [ecx+RASTPRIM_uSpans]
mov dword ptr [esp+0Ch], eax
;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
lea ebp, [ecx+SIZEOF_RASTPRIM]
;while (uSpans-- > 0)
;{
SpanLoop:
mov edx, dword ptr [esp+0Ch] ; edx = count before the decrement
lea eax, [edx-1] ; lea leaves the flags alone; the test below decides
mov dword ptr [esp+0Ch], eax
test edx, edx
jle ExitSpanLoop
mov edi, dword ptr [ebp+RASTSPAN_pSurface]
movzx eax, word ptr [ebp+RASTSPAN_uPix]
;if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
;{
mov edx, [ecx+RASTPRIM_uFlags]
and edx, D3DI_RASTPRIM_X_DEC ; ZF is set by the and itself
jz LeftToRightSpan
; RIGHT TO LEFT CASE
; The span is written starting at edi and moving toward lower addresses.
;
; Register roles in this section:
;   eax - pixels still to write
;   ecx - primitive; [ecx+RASTPRIM_iDBDX] is the per-pixel color delta qword
;   edi - address of the next pixel to write (see the four pixel loop for
;         the one place where it means something else)
;   edx, esi - scratch
;   mm0 - current color as four 16-bit words in 8.8 format: blue in word 0,
;         green in word 1, red in word 2 (word 3 is carried along unused)
movq mm0, [ebp+RASTSPAN_uB]
; This case is unnecessary if 16 bit color is always word aligned.
; It really should be, but it doesn't hurt to be safe for a 2 instruction
; penalty.
test edi, 1
jnz WriteIndividualPixelsRtoL
beginingpixelsRtoL:
; Four pixels ending at edi occupy [edi-6, edi+1], so a qword store for them
; is 8-byte aligned exactly when (edi and 7) == 6. Write single pixels until
; that holds.
mov edx, edi
and edx, 7
cmp edx, 6
je SetupFourPixelLoopRtoL
;WritePixel
movq mm5, mm0
psrlw mm5, 8 ; Convert color from 8.8 to 0.8
packuswb mm5, mm5 ; Same four bytes end up in the high and low dwords.
movq mm3, mm5
pand mm5, MMWORD PTR u888to565RedBlueMask
pmaddwd mm5, MMWORD PTR u888to565Multiplier
pand mm3, MMWORD PTR u888to565GreenMask
por mm5, mm3
psrld mm5, 5
movd edx, mm5
mov [edi], dx
sub edi, 2 ; Step the destination one pixel to the left
dec eax ; Reduce Pixel count
jz NoMorePixelsRtoL
;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
paddw mm0, [ecx+RASTPRIM_iDBDX]
jmp beginingpixelsRtoL
SetupFourPixelLoopRtoL:
; Only go through trouble of setting up four pixels if we have four pixels.
; Check to see if there are four pixels left over after aligning pixels.
mov edx, eax
shr edx, 2 ; edx = number of whole groups of four
jz WriteIndividualPixelsRtoL
; !!! THIS EXTRA SETUP CODE PREVENTS A BUG THAT WOULD HAPPEN VERY SELDOM !!!
; Need to check for potential delta overflow.
; For example, if there are 5 pixels that change from 255 to 0
; then the delta would be -255/(5-1) = -63. Since we are doing at least 4 pixels,
; then we have delta*4 = -63*4 = -252 which doesn't fit in 8 signed bits.
movq mm1, [ecx+RASTPRIM_iDBDX]
movq mm2, mm1
psraw mm1, 15 ; Sign mask per word, for a conditional negate (absolute value).
pxor mm2, mm1
psubw mm2, mm1 ; mm2 = |delta| per word, 0..8000h
pand mm2, MMWORD PTR Val0xe000e000e000 ; Keep only the high three bits of B, G and R
psrlw mm2, 8 ; Move them into the low byte: packuswb reads its source as
; signed words and would turn a masked 8000h into 0, hiding the overflow.
packuswb mm2, mm2 ; One byte per word, nonzero iff a high bit was set.
movd esi, mm2
test esi, esi
jnz WriteIndividualPixelsRtoL
; Every path that falls back to WriteIndividualPixelsRtoL has now been taken
; or passed, so it is safe to repoint edi. From here until NoMorePixelsRtoL
; edi is the LEFT-MOST address of the current group of four.
; (edi and 7) == 6 here, so this lands on an 8-byte boundary.
sub edi, 6
; -----------------------------------
; Generate four starting color pixels
; -----------------------------------
; color + 0*delta is already in mm0.
movq mm1, mm0
paddw mm1, [ecx+RASTPRIM_iDBDX] ; Put color + 1*delta in mm1
movq mm2, mm1
paddw mm2, [ecx+RASTPRIM_iDBDX] ; Put color + 2*delta in mm2
movq mm3, mm2
paddw mm3, [ecx+RASTPRIM_iDBDX] ; Put color + 3*delta in mm3
; -----------------------------------
; Seperate colors.
; -----------------------------------
; Word 0 of each result is pixel 3 (left-most, lowest address) and word 3 is
; pixel 0 (right-most), the reverse of the left-to-right case.
; Combine all reds into one mmx reg.
movq mm4, mm3
punpckhwd mm4, mm2
movq mm5, mm1
punpckhwd mm5, mm0
punpckldq mm4, mm5
; Combine all greens into one mmx reg.
movq mm5, mm3
punpcklwd mm5, mm2
movq mm6, mm1
punpcklwd mm6, mm0
punpckhdq mm5, mm6
; Combine all blues into one mmx reg.
movq mm6, mm3
punpcklwd mm6, mm2
movq mm7, mm1
punpcklwd mm7, mm0
punpckldq mm6, mm7
; -----------------------------------
; Seperate delta colors.
; -----------------------------------
; If extra registers are needed, (i.e. Z buffer or dither) then deltas can be saved to memory. Three more regs.
movq mm7, [ecx+RASTPRIM_iDBDX]
psllw mm7, 2 ; Doing 4 pixels at a time so delta must be times 4.
; Combine all delta reds into one mmx reg.
movq mm0, mm7
punpckhwd mm0, mm7
movq mm1, mm7
punpckhwd mm1, mm7
punpckldq mm0, mm1
; Combine all delta greens into one mmx reg.
movq mm1, mm7
punpcklwd mm1, mm7
movq mm2, mm7
punpcklwd mm2, mm7
punpckhdq mm1, mm2
; Combine all delta blues into one mmx reg.
movq mm2, mm7
punpcklwd mm2, mm7
movq mm3, mm7
punpcklwd mm3, mm7
punpckldq mm2, mm3
; Need to combine for first pixel write.
movq mm3, mm5 ; Copy green
psrlw mm3, 5 ; Shift green to correct location
movq mm7, mm4 ; Copy Red
pand mm3, MMWORD PTR Val0x07e007e007e007e0 ; Mask off green 6 bits.
pand mm7, MMWORD PTR Val0xf800f800f800f800 ; Mask off reds 5 upper bits
por mm7, mm3
movq mm3, mm6
psrlw mm3, 11 ; Move blues 5 upper bits to the bottom.
por mm3, mm7
FourPixelLoopRtoL:
; Each pass stores the four pixels converted on the previous pass, then
; advances the colors and converts the next four into mm3.
movq MMWORD PTR [edi], mm3 ; Write four 565 pixels at once.
sub edi, 8
paddw mm5, mm1 ; 4 greens plus 4 delta greens
paddw mm4, mm0 ; 4 reds plus 4 delta reds
movq mm3, mm5 ; Copy green
paddw mm6, mm2 ; 4 blues plus 4 delta blues
psrlw mm3, 5 ; Shift green to correct location
movq mm7, mm4 ; Copy Red
pand mm3, MMWORD PTR Val0x07e007e007e007e0 ; Mask off green 6 bits.
pand mm7, MMWORD PTR Val0xf800f800f800f800 ; Mask off reds 5 upper bits
por mm7, mm3 ; Combine red and green
movq mm3, mm6 ; Copy Blue
psrlw mm3, 11 ; Move blues 5 upper bits to the bottom.
por mm3, mm7
dec edx
jnz FourPixelLoopRtoL
LastPixelsRtoL:
; The low two bits of eax say how many pixels (0..3) remain. They are already
; converted and sitting in mm3: word 3 is the next pixel to draw (address
; edi+6), word 2 the one after it (edi+4), word 1 the third (edi+2).
test eax, 2
jz OnePixelLeftRtoL
movq mm1, mm3
psrlq mm1, 32 ; words 2 and 3 -> low dword
movd MMWORD PTR [edi+4], mm1 ; two pixels at edi+4 and edi+6
sub edi, 4
psllq mm3, 32 ; the third pixel (word 1) becomes word 3 for the code below
OnePixelLeftRtoL:
test eax, 1
jz NoMorePixelsRtoL
psrlq mm3, 48 ; word 3 -> word 0
movd edx, mm3
mov word ptr [edi+6], dx
jmp NoMorePixelsRtoL
WriteIndividualPixelsRtoL:
; Fallback: one pixel per pass. Used for odd destination addresses, for
; fewer than four pixels after alignment, and when delta*4 would overflow.
movq mm5, mm0
psrlw mm5, 8 ; Convert color from 8.8 to 0.8
packuswb mm5, mm5 ; Same four bytes end up in the high and low dwords.
movq mm3, mm5
pand mm5, MMWORD PTR u888to565RedBlueMask
pmaddwd mm5, MMWORD PTR u888to565Multiplier
pand mm3, MMWORD PTR u888to565GreenMask
por mm5, mm3
psrld mm5, 5
movd edx, mm5
mov [edi], dx
sub edi, 2
;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
paddw mm0, [ecx+RASTPRIM_iDBDX]
dec eax
jnz WriteIndividualPixelsRtoL
NoMorePixelsRtoL:
jmp DoneSpanDirif
;else
;{
LeftToRightSpan:
; NORMAL LEFT TO RIGHT CASE
; The span is written starting at edi and moving toward higher addresses.
;
; Register roles in this section:
;   eax - pixels still to write
;   ecx - primitive; [ecx+RASTPRIM_iDBDX] is the per-pixel color delta qword
;   edi - address of the next pixel to write
;   edx, esi - scratch
;   mm0 - current color as four 16-bit words in 8.8 format: blue in word 0,
;         green in word 1, red in word 2 (word 3 is carried along unused)
movq mm0, [ebp+RASTSPAN_uB]
; This case is unnecessary if 16 bit color is always word aligned.
; It really should be, but it doesn't hurt to be safe for a 2 instruction
; penalty.
test edi, 1
jnz WriteIndividualPixelsLtoR
beginingpixelsLtoR:
test edi, 7 ; Is the destination 8-byte (four pixel) aligned yet?
jz SetupFourPixelLoopLtoR
;WritePixel
movq mm5, mm0
psrlw mm5, 8 ; Convert color from 8.8 to 0.8
packuswb mm5, mm5 ; Same four bytes end up in the high and low dwords.
movq mm3, mm5
pand mm5, MMWORD PTR u888to565RedBlueMask
pmaddwd mm5, MMWORD PTR u888to565Multiplier
pand mm3, MMWORD PTR u888to565GreenMask
por mm5, mm3
psrld mm5, 5
movd edx, mm5
mov [edi], dx
add edi, 2 ; Increase destination pointer
dec eax ; Reduce Pixel count
jz NoMorePixelsLtoR
;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
paddw mm0, [ecx+RASTPRIM_iDBDX]
jmp beginingpixelsLtoR
SetupFourPixelLoopLtoR:
; Only go through trouble of setting up four pixels if we have four pixels.
mov edx, eax
shr edx, 2 ; edx = number of whole groups of four
jz WriteIndividualPixelsLtoR
; !!! THIS EXTRA SETUP CODE PREVENTS A BUG THAT WOULD HAPPEN VERY SELDOM !!!
; Need to check for potential delta overflow.
; For example, if there are 5 pixels that change from 255 to 0
; then the delta would be -255/(5-1) = -63. Since we are doing at least 4 pixels,
; then we have delta*4 = -63*4 = -252 which doesn't fit in 8 signed bits.
movq mm1, [ecx+RASTPRIM_iDBDX]
movq mm2, mm1
psraw mm1, 15 ; Sign mask per word, for a conditional negate (absolute value).
pxor mm2, mm1
psubw mm2, mm1 ; mm2 = |delta| per word, 0..8000h
pand mm2, MMWORD PTR Val0xe000e000e000 ; Keep only the high three bits of B, G and R
psrlw mm2, 8 ; Move them into the low byte: packuswb reads its source as
; signed words and would turn a masked 8000h into 0, hiding the overflow.
packuswb mm2, mm2 ; One byte per word, nonzero iff a high bit was set.
movd esi, mm2
test esi, esi
jnz WriteIndividualPixelsLtoR
; -----------------------------------
; Generate four starting color pixels
; -----------------------------------
; color + 0*delta is already in mm0.
movq mm1, mm0
paddw mm1, [ecx+RASTPRIM_iDBDX] ; Put color + 1*delta in mm1
movq mm2, mm1
paddw mm2, [ecx+RASTPRIM_iDBDX] ; Put color + 2*delta in mm2
movq mm3, mm2
paddw mm3, [ecx+RASTPRIM_iDBDX] ; Put color + 3*delta in mm3
; -----------------------------------
; Seperate colors.
; -----------------------------------
; Word 0 of each result is pixel 0 (lowest address), word 3 is pixel 3.
; Combine all reds into one mmx reg.
movq mm4, mm0
punpckhwd mm4, mm1
movq mm5, mm2
punpckhwd mm5, mm3
punpckldq mm4, mm5
; Combine all greens into one mmx reg.
movq mm5, mm0
punpcklwd mm5, mm1
movq mm6, mm2
punpcklwd mm6, mm3
punpckhdq mm5, mm6
; Combine all blues into one mmx reg.
movq mm6, mm0
punpcklwd mm6, mm1
movq mm7, mm2
punpcklwd mm7, mm3
punpckldq mm6, mm7
; -----------------------------------
; Seperate delta colors.
; -----------------------------------
; If extra registers are needed, (i.e. Z buffer or dither) then deltas can be saved to memory. Three more regs.
movq mm7, [ecx+RASTPRIM_iDBDX]
psllw mm7, 2 ; Doing 4 pixels at a time so delta must be times 4.
; Combine all delta reds into one mmx reg.
movq mm0, mm7
punpckhwd mm0, mm7
movq mm1, mm7
punpckhwd mm1, mm7
punpckldq mm0, mm1
; Combine all delta greens into one mmx reg.
movq mm1, mm7
punpcklwd mm1, mm7
movq mm2, mm7
punpcklwd mm2, mm7
punpckhdq mm1, mm2
; Combine all delta blues into one mmx reg.
movq mm2, mm7
punpcklwd mm2, mm7
movq mm3, mm7
punpcklwd mm3, mm7
punpckldq mm2, mm3
; Need to combine for first pixel write.
movq mm3, mm5 ; Copy green
psrlw mm3, 5 ; Shift green to correct location
movq mm7, mm4 ; Copy Red
pand mm3, MMWORD PTR Val0x07e007e007e007e0 ; Mask off green 6 bits.
pand mm7, MMWORD PTR Val0xf800f800f800f800 ; Mask off reds 5 upper bits
por mm7, mm3
movq mm3, mm6
psrlw mm3, 11 ; Move blues 5 upper bits to the bottom.
por mm3, mm7
FourPixelLoopLtoR:
; Each pass stores the four pixels converted on the previous pass, then
; advances the colors and converts the next four into mm3.
movq MMWORD PTR [edi], mm3 ; Write four 565 pixels at once.
add edi, 8
paddw mm5, mm1 ; 4 greens plus 4 delta greens
paddw mm4, mm0 ; 4 reds plus 4 delta reds
movq mm3, mm5 ; Copy green
paddw mm6, mm2 ; 4 blues plus 4 delta blues
psrlw mm3, 5 ; Shift green to correct location
movq mm7, mm4 ; Copy Red
pand mm3, MMWORD PTR Val0x07e007e007e007e0 ; Mask off green 6 bits.
pand mm7, MMWORD PTR Val0xf800f800f800f800 ; Mask off reds 5 upper bits
por mm7, mm3 ; Combine red and green
movq mm3, mm6 ; Copy Blue
psrlw mm3, 11 ; Move blues 5 upper bits to the bottom.
por mm3, mm7
dec edx
jnz FourPixelLoopLtoR
LastPixelsLtoR:
; The low two bits of eax say how many pixels (0..3) remain. They are already
; converted and sitting in mm3, next pixel in word 0.
test eax, 2
jz OnePixelLeftLtoR
movd MMWORD PTR [edi], mm3 ; words 0 and 1: two pixels
add edi, 4
psrlq mm3, 32 ; the third pixel (word 2) becomes word 0 for the code below
OnePixelLeftLtoR:
test eax, 1
jz NoMorePixelsLtoR
movd edx, mm3
mov word ptr [edi], dx
jmp NoMorePixelsLtoR
WriteIndividualPixelsLtoR:
; Fallback: one pixel per pass. Used for odd destination addresses, for
; fewer than four pixels after alignment, and when delta*4 would overflow.
movq mm5, mm0
psrlw mm5, 8 ; Convert color from 8.8 to 0.8
packuswb mm5, mm5 ; Same four bytes end up in the high and low dwords.
movq mm3, mm5
pand mm5, MMWORD PTR u888to565RedBlueMask
pmaddwd mm5, MMWORD PTR u888to565Multiplier
pand mm3, MMWORD PTR u888to565GreenMask
por mm5, mm3
psrld mm5, 5
movd edx, mm5
mov [edi], dx
add edi, 2
;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
paddw mm0, [ecx+RASTPRIM_iDBDX]
dec eax
jnz WriteIndividualPixelsLtoR
NoMorePixelsLtoR:
;}
DoneSpanDirif:
; Both span directions join here once the span's pixels are written.
; Setup Code Ends
; ----------------------------------------------------------------------------------------------------------------
; Loop Code Begins
ExitPixelLoop:
; Loop code ends
;-----------------------------------------------------------------------------
; LoopAny code ends here
;-----------------------------------------------------------------------------
;pS++;
add ebp, SIZEOF_RASTSPAN
;}
jmp SpanLoop
ExitSpanLoop:
;pP = pP->pNext;
mov ecx, [ecx+RASTPRIM_pNext]
;}
jmp PrimLoop
ExitPrimLoop:
; Leave MMX state before returning to the caller.
emms
;return S_OK;
xor eax, eax
;}
pop edi
pop esi
pop ebx
; Drop the area reserved by "sub esp, 0Ch" at entry. This leaves esp exactly
; where it was right after "push ebp", without reading the shared StackPos
; variable, so one caller can never pick up another caller's stack pointer.
; Must match the entry code if the frame size changes.
add esp, 0Ch
pop ebp
ret
END