You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
708 lines
23 KiB
708 lines
23 KiB
;-----------------------------------------------------------------------------
|
|
;
|
|
; Monolith 8. Gouraud No Z buffer 565
|
|
;
|
|
; Globals(ATTENTION Darn multiprocessing.)
|
|
;
|
|
; StackPos - saves stack position
|
|
; uSpans - number of spans
|
|
;
|
|
;
|
|
; This monolith tries to process 4 colors at once.
|
|
; This would allow writing of qwords. Since it writes
|
|
; qwords, then it is most beneficial if they are aligned.
|
|
; The beginingpixels loop writes pixels until the screen
|
|
; memory is aligned. Then if there are four pixels, then the
|
|
; deltacolor values are checked to make sure that they don't overflow
|
|
; when they get multiplied by four. Next, color,
|
|
; color+deltacolor, color+2*deltacolor, and color+3*deltacolor
|
|
; are calculated (Generate four starting pixels). To make color
|
|
; conversion easier, the red, green and blue are all separated
|
|
; into their own registers (Separate colors). Deltas for each component
|
|
; are also separated (Separate delta colors). Since all of the
|
|
; components are separated, three additions will be needed to
|
|
; update the color.
|
|
;
|
|
; This allows 565 pixels to be calculated just like it would
|
|
; be done in C except it will generate 4 at a time.
|
|
;
|
|
; int16 red, green blue
|
|
; color = (red & 0xf800) | ((green >> 5) & 0x07e0) | (blue >> 11);
|
|
;
|
|
; So in 16 instructions, four pixels are written to the screen,
|
|
; the color is updated, four pixels are converted from internal
|
|
; to 565 for next pass and dest and count are updated.
|
|
;
|
|
; Register Usage for FourPixelLoop
|
|
; edi - Dest screen pointer
|
|
; mm3 - result of four consecutive color converted 565 colors
|
|
; mm4 - four consecutive red values
|
|
; mm5 - four consecutive green values
|
|
; mm6 - four consecutive blue values
|
|
; mm0 - four delta red values (4 times actual delta red)
|
|
; mm1 - four delta green values (4 times actual delta green)
|
|
; mm2 - four delta blue values (4 times actual delta blue)
|
|
; mm7 - temp used to convert to 565
|
|
;
|
|
; This is the FourPixelLoop
|
|
;
|
|
; movq MMWORD PTR [edi], mm3 ; Write four 565 pixels at once.
|
|
; sub edi, 8
|
|
;
|
|
; paddw mm5, mm1 ; 4 greens plus 4 delta greens
|
|
; paddw mm4, mm0 ; 4 reds plus 4 delta reds
|
|
;
|
|
; movq mm3, mm5 ; Copy green
|
|
; paddw mm6, mm2 ; 4 blues plus 4 delta blues
|
|
;
|
|
; psrlw mm3, 5 ; Shift green to correct location
|
|
; movq mm7, mm4 ; Copy Red
|
|
;
|
|
; pand mm3, MMWORD PTR Val0x07E007E007E007E0 ; Mask off green 6 bits.
|
|
; pand mm7, MMWORD PTR Val0xf800f800f800f800 ; Mask off reds 5 upper bits
|
|
;
|
|
; por mm7, mm3 ; Combine red and green
|
|
; movq mm3, mm6 ; Copy Blue
|
|
;
|
|
; psrlw mm3, 11 ; Move blues 5 upper bits to the bottom.
|
|
; por mm3, mm7
|
|
;
|
|
; dec edx
|
|
; jnz FourPixelLoopRtoL
|
|
;
|
|
; If there are any pixels to write after the four pixel loop,
|
|
; they have already been calculated and eax will determine
|
|
; what needs to be written by its two lower bits.
|
|
;
|
|
; WriteIndividualPixels is called when there are less than
|
|
; four pixels to write after alignment or the deltacolor would
|
|
; have had an overflow.
|
|
;
|
|
;-----------------------------------------------------------------------------
|
|
|
|
INCLUDE iammx.inc
|
|
INCLUDE offs_acp.inc
|
|
|
|
|
|
; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
|
|
; at the LSB, then six bits of green, then five bits of red.
|
|
|
|
|
|
;TBD check to see if this value is correct.
|
|
COLOR_SHIFT equ 8
|
|
|
|
.586
|
|
.model flat
|
|
|
|
; Big seperating lines seperate code into span code
|
|
; and loop code. If span and loop are not going to
|
|
; end up being combined then it will be easy to
|
|
; seperate the code.
|
|
|
|
.data
|
|
|
|
; Mask used by the delta-overflow test: top three bits of the three low words.
; The highest word is zero, so that lane never takes part in the test
; (presumably the alpha lane, going by the uB/uG/uR/uA order - TODO confirm).
Val0xe000e000e000 dq 00000e000e000e000h
|
|
; Keeps bits 11..15 of each of the four words; applied to the red lanes
; when a 565 pixel is assembled.
Val0xf800f800f800f800 dq 0f800f800f800f800h
|
|
; Keeps bits 5..10 of each of the four words; applied to the green lanes
; after they have been shifted right by 5.
Val0x07e007e007e007e0 dq 007e007e007e007e0h
|
|
|
|
; Need externs for all of the variables that are needed for various beads
|
|
|
|
EXTERN MaskRed565to888:MMWORD
|
|
EXTERN MaskGreen565to888:MMWORD
|
|
EXTERN MaskBlue565to888:MMWORD
|
|
|
|
EXTERN MaskRed555to888:MMWORD
|
|
EXTERN MaskGreen555to888:MMWORD
|
|
EXTERN MaskBlue555to888:MMWORD
|
|
|
|
EXTERN MaskAlpha1555to8888:MMWORD
|
|
EXTERN MaskRed1555to8888:MMWORD
|
|
EXTERN MaskGreen1555to8888:MMWORD
|
|
EXTERN MaskBlue1555to8888:MMWORD
|
|
|
|
; TBD. I think that I want to do 0xffff instead of 0xff. This will
|
|
; have to be checked. There is a value very similiar to this in
|
|
; buf write.
|
|
EXTERN SetAlphato0xffff:MMWORD
|
|
EXTERN SetAlphato0xff:MMWORD
|
|
|
|
; TODO This equate are identical to the ones in texread.mas. Maybe they should be in a common .inc file.
|
|
RedShift565to888 equ 8
|
|
GreenShift565to888 equ 5
|
|
BlueShift565to888 equ 3
|
|
|
|
RedShift555to888 equ 9
|
|
GreenShift555to888 equ 6
|
|
BlueShift555to888 equ 3
|
|
|
|
AlphaShift1555to8888 equ 16
|
|
RedShift1555to8888 equ 9
|
|
GreenShift1555to8888 equ 6
|
|
BlueShift1555to8888 equ 3
|
|
|
|
EXTERN Zero:MMWORD
|
|
|
|
EXTERN SetAlphato0xff:MMWORD
|
|
EXTERN u888to565RedBlueMask:MMWORD
|
|
EXTERN u888to565GreenMask:MMWORD
|
|
EXTERN u888to565Multiplier:MMWORD
|
|
EXTERN uVal0x000007ff03ff07ff:MMWORD
|
|
EXTERN uVal0x0000078003c00780:MMWORD
|
|
EXTERN u888to555RedBlueMask:MMWORD
|
|
EXTERN u888to555GreenMask:MMWORD
|
|
EXTERN u888to555Multiplier:MMWORD
|
|
EXTERN uVal0x000007ff07ff07ff:MMWORD
|
|
EXTERN uVal0x0000078007800780:MMWORD
|
|
|
|
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; Span Variables
|
|
; esp as it was right after the entry `push ebp`; the epilogue reloads esp
; from here before its final `pop ebp`.
StackPos dd ?
|
|
; Spans still to be drawn for the current primitive. SpanLoop reads it,
; stores the decremented value back, and exits when the value read is <= 0.
uSpans dd ?
|
|
;-----------------------------------------------------------------------------
|
|
|
|
;-----------------------------------------------------------------------------
|
|
; Loop Variables
|
|
;-----------------------------------------------------------------------------
|
|
|
|
.code
|
|
|
|
|
|
PUBLIC _MMXMLRast_8

;-----------------------------------------------------------------------------
; _MMXMLRast_8
;
; Entry point.  The single stack argument (found at [entry esp + 4]) is pCtx.
; Every primitive reachable from pCtx->pPrim is visited and each of its spans
; is handed to the pixel code below.
;
; Registers held across the loops:
;   ebx = pCtx
;   ecx = pP, the current primitive (0 terminates the list)
;   ebp = pS, the current span (ebp is NOT a frame pointer in this routine)
;   edi = destination pointer of the current span
;   eax = pixel count of the current span
;
; StackPos and uSpans are plain .data variables, so two simultaneous calls
; would overwrite each other's values.
;-----------------------------------------------------------------------------
_MMXMLRast_8:
        push    ebp
        mov     StackPos, esp                   ; the epilogue restores esp from this
        mov     eax, esp                        ; [eax]=saved ebp, [eax+4]=return, [eax+8]=argument
        sub     esp, 0Ch                        ; must change if the stack frame size changes
        push    ebx
        push    esi
        push    edi

        xor     edi, edi

        mov     ebx, [eax+8]                    ; ebx = pCtx
        mov     ecx, [ebx+RASTCTX_pPrim]        ; pP = pCtx->pPrim

; while (pP) {
PrimLoop:
        test    ecx, ecx
        jz      ExitPrimLoop

        ; uSpans = pP->uSpans (16-bit field, zero extended)
        movzx   eax, word ptr [ecx+RASTPRIM_uSpans]
        mov     uSpans, eax

        ; pS = (PD3DI_RASTSPAN)(pP + 1): the spans sit directly behind the primitive
        lea     ebp, [ecx+SIZEOF_RASTPRIM]

;   while (uSpans-- > 0) {
SpanLoop:
        mov     edx, uSpans
        lea     eax, [edx-1]
        mov     uSpans, eax                     ; post-decrement is stored first ...
        test    edx, edx
        jle     ExitSpanLoop                    ; ... the old value decides (signed test)

        mov     edi, dword ptr [ebp+RASTSPAN_pSurface]  ; edi = first destination pixel
        movzx   eax, word ptr [ebp+RASTSPAN_uPix]       ; eax = pixels in this span

        ; if (pP->uFlags & D3DI_RASTPRIM_X_DEC) -> right-to-left, else left-to-right
        mov     edx, [ecx+RASTPRIM_uFlags]
        and     edx, D3DI_RASTPRIM_X_DEC        ; the AND already sets ZF for the branch
        jz      LeftToRightSpan
|
|
|
|
; RIGHT TO LEFT CASE (labelled "screwed up" by the original author)
;
; edi starts on the right-most pixel and moves toward lower addresses.
; Pixels are emitted one at a time until a whole aligned qword ends at edi,
; or until the span runs out.
        movq    mm0, [ebp+RASTSPAN_uB]          ; mm0 = 8 bytes of span color starting at uB

        ; A destination with bit 0 set can never reach qword alignment in
        ; two-byte steps, so such a span is drawn by the single-pixel loop only.
        test    edi, 1
        jnz     WriteIndividualPixelsRtoL

beginingpixelsRtoL:
        ; (edi xor 6) has its low three bits clear exactly when edi & 7 == 6,
        ; i.e. when edi addresses the last word of an aligned qword.
        xor     edi, 6
        test    edi, 7
        jz      SetupFourPixelLoopRtoL          ; NOTE: edi is still flipped on this path
        xor     edi, 6                          ; not aligned yet: put the pointer back

        ; Convert the current color to one 565 pixel.
        movq    mm5, mm0
        psrlw   mm5, 8                          ; 8.8 fixed point -> integer part per channel
        packuswb mm5, mm5                       ; bytes; low and high dword end up identical
        movq    mm3, mm5
        pand    mm3, MMWORD PTR u888to565GreenMask
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        por     mm5, mm3
        psrld   mm5, 5

        movd    edx, mm5
        mov     [edi], dx                       ; store the pixel
        sub     edi, 2                          ; next pixel is one word to the left

        dec     eax                             ; one pixel fewer to draw
        jz      NoMorePixelsRtoL

        ; Step all four color words by their per-pixel deltas at once
        ; (pS->uB += pP->iDBDX, and likewise for G, R, A).
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        jmp     beginingpixelsRtoL
|
|
|
|
SetupFourPixelLoopRtoL:
; Reached from the alignment loop with edi still XORed with 6.
; Decides whether the four-pixel loop may be used and, if so, prepares:
;   edi = left-most address of the first qword to write
;   edx = number of four-pixel groups
;   mm4/mm5/mm6 = four reds / greens / blues (highest word = first pixel)
;   mm0/mm1/mm2 = 4*delta red / green / blue replicated into all four words
;   mm3 = the first four pixels already converted to 565
; Clobbers esi, mm7.

        xor     edi, 6                          ; undo the alignment probe: edi = right-most pixel

        ; Only go through the setup when at least four pixels remain.
        mov     edx, eax
        shr     edx, 2                          ; edx = whole groups of four
        jz      WriteIndividualPixelsRtoL

        ; Delta overflow guard.  The loop steps colors by 4*delta in 16-bit
        ; lanes, so any |delta| with one of its top three bits set would wrap.
        ; Those spans fall back to the single-pixel loop.
        movq    mm1, [ecx+RASTPRIM_iDBDX]
        movq    mm2, mm1
        psraw   mm1, 15                         ; 0FFFFh for negative words, 0 otherwise
        pxor    mm2, mm1
        psubw   mm2, mm1                        ; mm2 = |delta| per word (08000h stays 08000h)
        pand    mm2, MMWORD PTR Val0xe000e000e000 ; keep the top three bits, highest word ignored
        psrlw   mm2, 13                         ; 0..7 per word; without this packuswb would read
                                                ; 08000h as negative and clamp it to 0
        packuswb mm2, mm2
        movd    esi, mm2
        test    esi, esi
        jnz     WriteIndividualPixelsRtoL       ; edi is still the right-most pixel here

        ; Qwords are written from their lowest address, so only now switch edi
        ; to the left-most word of the aligned qword.  (Doing this before the
        ; overflow test would hand a wrong pointer to the fallback loop.)
        xor     edi, 6

        ; -----------------------------------
        ; Colors of the next four pixels
        ; -----------------------------------
        ; mm0 = color + 0*delta (already there)
        movq    mm1, mm0
        paddw   mm1, [ecx+RASTPRIM_iDBDX]       ; mm1 = color + 1*delta
        movq    mm2, mm1
        paddw   mm2, [ecx+RASTPRIM_iDBDX]       ; mm2 = color + 2*delta
        movq    mm3, mm2
        paddw   mm3, [ecx+RASTPRIM_iDBDX]       ; mm3 = color + 3*delta

        ; -----------------------------------
        ; Transpose: one register per channel.
        ; The fourth pixel goes into the lowest word because it is the
        ; left-most one in memory.
        ; -----------------------------------
        movq    mm4, mm3
        punpckhwd mm4, mm2                      ; high dwords of pixels 3 and 2 interleaved
        movq    mm5, mm1
        punpckhwd mm5, mm0                      ; high dwords of pixels 1 and 0 interleaved
        punpckldq mm4, mm5                      ; mm4 = four reds

        movq    mm5, mm3
        punpcklwd mm5, mm2                      ; green/blue words of pixels 3 and 2
        movq    mm7, mm1
        punpcklwd mm7, mm0                      ; green/blue words of pixels 1 and 0
        movq    mm6, mm5
        punpckhdq mm5, mm7                      ; mm5 = four greens
        punpckldq mm6, mm7                      ; mm6 = four blues

        ; -----------------------------------
        ; Per-channel deltas, times four
        ; -----------------------------------
        ; If extra registers are ever needed (Z buffer, dither) these three
        ; could live in memory instead.
        movq    mm7, [ecx+RASTPRIM_iDBDX]
        psllw   mm7, 2                          ; four pixels per pass -> 4*delta

        movq    mm0, mm7
        punpckhwd mm0, mm7
        punpckldq mm0, mm0                      ; mm0 = 4*delta red in every word

        movq    mm2, mm7
        punpcklwd mm2, mm7                      ; green pair in the high dword, blue pair in the low
        movq    mm1, mm2
        punpckhdq mm1, mm1                      ; mm1 = 4*delta green in every word
        punpckldq mm2, mm2                      ; mm2 = 4*delta blue in every word

        ; -----------------------------------
        ; Convert the first four pixels so the loop can start with its store.
        ; -----------------------------------
        movq    mm3, mm5
        psrlw   mm3, 5                          ; green down to bits 5..10
        movq    mm7, mm4

        pand    mm3, MMWORD PTR Val0x07E007E007E007E0 ; six green bits
        pand    mm7, MMWORD PTR Val0xf800f800f800f800 ; five red bits

        por     mm7, mm3
        movq    mm3, mm6

        psrlw   mm3, 11                         ; five blue bits at the bottom
        por     mm3, mm7                        ; mm3 = four 565 pixels
|
|
|
|
FourPixelLoopRtoL:
; One pass stores four finished pixels, steps the three channel registers by
; 4*delta and converts the next four pixels into mm3.  edx counts the passes.
; The instruction order is the original author's schedule and is kept as is.
        movq    MMWORD PTR [edi], mm3           ; store four 565 pixels
        sub     edi, 8                          ; next qword lies to the left

        paddw   mm5, mm1                        ; greens += 4*delta green
        paddw   mm4, mm0                        ; reds   += 4*delta red

        movq    mm3, mm5                        ; green copy
        paddw   mm6, mm2                        ; blues  += 4*delta blue

        psrlw   mm3, 5                          ; green down to bits 5..10
        movq    mm7, mm4                        ; red copy

        pand    mm3, MMWORD PTR Val0x07E007E007E007E0 ; six green bits
        pand    mm7, MMWORD PTR Val0xf800f800f800f800 ; five red bits

        por     mm7, mm3                        ; red | green
        movq    mm3, mm6                        ; blue copy

        psrlw   mm3, 11                         ; five blue bits at the bottom
        por     mm3, mm7                        ; mm3 = next four pixels

        dec     edx
        jnz     FourPixelLoopRtoL
|
|
|
|
LastPixelsRtoL:
; Zero to three pixels may remain; their count is eax & 3.  They are already
; converted and sit in mm3 with the next pixel to draw in the HIGHEST word.
; edi is the lowest address of the qword those pixels would have filled, so
; the next pixel belongs at edi+6, the one after at edi+4, then edi+2.
        test    eax, 2
        jz      OnePixelLeftRtoL

        movq    mm1, mm3
        psrlq   mm1, 32                         ; the two upper words = next two pixels
        movd    MMWORD PTR [edi+4], mm1         ; written at edi+4 and edi+6
        sub     edi, 4                          ; edi+6 now addresses the slot of the third pixel
        psllq   mm3, 32                         ; move the third pixel into the highest word
                                                ; (the old code shifted mm0 and so repeated pixel 0)

OnePixelLeftRtoL:
        test    eax, 1
        jz      NoMorePixelsRtoL

        psrlq   mm3, 48                         ; highest word -> bits 0..15
        movd    edx, mm3
        mov     word ptr [edi+6], dx
        jmp     NoMorePixelsRtoL
|
|
|
|
WriteIndividualPixelsRtoL:
; Fallback for right-to-left spans: one pixel per pass, eax passes in total.
; Expects mm0 = current color, edi = pixel to write, eax = remaining count.
        movq    mm5, mm0
        psrlw   mm5, 8                          ; 8.8 fixed point -> integer part per channel
        packuswb mm5, mm5                       ; bytes; low and high dword end up identical
        movq    mm3, mm5
        pand    mm3, MMWORD PTR u888to565GreenMask
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        por     mm5, mm3
        psrld   mm5, 5

        movd    edx, mm5
        mov     [edi], dx                       ; store the pixel
        sub     edi, 2                          ; move one word to the left

        ; Step all four color words by their per-pixel deltas at once
        ; (pS->uB += pP->iDBDX, and likewise for G, R, A).
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        dec     eax
        jnz     WriteIndividualPixelsRtoL

NoMorePixelsRtoL:
        jmp     DoneSpanDirif                   ; skip over the left-to-right code
|
|
;else
|
|
;{
|
|
LeftToRightSpan:
; NORMAL LEFT TO RIGHT CASE
;
; edi starts on the left-most pixel and moves toward higher addresses.
; Pixels are emitted one at a time until edi is qword aligned, or until the
; span runs out.
        movq    mm0, [ebp+RASTSPAN_uB]          ; mm0 = 8 bytes of span color starting at uB

        ; A destination with bit 0 set can never reach qword alignment in
        ; two-byte steps, so such a span is drawn by the single-pixel loop only.
        test    edi, 1
        jnz     WriteIndividualPixelsLtoR

beginingpixelsLtoR:
        test    edi, 7                          ; low three bits clear = qword aligned
        jz      SetupFourPixelLoopLtoR

        ; Convert the current color to one 565 pixel.
        movq    mm5, mm0
        psrlw   mm5, 8                          ; 8.8 fixed point -> integer part per channel
        packuswb mm5, mm5                       ; bytes; low and high dword end up identical
        movq    mm3, mm5
        pand    mm3, MMWORD PTR u888to565GreenMask
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        por     mm5, mm3
        psrld   mm5, 5

        movd    edx, mm5
        mov     [edi], dx                       ; store the pixel
        add     edi, 2                          ; next pixel is one word to the right

        dec     eax                             ; one pixel fewer to draw
        jz      NoMorePixelsLtoR

        ; Step all four color words by their per-pixel deltas at once
        ; (pS->uB += pP->iDBDX, and likewise for G, R, A).
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        jmp     beginingpixelsLtoR
|
|
|
|
SetupFourPixelLoopLtoR:
; Reached from the alignment loop with edi qword aligned.
; Decides whether the four-pixel loop may be used and, if so, prepares:
;   edx = number of four-pixel groups
;   mm4/mm5/mm6 = four reds / greens / blues (lowest word = first pixel)
;   mm0/mm1/mm2 = 4*delta red / green / blue replicated into all four words
;   mm3 = the first four pixels already converted to 565
; Clobbers esi, mm7.

        ; Only go through the setup when at least four pixels remain.
        mov     edx, eax
        shr     edx, 2                          ; edx = whole groups of four
        jz      WriteIndividualPixelsLtoR

        ; Delta overflow guard.  The loop steps colors by 4*delta in 16-bit
        ; lanes, so any |delta| with one of its top three bits set would wrap.
        ; Those spans fall back to the single-pixel loop.
        movq    mm1, [ecx+RASTPRIM_iDBDX]
        movq    mm2, mm1
        psraw   mm1, 15                         ; 0FFFFh for negative words, 0 otherwise
        pxor    mm2, mm1
        psubw   mm2, mm1                        ; mm2 = |delta| per word (08000h stays 08000h)
        pand    mm2, MMWORD PTR Val0xe000e000e000 ; keep the top three bits, highest word ignored
        psrlw   mm2, 13                         ; 0..7 per word; without this packuswb would read
                                                ; 08000h as negative and clamp it to 0
        packuswb mm2, mm2
        movd    esi, mm2
        test    esi, esi
        jnz     WriteIndividualPixelsLtoR

        ; -----------------------------------
        ; Colors of the next four pixels
        ; -----------------------------------
        ; mm0 = color + 0*delta (already there)
        movq    mm1, mm0
        paddw   mm1, [ecx+RASTPRIM_iDBDX]       ; mm1 = color + 1*delta
        movq    mm2, mm1
        paddw   mm2, [ecx+RASTPRIM_iDBDX]       ; mm2 = color + 2*delta
        movq    mm3, mm2
        paddw   mm3, [ecx+RASTPRIM_iDBDX]       ; mm3 = color + 3*delta

        ; -----------------------------------
        ; Transpose: one register per channel.
        ; The first pixel goes into the lowest word because it is the
        ; left-most one in memory.
        ; -----------------------------------
        movq    mm4, mm0
        punpckhwd mm4, mm1                      ; high dwords of pixels 0 and 1 interleaved
        movq    mm5, mm2
        punpckhwd mm5, mm3                      ; high dwords of pixels 2 and 3 interleaved
        punpckldq mm4, mm5                      ; mm4 = four reds

        movq    mm5, mm0
        punpcklwd mm5, mm1                      ; green/blue words of pixels 0 and 1
        movq    mm7, mm2
        punpcklwd mm7, mm3                      ; green/blue words of pixels 2 and 3
        movq    mm6, mm5
        punpckhdq mm5, mm7                      ; mm5 = four greens
        punpckldq mm6, mm7                      ; mm6 = four blues

        ; -----------------------------------
        ; Per-channel deltas, times four
        ; -----------------------------------
        ; If extra registers are ever needed (Z buffer, dither) these three
        ; could live in memory instead.
        movq    mm7, [ecx+RASTPRIM_iDBDX]
        psllw   mm7, 2                          ; four pixels per pass -> 4*delta

        movq    mm0, mm7
        punpckhwd mm0, mm7
        punpckldq mm0, mm0                      ; mm0 = 4*delta red in every word

        movq    mm2, mm7
        punpcklwd mm2, mm7                      ; green pair in the high dword, blue pair in the low
        movq    mm1, mm2
        punpckhdq mm1, mm1                      ; mm1 = 4*delta green in every word
        punpckldq mm2, mm2                      ; mm2 = 4*delta blue in every word

        ; -----------------------------------
        ; Convert the first four pixels so the loop can start with its store.
        ; -----------------------------------
        movq    mm3, mm5
        psrlw   mm3, 5                          ; green down to bits 5..10
        movq    mm7, mm4

        pand    mm3, MMWORD PTR Val0x07E007E007E007E0 ; six green bits
        pand    mm7, MMWORD PTR Val0xf800f800f800f800 ; five red bits

        por     mm7, mm3
        movq    mm3, mm6

        psrlw   mm3, 11                         ; five blue bits at the bottom
        por     mm3, mm7                        ; mm3 = four 565 pixels
|
|
|
|
FourPixelLoopLtoR:
; One pass stores four finished pixels, steps the three channel registers by
; 4*delta and converts the next four pixels into mm3.  edx counts the passes.
; The instruction order is the original author's schedule and is kept as is.
        movq    MMWORD PTR [edi], mm3           ; store four 565 pixels
        add     edi, 8                          ; next qword lies to the right

        paddw   mm5, mm1                        ; greens += 4*delta green
        paddw   mm4, mm0                        ; reds   += 4*delta red

        movq    mm3, mm5                        ; green copy
        paddw   mm6, mm2                        ; blues  += 4*delta blue

        psrlw   mm3, 5                          ; green down to bits 5..10
        movq    mm7, mm4                        ; red copy

        pand    mm3, MMWORD PTR Val0x07E007E007E007E0 ; six green bits
        pand    mm7, MMWORD PTR Val0xf800f800f800f800 ; five red bits

        por     mm7, mm3                        ; red | green
        movq    mm3, mm6                        ; blue copy

        psrlw   mm3, 11                         ; five blue bits at the bottom
        por     mm3, mm7                        ; mm3 = next four pixels

        dec     edx
        jnz     FourPixelLoopLtoR
|
|
|
|
LastPixelsLtoR:
; Zero to three pixels may remain; their count is eax & 3.  They are already
; converted and sit in mm3 with the next pixel to draw in the LOWEST word.
        test    eax, 2
        jz      OnePixelLeftLtoR

        movd    MMWORD PTR [edi], mm3           ; two pixels at once
        add     edi, 4
        psrlq   mm3, 32                         ; bring the third pixel down to the lowest word
                                                ; (the old code shifted mm0 and so repeated pixel 0)

OnePixelLeftLtoR:
        test    eax, 1
        jz      NoMorePixelsLtoR

        movd    edx, mm3
        mov     word ptr [edi], dx
        jmp     NoMorePixelsLtoR
|
|
|
|
WriteIndividualPixelsLtoR:
; Fallback for left-to-right spans: one pixel per pass, eax passes in total.
; Expects mm0 = current color, edi = pixel to write, eax = remaining count.
        movq    mm5, mm0
        psrlw   mm5, 8                          ; 8.8 fixed point -> integer part per channel
        packuswb mm5, mm5                       ; bytes; low and high dword end up identical
        movq    mm3, mm5
        pand    mm3, MMWORD PTR u888to565GreenMask
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        por     mm5, mm3
        psrld   mm5, 5

        movd    edx, mm5
        mov     [edi], dx                       ; store the pixel
        add     edi, 2                          ; move one word to the right

        ; Step all four color words by their per-pixel deltas at once
        ; (pS->uB += pP->iDBDX, and likewise for G, R, A).
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        dec     eax
        jnz     WriteIndividualPixelsLtoR
|
|
|
|
NoMorePixelsLtoR:
DoneSpanDirif:
ExitPixelLoop:
; Both span directions meet here once the current span is complete.

        add     ebp, SIZEOF_RASTSPAN            ; pS++
        jmp     SpanLoop                        ; } end of while (uSpans-- > 0)

ExitSpanLoop:
        mov     ecx, [ecx+RASTPRIM_pNext]       ; pP = pP->pNext
        jmp     PrimLoop                        ; } end of while (pP)

ExitPrimLoop:
        emms                                    ; leave MMX state before returning

        xor     eax, eax                        ; return S_OK (0)

        pop     edi
        pop     esi
        pop     ebx
        mov     esp, StackPos                   ; discards the 0Ch bytes reserved at entry
        pop     ebp
        ret
|
|
|
|
END
|