;-----------------------------------------------------------------------------
;
;   Monolith 8. Gouraud No Z buffer 565
;
;   Globals(ATTENTION Darn multiprocessing.)
;
;   StackPos - saves stack position
;   uSpans - number of spans
;
;
;   This monolith tries to process 4 colors at once.
;   This allows writing of qwords.  Since it writes
;   qwords, it is most beneficial if they are aligned.
;   The beginingpixels loop writes single pixels until the screen
;   memory pointer is aligned.  Then, if there are at least four pixels
;   left, the deltacolor values are checked to make sure that they don't
;   overflow when they get multiplied by four.  Next, color,
;   color+deltacolor, color+2*deltacolor, and color+3*deltacolor
;   are calculated (Generate four starting pixels).  To make color
;   conversion easier, the red, green and blue are all separated
;   into their own registers (Separate colors).  Deltas for each component
;   are also separated (Separate delta colors).  Since all of the
;   components are separated, three additions are needed to
;   update the color.
;
;   This allows 565 pixels to be calculated just like it would
;   be done in C except it will generate 4 at a time.
;
;   int16   red, green, blue
;   color = (red & 0xf800) | ((green >> 5) & 0x07e0) | (blue >> 11);
;
;   So in 16 instructions, four pixels are written to the screen,
;   the color is updated, four pixels are converted from internal
;   to 565 for next pass and dest and count are updated.
;
;   Register Usage for FourPixelLoop
;   edi - Dest screen pointer
;   mm3 - result of four consecutive color converted 565 colors
;   mm4 - four consecutive red values
;   mm5 - four consecutive green values
;   mm6 - four consecutive blue values
;   mm0 - four delta red values (4 times actual delta red)
;   mm1 - four delta green values (4 times actual delta green)
;   mm2 - four delta blue values (4 times actual delta blue)
;   mm7 - temp used to convert to 565
;
;   This is the FourPixelLoop
;
;        movq    MMWORD PTR [edi], mm3   ; Write four 565 pixels at once.
;        sub     edi, 8
;
;        paddw   mm5, mm1    ; 4 greens plus 4 delta greens
;        paddw   mm4, mm0    ; 4 reds plus 4 delta reds
;
;        movq    mm3, mm5    ; Copy green
;        paddw   mm6, mm2    ; 4 blues plus 4 delta blues
;
;        psrlw   mm3, 5      ; Shift green to correct location
;        movq    mm7, mm4    ; Copy Red
;
;        pand    mm3, MMWORD PTR Val0x07E007E007E007E0          ; Mask off green 6 bits.
;        pand    mm7, MMWORD PTR Val0xf800f800f800f800          ; Mask off reds 5 upper bits
;
;        por     mm7, mm3    ; Combine red and green
;        movq    mm3, mm6    ; Copy Blue
;
;        psrlw   mm3, 11                          ; Move blues 5 upper bits to the bottom.
;        por     mm3, mm7
;
;        dec     edx
;        jnz     FourPixelLoopRtoL
;
;   If there are any pixels to write after the four pixel loop,
;   they have already been calculated and eax will determine
;   what needs to be written by its two lower bits.
;
;   WriteIndividualPixels is called when there are less than
;   four pixels to write after alignment or the deltacolor would
;   have had an overflow.
;
;-----------------------------------------------------------------------------

INCLUDE iammx.inc
INCLUDE offs_acp.inc


; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
; at the LSB, then six bits of green, then five bits of red.


; TBD: check whether this value is correct.
; NOTE(review): COLOR_SHIFT is not referenced by the code in this file; the
; 8.8 -> 0.8 color conversions below use a literal shift count of 8 instead.
COLOR_SHIFT equ 8

.586
.model flat

; Big seperating lines seperate code into span code
; and loop code.  If span and loop are not going to
; end up being combined then it will be easy to
; seperate the code.

.data

; Per-word masks used by the four-pixel paths.  Each qword holds four 16-bit lanes.

; Top three bits of the three low words (the highest word is left zero).
; ANDed with |delta| in the delta-overflow check before the four-pixel loops.
Val0xe000e000e000       dq  00000e000e000e000h
; Top five bits of every word: selects the red field of a 565 pixel.
Val0xf800f800f800f800   dq  0f800f800f800f800h
; Bits 5-10 of every word: selects the green field of a 565 pixel.
Val0x07e007e007e007e0   dq  007e007e007e007e0h

; Need externs for all of the variables that are needed for various beads

EXTERN MaskRed565to888:MMWORD
EXTERN MaskGreen565to888:MMWORD
EXTERN MaskBlue565to888:MMWORD

EXTERN MaskRed555to888:MMWORD
EXTERN MaskGreen555to888:MMWORD
EXTERN MaskBlue555to888:MMWORD

EXTERN MaskAlpha1555to8888:MMWORD
EXTERN MaskRed1555to8888:MMWORD
EXTERN MaskGreen1555to8888:MMWORD
EXTERN MaskBlue1555to8888:MMWORD

; TBD. I think that I want to do 0xffff instead of 0xff.  This will
; have to be checked.  There is a value very similiar to this in
; buf write.
EXTERN SetAlphato0xffff:MMWORD
EXTERN SetAlphato0xff:MMWORD

; TODO: These equates are identical to the ones in texread.mas.  Maybe they
; should live in a common .inc file.
; NOTE(review): none of these shift counts is referenced by the code in this
; file; they appear to belong to the 565/555/1555 -> 888/8888 texture-read
; conversions named in their identifiers.
RedShift565to888     equ 8
GreenShift565to888   equ 5
BlueShift565to888    equ 3

RedShift555to888     equ 9
GreenShift555to888   equ 6
BlueShift555to888    equ 3

AlphaShift1555to8888 equ 16
RedShift1555to8888   equ 9
GreenShift1555to8888 equ 6
BlueShift1555to8888  equ 3

EXTERN  Zero:MMWORD

EXTERN  SetAlphato0xff:MMWORD
EXTERN  u888to565RedBlueMask:MMWORD
EXTERN  u888to565GreenMask:MMWORD
EXTERN  u888to565Multiplier:MMWORD
EXTERN  uVal0x000007ff03ff07ff:MMWORD
EXTERN  uVal0x0000078003c00780:MMWORD
EXTERN  u888to555RedBlueMask:MMWORD
EXTERN  u888to555GreenMask:MMWORD
EXTERN  u888to555Multiplier:MMWORD
EXTERN  uVal0x000007ff07ff07ff:MMWORD
EXTERN  uVal0x0000078007800780:MMWORD



;-----------------------------------------------------------------------------
; Span Variables
;
; Both variables live in the .data section, so they are shared by every
; caller of _MMXMLRast_8.  Two threads running the routine at once would
; overwrite each other's values (see the "multiprocessing" warning in the
; file header).
;
; StackPos - value of esp captured right after "push ebp" on entry; reloaded
;            into esp just before the final "pop ebp".
; uSpans   - span counter for the primitive currently being processed;
;            loaded from RASTPRIM_uSpans and decremented once per SpanLoop pass.
StackPos    dd  ?
uSpans      dd  ?
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; Loop Variables
;-----------------------------------------------------------------------------

.code


PUBLIC _MMXMLRast_8
;-----------------------------------------------------------------------------
; _MMXMLRast_8 - Gouraud-shaded span rasterizer, 565 destination, no Z buffer.
;
; In:    [esp+4] = pCtx.  The primitive list is read from
;                  [pCtx+RASTCTX_pPrim]; the spans of each primitive start
;                  SIZEOF_RASTPRIM bytes after the primitive header.
; Out:   eax = 0 (the epilogue comments call this S_OK).
; Saves: ebx, esi, edi, ebp (pushed here, restored by the epilogue).
; Uses:  ecx, edx, mm0-mm7; the globals StackPos and uSpans (not reentrant).
;
; Register roles while spans are being processed:
;   ebx = pCtx        ecx = pP (current primitive)    ebp = pS (current span)
;   edi = destination pixel pointer                   eax = pixels left in span
;-----------------------------------------------------------------------------
_MMXMLRast_8:
    push    ebp
    mov     StackPos, esp           ; Epilogue reloads esp from here.
    mov     eax, esp                ; eax+0 = saved ebp, +4 = return address, +8 = first argument.
    sub     esp, 0Ch                ; This will need to change if stack frame size changes.
    push    ebx
    push    esi
    push    edi

    ; Put pCtx into ebx
    mov     ebx, [eax+8]

    ;PD3DI_RASTPRIM pP = pCtx->pPrim;
    mov     ecx, [ebx+RASTCTX_pPrim]

    ;while (pP)
    ;{
PrimLoop:
    test    ecx, ecx
    jz      ExitPrimLoop

    ;UINT16 uSpans = pP->uSpans;
    movzx   eax, word ptr [ecx+RASTPRIM_uSpans]
    mov     uSpans, eax

    ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
    lea     ebp, [ecx+SIZEOF_RASTPRIM]


    ;while (uSpans-- > 0)
    ;{
SpanLoop:
    mov     edx, uSpans             ; edx = count before the decrement (the value tested below)
    mov     eax, edx
    dec     eax
    mov     uSpans, eax
    test    edx, edx
    jle     ExitSpanLoop


        mov     edi, dword ptr [ebp+RASTSPAN_pSurface]
        movzx   eax, word ptr [ebp+RASTSPAN_uPix]

        ; Every pixel path below writes one pixel before it tests the count,
        ; so a zero-length span would write a stray pixel and then wrap the
        ; counter to 0FFFFFFFFh.  Skip such spans outright.
        test    eax, eax
        jz      DoneSpanDirif

    ;if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
    ;{
        test    dword ptr [ecx+RASTPRIM_uFlags], D3DI_RASTPRIM_X_DEC
        jz      LeftToRightSpan

; RIGHT TO LEFT CASE (D3DI_RASTPRIM_X_DEC set): pixels go to descending addresses.
;
; On entry:
;   edi = address of the first pixel to write (loaded from RASTSPAN_pSurface)
;   eax = pixel count of the span; must be non-zero, because every path
;         below writes a pixel before it tests the count
;   ecx = pP (primitive), ebp = pS (span)
; Scratch: edx, esi, mm0-mm7.  Leaves through DoneSpanDirif.
;
; mm0 carries the running span color as four words loaded from RASTSPAN_uB
; onward (blue, green, red, alpha per the pS->uB/uG/uR/uA comments), and
; RASTPRIM_iDBDX supplies the matching four per-pixel deltas.

        movq    mm0, [ebp+RASTSPAN_uB]
        ; This case is unnecessary if 16 bit color is always word aligned.
        ; It really should be, but it doesnt hurt to be safe for a 2 instruction
        ; penalty.
        test    edi, 1
        jnz     WriteIndividualPixelsRtoL

beginingpixelsRtoL:

        ; A qword store covers [base, base+7] and the FIRST pixel of a group
        ; of four is the rightmost one, so the group is aligned when the
        ; current pixel sits in the last word of a qword: (edi & 7) == 6.
        mov     edx, edi
        and     edx, 7
        cmp     edx, 6
        je      SetupFourPixelLoopRtoL

        ;WritePixel
        movq    mm5, mm0
        psrlw   mm5, 8          ; Convert color from 8.8 to 0.8
        packuswb mm5, mm5       ; Pack to bytes; the same dword ends up in both halves.
        movq    mm3, mm5
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        pand    mm3, MMWORD PTR u888to565GreenMask
        por     mm5, mm3
        psrld   mm5, 5          ; 565 pixel is now in the low word.

        movd    edx, mm5
        mov     [edi], dx
        sub     edi, 2          ; Step destination pointer one pixel to the left

        dec     eax             ; Reduce Pixel count
        jz      NoMorePixelsRtoL

        ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
        ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        jmp     beginingpixelsRtoL

SetupFourPixelLoopRtoL:

        ; Only go through trouble of setting up four pixels if we have four pixels.
        ; Check to see if there are four pixels left over after aligning pixels.
        mov     edx, eax
        shr     edx, 2                  ; edx = number of four-pixel groups
        jz      WriteIndividualPixelsRtoL

        ; Delta overflow check: fall back to the one-pixel loop when any of
        ; the three low delta words has |delta| >= 2000h, i.e. when 4*delta
        ; would not fit in a signed word.
        ; edi must still be the true pixel address here, because the fallback
        ; loop writes at [edi]; the qword adjustment is made only afterwards.
        movq    mm1, [ecx+RASTPRIM_iDBDX]
        movq    mm2, mm1
        psraw   mm1, 15     ; Sign mask for a conditional negate (absolute value).
        pxor    mm2, mm1
        psubw   mm2, mm1    ; mm2 = |delta| per word
        pand    mm2, MMWORD PTR Val0xe000e000e000  ; Keep only the high three bits
        packuswb mm2, mm2               ; Positive non-zero words saturate to 0FFh bytes.
        movd    esi, mm2
        test    esi, esi
        jnz     WriteIndividualPixelsRtoL

        sub     edi, 6                  ; edi & 7 == 6 here: point at the start (leftmost pixel) of the qword.

        ; -----------------------------------
        ; Generate four starting color pixels
        ; -----------------------------------
                                                ; color + 0*delta is already in mm0
        movq    mm1, mm0
        paddw   mm1, [ecx+RASTPRIM_iDBDX]       ; Put color + 1*delta in mm1
        movq    mm2, mm1
        paddw   mm2, [ecx+RASTPRIM_iDBDX]       ; Put color + 2*delta in mm2
        movq    mm3, mm2
        paddw   mm3, [ecx+RASTPRIM_iDBDX]       ; Put color + 3*delta in mm3


        ; -----------------------------------
        ; Separate colors.
        ; Word 0 (lowest address) gets pixel 3, word 3 gets pixel 0.
        ; -----------------------------------

        ; Combine all reds into one mmx reg.
        movq    mm4, mm3
        punpckhwd mm4, mm2              ; mm4 = R3 R2 A3 A2
        movq    mm5, mm1
        punpckhwd mm5, mm0              ; mm5 = R1 R0 A1 A0
        punpckldq mm4, mm5              ; mm4 = R3 R2 R1 R0

        ; Interleave the low (blue/green) words once and split them afterwards.
        movq    mm5, mm3
        punpcklwd mm5, mm2              ; mm5 = B3 B2 G3 G2
        movq    mm7, mm1
        punpcklwd mm7, mm0              ; mm7 = B1 B0 G1 G0
        movq    mm6, mm5
        punpckldq mm6, mm7              ; mm6 = B3 B2 B1 B0
        punpckhdq mm5, mm7              ; mm5 = G3 G2 G1 G0

        ; -----------------------------------
        ; Separate delta colors.
        ; -----------------------------------

        ; If extra registers are needed, (i.e. Z buffer or dither) then deltas can be saved to memory. Three more regs.
        movq    mm7, [ecx+RASTPRIM_iDBDX]
        psllw   mm7, 2                      ; Doing 4 pixels at a time so delta must be times 4.

        ; Broadcast delta red to all four words of mm0.
        movq    mm0, mm7
        punpckhwd mm0, mm0              ; dR dR dA dA
        punpckldq mm0, mm0              ; dR dR dR dR

        ; Broadcast delta green to all four words of mm1.
        movq    mm1, mm7
        punpcklwd mm1, mm1              ; dB dB dG dG
        punpckhdq mm1, mm1              ; dG dG dG dG

        ; Broadcast delta blue to all four words of mm2.
        movq    mm2, mm7
        punpcklwd mm2, mm2              ; dB dB dG dG
        punpckldq mm2, mm2              ; dB dB dB dB

        ; Need to combine for first pixel write.
        movq    mm3, mm5    ; Copy green
        psrlw   mm3, 5      ; Shift green to correct location
        movq    mm7, mm4    ; Copy Red

        pand    mm3, MMWORD PTR Val0x07e007e007e007e0          ; Mask off green 6 bits.
        pand    mm7, MMWORD PTR Val0xf800f800f800f800          ; Mask off reds 5 upper bits

        por     mm7, mm3    ; Combine red and green
        movq    mm3, mm6    ; Copy Blue

        psrlw   mm3, 11                          ; Move blues 5 upper bits to the bottom.
        por     mm3, mm7

FourPixelLoopRtoL:
        movq    MMWORD PTR [edi], mm3   ; Write four 565 pixels at once.
        sub     edi, 8

        paddw   mm5, mm1    ; 4 greens plus 4 delta greens
        paddw   mm4, mm0    ; 4 reds plus 4 delta reds

        movq    mm3, mm5    ; Copy green
        paddw   mm6, mm2    ; 4 blues plus 4 delta blues

        psrlw   mm3, 5      ; Shift green to correct location
        movq    mm7, mm4    ; Copy Red

        pand    mm3, MMWORD PTR Val0x07e007e007e007e0          ; Mask off green 6 bits.
        pand    mm7, MMWORD PTR Val0xf800f800f800f800          ; Mask off reds 5 upper bits

        por     mm7, mm3    ; Combine red and green
        movq    mm3, mm6    ; Copy Blue

        psrlw   mm3, 11                          ; Move blues 5 upper bits to the bottom.
        por     mm3, mm7

        dec     edx
        jnz     FourPixelLoopRtoL

LastPixelsRtoL:
        ; mm3 already holds the next four converted pixels, with the next
        ; pixel to draw in the TOP word.  edi is the start of the qword they
        ; would occupy.  The low two bits of eax say how many are still owed.
        test    eax, 2
        jz      OnePixelLeftRtoL
        movq    mm1, mm3
        psrlq   mm1, 32                 ; Bring the first two pixels down to the low dword.
        movd    MMWORD PTR [edi+4], mm1
        sub     edi, 4
        psllq   mm3, 32                 ; Third pixel moves into the top word for the store below.

OnePixelLeftRtoL:
        test    eax, 1
        jz      NoMorePixelsRtoL

        psrlq   mm3, 48                 ; Top word -> low word
        movd    edx, mm3
        mov     word ptr [edi+6], dx
        jmp     NoMorePixelsRtoL

WriteIndividualPixelsRtoL:
        ; One-pixel-at-a-time fallback: unaligned pointer, fewer than four
        ; pixels, or deltas that failed the overflow check.
        movq    mm5, mm0
        psrlw   mm5, 8          ; Convert color from 8.8 to 0.8
        packuswb mm5, mm5       ; Pack to bytes; the same dword ends up in both halves.
        movq    mm3, mm5
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        pand    mm3, MMWORD PTR u888to565GreenMask
        por     mm5, mm3
        psrld   mm5, 5

        movd    edx, mm5
        mov     [edi], dx
        sub     edi, 2

        ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
        ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        dec     eax
        jnz     WriteIndividualPixelsRtoL

NoMorePixelsRtoL:
        jmp     DoneSpanDirif
    ;else
    ;{
LeftToRightSpan:

; NORMAL LEFT TO RIGHT CASE: pixels go to ascending addresses.
;
; On entry:
;   edi = address of the first pixel to write (loaded from RASTSPAN_pSurface)
;   eax = pixel count of the span; must be non-zero, because every path
;         below writes a pixel before it tests the count
;   ecx = pP (primitive), ebp = pS (span)
; Scratch: edx, esi, mm0-mm7.  Falls through to DoneSpanDirif.
;
; mm0 carries the running span color as four words loaded from RASTSPAN_uB
; onward (blue, green, red, alpha per the pS->uB/uG/uR/uA comments), and
; RASTPRIM_iDBDX supplies the matching four per-pixel deltas.

        movq    mm0, [ebp+RASTSPAN_uB]

        ; This case is unnecessary if 16 bit color is always word aligned.
        ; It really should be, but it doesnt hurt to be safe for a 2 instruction
        ; penalty.
        test    edi, 1
        jnz     WriteIndividualPixelsLtoR

beginingpixelsLtoR:
        test    edi, 7                  ;Test to see if we are 4 pixel aligned.
        jz      SetupFourPixelLoopLtoR

        ;WritePixel
        movq    mm5, mm0
        psrlw   mm5, 8          ; Convert color from 8.8 to 0.8
        packuswb mm5, mm5       ; Pack to bytes; the same dword ends up in both halves.
        movq    mm3, mm5
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        pand    mm3, MMWORD PTR u888to565GreenMask
        por     mm5, mm3
        psrld   mm5, 5          ; 565 pixel is now in the low word.

        movd    edx, mm5
        mov     [edi], dx
        add     edi, 2          ; Increase destination pointer

        dec     eax             ; Reduce Pixel count
        jz      NoMorePixelsLtoR

        ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
        ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        jmp     beginingpixelsLtoR

SetupFourPixelLoopLtoR:
        ; Only go through trouble of setting up four pixels if we have four pixels.
        mov     edx, eax
        shr     edx, 2                  ; edx = number of four-pixel groups
        jz      WriteIndividualPixelsLtoR


        ; Delta overflow check: fall back to the one-pixel loop when any of
        ; the three low delta words has |delta| >= 2000h, i.e. when 4*delta
        ; would not fit in a signed word.
        movq    mm1, [ecx+RASTPRIM_iDBDX]
        movq    mm2, mm1
        psraw   mm1, 15     ; Sign mask for a conditional negate (absolute value).
        pxor    mm2, mm1
        psubw   mm2, mm1    ; mm2 = |delta| per word
        pand    mm2, MMWORD PTR  Val0xe000e000e000  ; Keep only the high three bits
        packuswb mm2, mm2               ; Positive non-zero words saturate to 0FFh bytes.
        movd    esi, mm2
        test    esi, esi
        jnz     WriteIndividualPixelsLtoR

        ; -----------------------------------
        ; Generate four starting color pixels
        ; -----------------------------------
                                                ; color + 0*delta is already in mm0
        movq    mm1, mm0
        paddw   mm1, [ecx+RASTPRIM_iDBDX]       ; Put color + 1*delta in mm1
        movq    mm2, mm1
        paddw   mm2, [ecx+RASTPRIM_iDBDX]       ; Put color + 2*delta in mm2
        movq    mm3, mm2
        paddw   mm3, [ecx+RASTPRIM_iDBDX]       ; Put color + 3*delta in mm3


        ; -----------------------------------
        ; Separate colors.
        ; Word 0 (lowest address) gets pixel 0, word 3 gets pixel 3.
        ; -----------------------------------

        ; Combine all reds into one mmx reg.
        movq    mm4, mm0
        punpckhwd mm4, mm1              ; mm4 = R0 R1 A0 A1
        movq    mm5, mm2
        punpckhwd mm5, mm3              ; mm5 = R2 R3 A2 A3
        punpckldq mm4, mm5              ; mm4 = R0 R1 R2 R3

        ; Interleave the low (blue/green) words once and split them afterwards.
        movq    mm5, mm0
        punpcklwd mm5, mm1              ; mm5 = B0 B1 G0 G1
        movq    mm7, mm2
        punpcklwd mm7, mm3              ; mm7 = B2 B3 G2 G3
        movq    mm6, mm5
        punpckldq mm6, mm7              ; mm6 = B0 B1 B2 B3
        punpckhdq mm5, mm7              ; mm5 = G0 G1 G2 G3

        ; -----------------------------------
        ; Separate delta colors.
        ; -----------------------------------

        ; If extra registers are needed, (i.e. Z buffer or dither) then deltas can be saved to memory. Three more regs.
        movq    mm7, [ecx+RASTPRIM_iDBDX]
        psllw   mm7, 2                      ; Doing 4 pixels at a time so delta must be times 4.

        ; Broadcast delta red to all four words of mm0.
        movq    mm0, mm7
        punpckhwd mm0, mm0              ; dR dR dA dA
        punpckldq mm0, mm0              ; dR dR dR dR

        ; Broadcast delta green to all four words of mm1.
        movq    mm1, mm7
        punpcklwd mm1, mm1              ; dB dB dG dG
        punpckhdq mm1, mm1              ; dG dG dG dG

        ; Broadcast delta blue to all four words of mm2.
        movq    mm2, mm7
        punpcklwd mm2, mm2              ; dB dB dG dG
        punpckldq mm2, mm2              ; dB dB dB dB

        ; Need to combine for first pixel write.
        movq    mm3, mm5    ; Copy green
        psrlw   mm3, 5      ; Shift green to correct location
        movq    mm7, mm4    ; Copy Red

        pand    mm3, MMWORD PTR Val0x07e007e007e007e0          ; Mask off green 6 bits.
        pand    mm7, MMWORD PTR Val0xf800f800f800f800          ; Mask off reds 5 upper bits

        por     mm7, mm3    ; Combine red and green
        movq    mm3, mm6    ; Copy Blue

        psrlw   mm3, 11                          ; Move blues 5 upper bits to the bottom.
        por     mm3, mm7

FourPixelLoopLtoR:
        movq    MMWORD PTR [edi], mm3   ; Write four 565 pixels at once.
        add     edi, 8

        paddw   mm5, mm1    ; 4 greens plus 4 delta greens
        paddw   mm4, mm0    ; 4 reds plus 4 delta reds

        movq    mm3, mm5    ; Copy green
        paddw   mm6, mm2    ; 4 blues plus 4 delta blues

        psrlw   mm3, 5      ; Shift green to correct location
        movq    mm7, mm4    ; Copy Red

        pand    mm3, MMWORD PTR Val0x07e007e007e007e0          ; Mask off green 6 bits.
        pand    mm7, MMWORD PTR Val0xf800f800f800f800          ; Mask off reds 5 upper bits

        por     mm7, mm3    ; Combine red and green
        movq    mm3, mm6    ; Copy Blue

        psrlw   mm3, 11                          ; Move blues 5 upper bits to the bottom.
        por     mm3, mm7

        dec     edx
        jnz     FourPixelLoopLtoR

LastPixelsLtoR:
        ; mm3 already holds the next four converted pixels, with the next
        ; pixel to draw in the LOW word.  The low two bits of eax say how
        ; many are still owed.
        test    eax, 2
        jz      OnePixelLeftLtoR
        movd    MMWORD PTR [edi], mm3   ; First two pixels.
        add     edi, 4
        psrlq   mm3, 32                 ; Third pixel moves into the low word for the store below.

OnePixelLeftLtoR:
        test    eax, 1
        jz      NoMorePixelsLtoR

        movd    edx, mm3
        mov     word ptr [edi], dx
        jmp     NoMorePixelsLtoR

WriteIndividualPixelsLtoR:
        ; One-pixel-at-a-time fallback: unaligned pointer, fewer than four
        ; pixels, or deltas that failed the overflow check.
        movq    mm5, mm0
        psrlw   mm5, 8          ; Convert color from 8.8 to 0.8
        packuswb mm5, mm5       ; Pack to bytes; the same dword ends up in both halves.
        movq    mm3, mm5
        pand    mm5, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm5, MMWORD PTR u888to565Multiplier
        pand    mm3, MMWORD PTR u888to565GreenMask
        por     mm5, mm3
        psrld   mm5, 5

        movd    edx, mm5
        mov     [edi], dx
        add     edi, 2

        ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
        ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
        paddw   mm0, [ecx+RASTPRIM_iDBDX]

        dec     eax
        jnz     WriteIndividualPixelsLtoR

NoMorePixelsLtoR:

    ;}
DoneSpanDirif:
ExitPixelLoop:
;-----------------------------------------------------------------------------
;  Both span directions rejoin here once the current span has been drawn.
;-----------------------------------------------------------------------------

    ; pS++ : step ebp to the next span record and go around again.
    add     ebp, SIZEOF_RASTSPAN
    jmp     SpanLoop

ExitSpanLoop:
    ; All spans of this primitive are done: pP = pP->pNext.
    mov     ecx, [ecx+RASTPRIM_pNext]
    jmp     PrimLoop

ExitPrimLoop:
    ; End of the primitive list.
    xor     eax, eax                ; return S_OK
    emms                            ; Leave MMX state before returning to the caller.

    ; Undo the entry pushes in reverse order, then reload esp from the value
    ; stored in StackPos at entry, which also releases the 0Ch bytes
    ; reserved there.
    pop     edi
    pop     esi
    pop     ebx
    mov     esp, StackPos
    pop     ebp
    ret

END