/**************************************************************************\
* 
* Copyright (c) 2000  Microsoft Corporation
*
* Module name:
*
*   Include file to generate either 5-5-5 or 5-6-5 versions of the
*   dither code.
*
* Notes:
*
*   When DITHER_BLEND_555 is #defined to 1, then this file will generate
*   5-5-5 versions of the included routines.
*
*   When DITHER_BLEND_555 is #defined to 0, then we will generate 5-6-5
*   versions.
*
* Revision History:
*
*   03/15/2000 andrewgo
*       Created it.
*
\**************************************************************************/

// Per-format configuration.
//
// This file is meant to be included once per 16bpp format, so every macro
// it defines is first cleared of any value left over from a previous
// inclusion.
//
//   DITHER_ARRAY      - dither table used when dithering is enabled
//   RED_SHIFT,
//   GREEN_SHIFT,
//   BLUE_SHIFT        - right-shift counts that move the masked 8-bit channel
//                       of a 32bpp pixel into its position in the 16bpp
//                       pixel (the same counts are used as left shifts when
//                       expanding a 16bpp pixel back out)
//   DITHERBLEND_FUNC  - name of the generated dither-and-blend routine
//   DITHER_FUNC       - name of the generated dither-only routine

#undef BLUE_SHIFT
#undef GREEN_SHIFT
#undef RED_SHIFT
#undef DITHER_ARRAY
#undef DITHER_FUNC
#undef DITHERBLEND_FUNC

// Blue lands in the low five bits of the 16bpp pixel in both formats.

#define BLUE_SHIFT 3

#if !DITHER_BLEND_555

    // 5-6-5

    #define RED_SHIFT 8
    #define GREEN_SHIFT 5
    #define DITHER_ARRAY Dither565
    #define DITHER_FUNC ScanOperation::Dither_sRGB_565_MMX
    #define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_565_MMX

#else

    // 5-5-5

    #define RED_SHIFT 9
    #define GREEN_SHIFT 6
    #define DITHER_ARRAY Dither555
    #define DITHER_FUNC ScanOperation::Dither_sRGB_555_MMX
    #define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_555_MMX

#endif

// Do a dithered blend to 16bpp using MMX
//
// Converts 'count' 32bpp ARGB pixels to 16bpp (5-5-5 or 5-6-5, depending on
// DITHER_BLEND_555), combining each one with the 16bpp pixel that is already
// in the destination:
//
//   - alpha == 0xff : the source pixel is dithered, packed and stored.
//   - alpha == 0    : the destination pixel is left untouched.
//   - otherwise     : the destination pixel is read, expanded to 8 bits per
//                     channel, blended with the source, dithered and
//                     repacked.  The blend is S + (255 - alpha) * D / 255
//                     unless NO_PREMULTIPLIED_ALPHA is set, in which case
//                     it is D + alpha * (S - D) / 255.
//
// Arguments:
//
//   dst, src    - consumed by DEFINE_POINTERS / DEFINE_BLEND_POINTER (both
//                 defined elsewhere).  The code below reads the 32bpp
//                 pixels through 'bl' and reads/writes the 16bpp pixels
//                 through 'd'.
//   count       - number of pixels to process; asserted to be non-zero.
//   otherParams - supplies X and Y, whose low two bits select the starting
//                 column and the row of the dither table, and DoingDither,
//                 which selects DITHER_ARRAY (true) or DitherNone (false).
//
// The body is only compiled when _X86_ is defined; otherwise the function
// does nothing.

VOID FASTCALL
DITHERBLEND_FUNC(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
#if defined(_X86_)

    DEFINE_POINTERS(ARGB, WORD);
    DEFINE_BLEND_POINTER(ARGB);
    
    ASSERT(count != 0);
    ASSERT(otherParams);

    // 64-bit constants used as memory operands by the MMX code below.  Each
    // covers two 32bpp pixels (or four 16-bit lanes):
    //
    //   redBlueMask       - keeps the top 5 bits of the red and blue bytes.
    //   flipAlphaBits     - XORed with a lane holding alpha (0..255) to
    //                       produce 255 - alpha.
    //   greenMask         - keeps the top 5 (5-5-5) or 6 (5-6-5) bits of the
    //                       green byte.
    //   redBlueMultiplier - pmaddwd operand: per pixel, the blue word is
    //                       multiplied by 1 and the red word by 2^10 (5-5-5)
    //                       or 2^11 (5-6-5), then the two are summed.

    static ULONGLONG redBlueMask = 0x00f800f800f800f8;
    static ULONGLONG flipAlphaBits = 0x00ff00ff00ff00ff;

#if DITHER_BLEND_555
    static ULONGLONG greenMask = 0x0000f8000000f800;
    static ULONGLONG redBlueMultiplier = 0x0400000104000001;
#else
    static ULONGLONG greenMask = 0x0000fc000000fc00;
    static ULONGLONG redBlueMultiplier = 0x0800000108000001;
#endif

    INT x = otherParams->X;
    INT y = otherParams->Y;

    // 'dither' points at the start of the dither row for this scanline (rows
    // are addressed 8 entries apart), or at DitherNone when dithering is
    // off.  'ditherIncrement' is the byte offset of the starting column
    // within that row; it is applied to whichever table was selected.

    UINT32 *dither = (otherParams->DoingDither) 
                   ? &DITHER_ARRAY[8 * (y & 3)] 
                   : &DitherNone[0];
    UINT32 ditherIncrement = (x & 3) * 4;               

    const ARGB *blendPixel = bl;

    _asm
    {
        ; ecx = count
        ; esi = source
        ; edi = destination
        ; mm4 = red and blue mask (0xf800f8)
        ; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5)
        ; mm6 = C1 | C0 dither
        ; mm7 = C3 | C2 dither

        mov             eax, ditherIncrement
        mov             esi, blendPixel
        mov             edi, d
        mov             ecx, count
        movq            mm4, redBlueMask
        movq            mm5, greenMask
    
        ; We always want our qword reads from the screen to be aligned.
        ; So if the initial pixel is not qword-aligned, we handle up to
        ; three pixels up front to make it qword-aligned.
        ;
        ; (Note that as a consequence of us aligning to the destination,
        ; we're often doing unaligned reads on the source.  But it's
        ; a much bigger performance win to align operations to the screen
        ; than to system memory, due to the terrible screen read
        ; performance.)

        // On every pass through alignment_loop, eax holds the current
        // ditherIncrement; adding 'dither' turns it into the address of the
        // four dither values for the current pixel and the three after it.
        // The movq loads do not modify the flags, so the jz below still
        // tests the result of 'test edi, 6'.

alignment_loop:
        add             eax, dither
        test            edi, 6
        movq            mm6, [eax]      
        movq            mm7, [eax+8]    
        jz              done_start_alignment

        // do_single_pixel is a local subroutine (it ends in 'ret').  It
        // overwrites eax, which is why ditherIncrement is reloaded from
        // memory afterwards.

        call            do_single_pixel        

        ; Adjust our pointers and load our new dither values:

        mov             eax, ditherIncrement
        add             eax, 4
        and             eax, 0x0000000F
        mov             ditherIncrement, eax
        add             esi, 4
        add             edi, 2
        dec             ecx
        jz              all_done
        jmp             alignment_loop

    done_start_alignment:
    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    
        // From here on ecx is kept pre-decremented by 4: after the 'sub'
        // below it holds (pixels remaining - 4), so a negative value means
        // fewer than four pixels are left.

    do_main_loop:
        sub             ecx, 4                  ; pre-decrement by 4
        jl              do_pair

        ; We do chunks of 4 pixels at a time so that we can unroll our
        ; dither loop (our dither repeats every 4 pixels).
        
        // AND the four alpha bytes together; the result is 0xff only if all
        // four pixels are fully opaque.

    do_main_loop_2:
        mov             al, [esi+3]           
        and             al, [esi+7]
        and             al, [esi+11]
        and             al, [esi+15]
        inc             al                      ; if all alphas were 0xff, this
        jnz             do_pair                 ;   will wrap to zero
    

        ; The four pixels starting at [esi] are opaque.  We only need to
        ; dither them and convert to 16bpp.  The following codepath will
        ; process all four in parallel (two at a time) in order to optimize
        ; usage of the execution units and minimize dependencies between
        ; consecutive instructions.
            
        ; We start by reading the four pixels into mm0 and mm1, adding
        ; the dither component, and then breaking into group 0 (pixels 0
        ; and 2) and group 1 (pixels 1 and 3).  I will use **0** and **1**
        ; in the comments below to show which pixel group the instruction is
        ; processing

        movq            mm0, [esi]              ; mm0 = DW1 | DW0
        movq            mm1, [esi + 8]          ; mm1 = DW3 | DW2

        paddusb         mm0, mm6                ; add dither
        movq            mm2, mm0
        paddusb         mm1, mm7                ; add dither

        add             edi, 8
        
        punpckhdq       mm2, mm1         ; **1**  mm2 = DW3 | DW1
        punpckldq       mm0, mm1         ; **0**  mm0 = DW2 | DW0
        
        movq            mm3, mm2         ; **1**
        pand            mm2, mm4         ; **1**  red and blue
        
        movq            mm1, mm0         ; **0**
        pand            mm0, mm4         ; **0**  red and blue

        pand            mm3, mm5         ; **1**  green

        psrlw           mm0, 3           ; **0**  shift red and blue to lowest 
                                         ; 5 bits in register
        
        ; Note the use of the pmaddwd to simultaneously shift both the red and
        ; blue bits into their appropriate positions.  The constant 
        ; redBlueMultiplier contains four shorts, each of which is equal to
        ; 2^i where i is the number of bits that we need to shift that color
        ; component by in order to attain the correct position in the 16bpp
        ; color.  This is possible only because the red and blue
        ; components lie on different shorts in the 64bits register (green has
        ; been masked earlier), and so we can dedicate an entire 16bit short
        ; to red and to blue.

        // Group 1 is deliberately NOT shifted right by 3 first, so its
        // packed result comes out 3 bits too high; together with the
        // 'psllq mm2, 13' further down that amounts to a shift of 16, which
        // places W1 and W3 in the upper word of each dword.

        pmaddwd         mm2, redBlueMultiplier  ; **1**
                
        add             esi, 16

        pand            mm1, mm5         ; **0**  green

        psrld           mm3, GREEN_SHIFT-3 ; **1**
        
        pmaddwd         mm0, redBlueMultiplier  ; **0**
            
        // The MMX instructions between this 'sub' and the 'jge' at the
        // bottom of the loop do not modify the flags, so the branch tests
        // the result of this subtraction.

        sub             ecx, 4                  ; pre-decrement for next iteration

        por             mm2, mm3         ; **1**  combine green with red/blue
                                         ;        mm2 = 0  | W3 | 0  | W1
        
        psrld           mm1, GREEN_SHIFT ; **0**

        psllq           mm2, 13          ; **1**  mm2 = W3 | 0  | W1 | 0

        por             mm0, mm1         ; **0**  combine green with red/blue
                                         ;        mm1 = 0  | W2 | 0  | W0

        por             mm0, mm2                ; mm2 = W3 | W2 | W1 | W0

        // NOTE: the trailing comments on the two 'por mm0, ...' instructions
        // above name the wrong registers.  The combined group-0 value
        // (0 | W2 | 0 | W0) and the final W3 | W2 | W1 | W0 value are both
        // in mm0, which is what gets stored here.  edi was already advanced
        // by 8, hence the -8.

        movq            [edi - 8], mm0

        jge             do_main_loop_2
    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    
        // Every path that reaches do_pair has ecx = (pixels remaining - 4),
        // so after the 'add' it holds (pixels remaining - 2); negative means
        // fewer than two pixels are left.

    do_pair:
        add             ecx, 2                  ; pre-decrement for this iteration
        jl              do_last_pixel
    
        ; We're doing only a single pair of pixels, so swap our dither
        ; values in preparation for the next iteration:
    
        pxor            mm6, mm7
        pxor            mm7, mm6
        pxor            mm6, mm7                ; swap mm6 and mm7
    
        // After the swap, mm7 holds the dither for this pair and mm6 the
        // dither for the pair that follows it.
        //
        // (alpha + 1) mod 256 is 0 for alpha == 0xff and 1 for alpha == 0;
        // anything above 1 is a partial alpha, which needs a real blend.

        mov             al, [esi+3]
        inc             al
        cmp             al, 1
        ja              do_pair_blend

        mov             al, [esi+7]
        inc             al
        cmp             al, 1
        ja              do_pair_blend

        // Both alphas are now known to be either 0 or 0xff.  If both are 0
        // there is nothing to write for this pair.

        mov             al, [esi+3]             ; Do we really want this here?
        or              al, [esi+7]
        jz              do_pair_done
    
        // Convert both pixels as if opaque, then store only the one(s)
        // whose alpha is non-zero.

        movq            mm0, [esi]
        paddusb         mm0, mm7                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT        ; green (6 for 5-5-5)
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT         ; blue
        psrld           mm2, RED_SHIFT          ; red (9 for 5-5-5)
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | C1 | X | C0
        movq            mm1, mm0
        psrlq           mm1, 32
        punpcklwd       mm0, mm1                ; mm0 = X | X | C1 | C0
    
        movd            eax, mm0
        cmp             byte ptr [esi+3], 0
        je              do_pair_done_first_write        
        mov             [edi], ax        
    do_pair_done_first_write:
        cmp             byte ptr [esi+7], 0
        je              do_pair_done_second_write
        shr             eax, 16
        mov             [edi+2], ax
    do_pair_done_second_write:
        add             edi, 4
        add             esi, 8
        jmp             do_main_loop
    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    
        // At least one pixel of the pair has a partial alpha.  Both
        // destination pixels are read and expanded back to 8 bits per
        // channel, both source pixels are blended against them, and both
        // results are written back.

    do_pair_blend:
        movd            mm1, [edi]              ; read destination, X | X | C1 | C0
        punpcklwd       mm1, mm1                ; C1 | C1 | C0 | C0
        psrld           mm1, 16                 ; 0 | C1 | 0 | C0
                                                ;  (trick using single red and
                                                ;  blue mask requires high bits
                                                ;  to be zero)
        movq            mm0, mm1
        movq            mm2, mm1
        pslld           mm1, BLUE_SHIFT         ; blue
        pslld           mm0, GREEN_SHIFT        ; green (6 for 5-5-5)
        pslld           mm2, RED_SHIFT          ; red (9 for 5-5-5)
        por             mm1, mm2                ; combine red and blue
        pand            mm1, mm4                ; leave valid red and blue bits
        pand            mm0, mm5                ; leave valid green bits
        por             mm1, mm0                ; mm1 = C1 | C0        
    
        ; Okay now we've got the destination read and split.  Handle the first 
        ; blend:
    
        // The source pixel is widened to four 16-bit lanes (mm2), its alpha
        // lane is replicated into all four lanes of mm3, and the first
        // destination pixel is widened the same way into mm0.

        movd            mm2, [esi]
        punpcklbw       mm2, mm2
        psrlw           mm2, 8                  ; mm2 = S
        movq            mm3, mm2
        punpckhwd       mm3, mm3
        punpckhdq       mm3, mm3                ; mm3 = alpha
        movq            mm0, mm1
        punpcklbw       mm0, mm0
        psrlw           mm0, 8                  ; mm0 = D
    #if NO_PREMULTIPLIED_ALPHA
        psubw           mm2, mm0               
        pmullw          mm2, mm3                ; mm2 = alpha * (S - D)
        movq            mm3, mm2
        psrlw           mm3, 8
        paddw           mm2, mm3                ; approximate x/255 by 257/65536
        psrlw           mm2, 8                  ; mm2 = alpha * (S - D)
        paddb           mm0, mm2                ; mm0 = C0 = D + alpha * (S - D)
    #else
        // NOTE: the trailing comments in this branch say "mm2", but the
        // product and the divided result are held in mm0.
        pxor            mm3, flipAlphaBits
        pmullw          mm0, mm3                ; mm2 = (255 - alpha) * D
        movq            mm3, mm0
        psrlw           mm0, 8                  ; approximate x/255 by 257/65536
        paddw           mm0, mm3                ; mm2 = (255 - alpha) * D / 255
        psrlw           mm0, 8                  ; don't care about rounding, not enough bits
        paddb           mm0, mm2                ; mm0 = C0 = S + (1 - alpha) * D
    #endif
    
        ; Handle the second blend (change mm0 to mm1):
    
        movd            mm2, [esi+4]
        punpcklbw       mm2, mm2
        psrlw           mm2, 8                  ; mm2 = S
        movq            mm3, mm2
        punpckhwd       mm3, mm3
        punpckhdq       mm3, mm3                ; mm3 = alpha
        punpckhbw       mm1, mm1
        psrlw           mm1, 8                  ; mm1 = D
    #if NO_PREMULTIPLIED_ALPHA
        psubw           mm2, mm1               
        pmullw          mm2, mm3                ; mm2 = alpha * (S - D)
        movq            mm3, mm2
        psrlw           mm3, 8
        paddw           mm2, mm3                ; approximate x/255 by 257/65536
        psrlw           mm2, 8                  ; mm2 = alpha * (S - D)
        paddb           mm1, mm2                ; mm1 = C1 = D + alpha * (S - D)
    #else
        // NOTE: the trailing comments in this branch say "mm2", but the
        // product and the divided result are held in mm1.
        pxor            mm3, flipAlphaBits    
        pmullw          mm1, mm3                ; mm2 = (255 - alpha) * D
        movq            mm3, mm1
        psrlw           mm1, 8                  ; approximate x/255 by 257/65536
        paddw           mm1, mm3                ; mm2 = (255 - alpha) * D / 255
        psrlw           mm1, 8                  ; don't care about rounding, not enough bits
        paddb           mm1, mm2                ; mm1 = C1 = S + (1 - alpha) * D
    #endif
        packuswb        mm0, mm1                ; mm0 = C1 | C0
    
        ; Dither and pack everything back up:
    
        paddusb         mm0, mm7                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT        ; green
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT         ; blue
        psrld           mm2, RED_SHIFT          ; red
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | C1 | X | C0
        movq            mm1, mm0
        psrlq           mm1, 32                 ; mm1 = 0 | 0 | X | C1
        punpcklwd       mm0, mm1                ; mm0 = X | X | C1 | C0
    
        movd            [edi], mm0
    
    do_pair_done:
        add             edi, 4
        add             esi, 8
        jmp             do_main_loop
    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    
        // Local subroutine, entered with 'call' and left with 'ret'.
        //
        // Converts (and, if needed, blends) the single source pixel at [esi]
        // into the 16bpp pixel at [edi], using the low dword of mm6 as the
        // dither value.  It does not change esi, edi or ecx; it overwrites
        // eax and may overwrite mm0-mm3.

    do_single_pixel:
        movd            mm0, [esi]
        mov             al, [esi+3]
        inc             al
        jnz             do_single_blend         ; if not completely opaque 
    
        paddusb         mm0, mm6                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT
        psrld           mm2, RED_SHIFT
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | C1 | X | C0
    
        // Only the low word (the one pixel converted here) is stored.

        movd            eax, mm0
        mov             [edi], ax        
    do_single_done:
        ret
    
        // al holds alpha + 1 on entry; the 'dec' restores alpha, so a zero
        // result means the pixel is fully transparent.

    do_single_blend:
        dec             al
        jz              do_single_done          ; completely transparent pixel
    
        ; alpha is between 0 and 255
    
        movzx           eax, word ptr [edi]     ; do the destination read
        movd            mm1, eax                ; mm1 = 0 | 0 | 0 | C0
        movq            mm0, mm1
        movq            mm2, mm1
        pslld           mm1, BLUE_SHIFT         ; blue 
        pslld           mm0, GREEN_SHIFT        ; green (6 for 5-5-5)
        pslld           mm2, RED_SHIFT          ; red (9 for 5-5-5)
        por             mm1, mm2                ; combine red and blue
        pand            mm1, mm4                ; leave valid red and blue bits
        pand            mm0, mm5                ; leave valid green bits
        por             mm1, mm0                ; mm1 = C1 | C0        
    
        ; Okay now we've got the destination read and split.  Handle the first blend:
    
        movd            mm2, [esi]
        punpcklbw       mm2, mm2
        psrlw           mm2, 8                  ; mm2 = S
        movq            mm3, mm2
        punpckhwd       mm3, mm3
        punpckhdq       mm3, mm3                ; mm3 = alpha
        movq            mm0, mm1
        punpcklbw       mm0, mm0
        psrlw           mm0, 8                  ; mm0 = D
    #if NO_PREMULTIPLIED_ALPHA
        psubw           mm2, mm0               
        pmullw          mm2, mm3                ; mm2 = alpha * (S - D)
        movq            mm3, mm2
        psrlw           mm3, 8
        paddw           mm2, mm3                ; approximate x/255 by 257/65536
        psrlw           mm2, 8                  ; mm2 = alpha * (S - D)
        paddb           mm0, mm2                ; mm0 = C0 = D + alpha * (S - D)
    #else
        // NOTE: the trailing comments in this branch say "mm2", but the
        // product and the divided result are held in mm0.
        pxor            mm3, flipAlphaBits    
        pmullw          mm0, mm3                ; mm2 = (255 - alpha) * D
        movq            mm3, mm0
        psrlw           mm0, 8                  ; approximate x/255 by 257/65536
        paddw           mm0, mm3                ; mm2 = (255 - alpha) * D / 255
        psrlw           mm0, 8                  ; don't care about rounding, not enough bits
        paddb           mm0, mm2                ; mm0 = C0 = S + (1 - alpha) * D
    #endif
        packuswb        mm0, mm0                ; mm0 = C1 | C0
    
        ; Dither and pack everything back up:
    
        paddusb         mm0, mm6                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT
        psrld           mm2, RED_SHIFT
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | C1 | X | C0
    
        movd            eax, mm0
        mov             [edi], ax        
        ret
    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    
        // Reached from do_pair with ecx = (pixels remaining - 2), i.e. -2 or
        // -1.  Bit 0 is set only for -1, which means one pixel is left.

    do_last_pixel:
        test            ecx, 1
        jz              all_done
        call            do_single_pixel

        // Leave MMX state so that later floating-point code works.

    all_done:
        emms      
    }

#endif
}

// Dither to 16bpp using MMX
//
// Converts 'count' 32bpp ARGB pixels to 16bpp (5-5-5 or 5-6-5, depending on
// DITHER_BLEND_555), adding a per-pixel dither value before truncating each
// channel.  The source alpha is ignored and the destination is never read.
//
// Arguments:
//
//   dst, src    - consumed by DEFINE_POINTERS (defined elsewhere).  The
//                 code below reads the 32bpp pixels through 's' and writes
//                 the 16bpp pixels through 'd'.
//   count       - number of pixels to convert; asserted to be non-zero.
//   otherParams - supplies X and Y, whose low two bits select the starting
//                 column and the row of the dither table, and DoingDither,
//                 which selects DITHER_ARRAY (true) or DitherNone (false).
//
// The body is only compiled when _X86_ is defined; otherwise the function
// does nothing.

VOID FASTCALL
DITHER_FUNC(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
#if defined(_X86_)

    DEFINE_POINTERS(ARGB, WORD);
    
    ASSERT(count != 0);
    ASSERT(otherParams);

    // 64-bit masks, each covering two 32bpp pixels: the top 5 bits of the
    // red and blue bytes, and the top 5 (5-5-5) or 6 (5-6-5) bits of the
    // green byte.

    static ULONGLONG redBlueMask = 0x00f800f800f800f8;

#if DITHER_BLEND_555
    static ULONGLONG greenMask = 0x0000f8000000f800;
#else
    static ULONGLONG greenMask = 0x0000fc000000fc00;
#endif

    INT x = otherParams->X;
    INT y = otherParams->Y;

    // Address of the dither value for the first pixel.  Rows of the dither
    // table are addressed 8 entries apart; the four values starting at
    // column (x & 3) are read below.

    UINT32 *dither = (otherParams->DoingDither) 
                   ? &DITHER_ARRAY[8 * (y & 3) + (x & 3)] 
                   : &DitherNone[0];
                   
    _asm
    {
        ; ecx = count
        ; esi = source
        ; edi = destination
        ; mm4 = red and blue mask (0xf800f8)
        ; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5)
        ; mm6 = C1 | C0 dither
        ; mm7 = C3 | C2 dither

        mov             eax, dither
        mov             esi, s
        mov             edi, d
        mov             ecx, count
        movq            mm4, redBlueMask
        movq            mm5, greenMask
        movq            mm6, [eax]      
        movq            mm7, [eax+8]    
        sub             ecx, 4                  ; pre-decrement by 4
        jl              do_last_3_pixels_or_less
    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ; We do chunks of 4 pixels at a time so that we can unroll our
        ; dither loop (our dither repeats every 4 pixels).

    do_main_loop:

        ; Pixels 0 and 1 of the chunk:

        movq            mm0, [esi]
        paddusb         mm0, mm6                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT        ; green (6 for 5-5-5)
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT         ; blue
        psrld           mm2, RED_SHIFT          ; red (9 for 5-5-5)
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | C1 | X | C0
        movq            mm1, mm0
        psrlq           mm1, 32                 ; mm1 = X | X | X | C1
        punpcklwd       mm0, mm1                ; mm0 = X | X | C1 | C0
        movd            [edi], mm0
    
        ; Pixels 2 and 3 of the chunk:

        movq            mm0, [esi+8]
        paddusb         mm0, mm7                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT
        psrld           mm2, RED_SHIFT
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | C1 | X | C0
        movq            mm1, mm0
        psrlq           mm1, 32
        punpcklwd       mm0, mm1                ; mm0 = X | X | C1 | C0
        movd            [edi+4], mm0
    
        add             edi, 8
        add             esi, 16
        sub             ecx, 4                  ; pre-decrement for next iteration
        jge             do_main_loop
    
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    
    do_last_3_pixels_or_less:
        add             ecx, 4                  ; get back the real count
        jz              all_done

        dec             ecx                     ; ecx = remaining - 1
        jnz             do_last_pair

        ; Exactly one pixel is left.  It sits at the first position of the
        ; dither pattern, so it needs the dither value in the low dword of
        ; mm6.  do_last_pixel dithers with mm7 (which is right when it
        ; handles the third pixel of a three-pixel tail), so hand it the
        ; first-position value in mm7.

        movq            mm7, mm6
        jmp             do_last_pixel
        
    do_last_pair:

        ; Two or three pixels are left; convert the first two.  ecx is
        ; decremented once more below to tell those cases apart.

        movq            mm0, [esi]
        paddusb         mm0, mm6                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT        ; green (6 for 5-5-5)
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT         ; blue
        psrld           mm2, RED_SHIFT          ; red (9 for 5-5-5)
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | C1 | X | C0
        movq            mm1, mm0
        psrlq           mm1, 32                 ; mm1 = X | X | X | C1
        punpcklwd       mm0, mm1                ; mm0 = X | X | C1 | C0
        movd            eax, mm0
        mov             [edi], eax
        
        dec             ecx                     ; ecx = remaining - 2
        jz              all_done
        
        add             esi, 8
        add             edi, 4

    do_last_pixel:    

        ; Convert one pixel, using the low dword of mm7 as its dither value.
        ; Only the low word of the packed result is stored, so the two-pixel
        ; repacking step used above is not needed here.

        movd            mm0, [esi]
        paddusb         mm0, mm7                ; add dither
        movq            mm2, mm0
        pand            mm0, mm5                ; green
        pand            mm2, mm4                ; red and blue
        psrld           mm0, GREEN_SHIFT
        movq            mm3, mm2
        psrld           mm3, BLUE_SHIFT
        psrld           mm2, RED_SHIFT
        por             mm0, mm3
        por             mm0, mm2                ; mm0 = X | X | X | C0
        movd            eax, mm0
        mov             [edi], ax

    all_done:
        emms                                    ; leave MMX state
    }

#endif
}