/**************************************************************************\
* 
* Copyright (c) 1999-2000  Microsoft Corporation
*
* Module name:
*
*   The "Blend" scan operation.
*
* Abstract:
*
*   See Gdiplus\Specs\ScanOperation.doc for an overview.
*
* Notes:
*
* Revision History:
*
*   12/07/1999 agodfrey
*       Created it.
*
\**************************************************************************/

#include "precomp.hpp"

/**************************************************************************\
*
* Operation Description:
*
*   Blend: Does a SrcOver alpha-blend operation.
*
* Arguments:
*
*   dst         - The destination scan
*   src         - The source scan (usually equal to dst).
*   count       - The length of the scan, in pixels
*   otherParams - Additional data. (We use BlendingScan.)
*
* Return Value:
*
*   None
*
* Notes:
*
*   This is a ternary operation. We take pixels from 'src', blend pixels
*   from 'otherParams->BlendingScan' over them, and write the result to 'dst'.
*
*   Since the formats of the 'dst' and 'src' scans are the same for all
*   the blend functions we implement, the naming is simplified to list just
*   the format of BlendingScan, then the format of 'dst'.
*
*   src and dst may be equal; otherwise, they must point to scans which do
*   not overlap in memory.
*
*   The blend operation adheres to the following rule:
*   "If the blending alpha value is zero, do not write the destination pixel."
*   
*   In other words, it is also a 'WriteRMW' operation. This allows us to
*   avoid a separate 'WriteRMW' step in some cases. See SOReadRMW.cpp and 
*   SOWriteRMW.cpp.
*
*   The impact of this is that you have to be careful if you want 'blend'
*   to be a true ternary operation. Remember, if a blend pixel
*   is transparent, NOTHING gets written to the corresponding destination
*   pixel. One way to solve this is to make sure that the final operation in
*   your pipeline is a WriteRMW operation.
*
* History:
*
*   04/04/1999 andrewgo
*       Created it.
*   12/07/1999 agodfrey
*       Included the 32bpp blend (moved from Ddi/scan.cpp)
*   01/06/2000 agodfrey
*       Added AndrewGo's code for 565, 555, RGB24 and BGR24. Changed the
*       blends to be 'almost' ternary operations.
*
\**************************************************************************/


// BlendLinear_sRGB_32RGB
//
// Blends a 32bpp blending scan (otherParams->BlendingScan) over the 32bpp
// 'src' scan and writes the result to 'dst'. The scan is consumed in runs,
// classified by the blend pixel:
//
//   - translucent (as reported by isTranslucent): source and blend pixels
//     are converted to sRGB64, blended with Blend_sRGB64_sRGB64 and
//     converted back to sRGB;
//   - alpha byte == 0xFF: the blend pixel is copied straight to 'dst';
//   - alpha byte == 0x00: the destination pixel is not written.
//
// otherParams->TempBuffers[0..2] are used as scratch space for the
// translucent runs.

VOID FASTCALL
ScanOperation::BlendLinear_sRGB_32RGB(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(ARGB, ARGB)
    DEFINE_BLEND_POINTER(ARGB)
    using namespace sRGB;

    void *blendWork  = otherParams->TempBuffers[0];
    void *resultWork = otherParams->TempBuffers[1];
    void *gammaWork  = otherParams->TempBuffers[2];

    // Private copy of the parameters, so that BlendingScan can be pointed
    // at the converted blend pixels without modifying the caller's struct.
    OtherParams blendParams = *otherParams;

    while (count > 0)
    {
        // Measure the run of translucent blend pixels starting here.
        int runLength = 0;
        while (isTranslucent(((ARGB *) bl)[runLength]))
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength != 0)
        {
            // Source scan -> sRGB64.
            GammaConvert_sRGB_sRGB64(resultWork, s, runLength, otherParams);

            // Blend scan: AlphaDivide, convert to sRGB64, then AlphaMultiply.
            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

            // Blend in place in sRGB64, using the converted blend pixels as
            // the blending scan, then convert the result to the destination.
            blendParams.BlendingScan = blendWork;
            Blend_sRGB64_sRGB64(resultWork, resultWork, runLength, &blendParams);
            GammaConvert_sRGB64_sRGB(d, resultWork, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
            continue;
        }

        // Opaque blend pixels: copy them directly.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0xFF); --count, ++bl, ++s)
        {
            *d++ = *bl;
        }

        // Fully transparent blend pixels: skip without writing.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0x00); --count, ++bl, ++s)
        {
            ++d;
        }
    }
}

// BlendLinear_sRGB_32RGB_MMX
//
// Same run-based algorithm as BlendLinear_sRGB_32RGB, except that the
// sRGB64 blend step calls Blend_sRGB64_sRGB64_MMX.
//
//   - translucent blend pixels (per isTranslucent): converted to sRGB64,
//     blended, and converted back to sRGB;
//   - alpha byte == 0xFF: the blend pixel is copied straight to 'dst';
//   - alpha byte == 0x00: the destination pixel is not written.
//
// otherParams->TempBuffers[0..2] are used as scratch space.

VOID FASTCALL
ScanOperation::BlendLinear_sRGB_32RGB_MMX(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(ARGB, ARGB)
    DEFINE_BLEND_POINTER(ARGB)
    using namespace sRGB;

    void *blendWork  = otherParams->TempBuffers[0];
    void *resultWork = otherParams->TempBuffers[1];
    void *gammaWork  = otherParams->TempBuffers[2];

    // Private copy of the parameters, so that BlendingScan can be pointed
    // at the converted blend pixels without modifying the caller's struct.
    OtherParams blendParams = *otherParams;

    while (count > 0)
    {
        // Measure the run of translucent blend pixels starting here.
        int runLength = 0;
        while (isTranslucent(((ARGB *) bl)[runLength]))
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength != 0)
        {
            // Source scan -> sRGB64.
            GammaConvert_sRGB_sRGB64(resultWork, s, runLength, otherParams);

            // Blend scan: AlphaDivide, convert to sRGB64, then AlphaMultiply.
            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

            // Blend in place in sRGB64 (MMX routine), using the converted
            // blend pixels as the blending scan, then convert to 'dst'.
            blendParams.BlendingScan = blendWork;
            Blend_sRGB64_sRGB64_MMX(resultWork, resultWork, runLength, &blendParams);
            GammaConvert_sRGB64_sRGB(d, resultWork, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
            continue;
        }

        // Opaque blend pixels: copy them directly.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0xFF); --count, ++bl, ++s)
        {
            *d++ = *bl;
        }

        // Fully transparent blend pixels: skip without writing.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0x00); --count, ++bl, ++s)
        {
            ++d;
        }
    }
}

// BlendLinear_sRGB_565
//
// Blends a 32bpp blending scan (otherParams->BlendingScan) over a 16bpp
// 'src' scan and writes 16bpp pixels to 'dst' via Dither_sRGB_565. The
// scan is consumed in runs, classified by the blend pixel:
//
//   - translucent (per isTranslucent): the source is expanded with
//     Convert_565_sRGB, both scans are converted to sRGB64 and blended,
//     and the result is converted back to sRGB and dithered to 'dst';
//   - alpha byte == 0xFF: the blend pixels are dithered straight to 'dst';
//   - alpha byte == 0x00: the destination pixel is not written.
//
// NOTE(review): the dither routine is called on sub-runs with the original
// otherParams; if dithering depends on absolute pixel position, confirm
// that this is intended.

VOID FASTCALL
ScanOperation::BlendLinear_sRGB_565(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(UINT16,UINT16)
    DEFINE_BLEND_POINTER(ARGB)
    using namespace sRGB;

    void *blendWork  = otherParams->TempBuffers[0];
    void *resultWork = otherParams->TempBuffers[1];
    void *sRGBWork   = otherParams->TempBuffers[2];

    // Private copy of the parameters, so that BlendingScan can be pointed
    // at the converted blend pixels without modifying the caller's struct.
    OtherParams blendParams = *otherParams;

    while (count > 0)
    {
        // Measure the run of translucent blend pixels starting here.
        int runLength = 0;
        while (isTranslucent(((ARGB *) bl)[runLength]))
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength != 0)
        {
            // Source scan: 565 -> sRGB -> sRGB64.
            Convert_565_sRGB(sRGBWork, s, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(resultWork, sRGBWork, runLength, otherParams);

            // Blend scan: AlphaDivide, convert to sRGB64, then AlphaMultiply.
            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(sRGBWork, blendWork, runLength, otherParams);
            AlphaMultiply_sRGB64(blendWork, sRGBWork, runLength, otherParams);

            // Blend in place in sRGB64, convert back to sRGB, then dither.
            blendParams.BlendingScan = blendWork;
            Blend_sRGB64_sRGB64(resultWork, resultWork, runLength, &blendParams);
            GammaConvert_sRGB64_sRGB(sRGBWork, resultWork, runLength, otherParams);

            Dither_sRGB_565(d, sRGBWork, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
            continue;
        }

        // Measure the run of opaque blend pixels (runLength is 0 here).
        while (((((DWORD *) bl)[runLength]) >> 24) == 0xFF)
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength > 0)
        {
            // Opaque pixels replace the destination outright.
            Dither_sRGB_565(d, bl, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
        }

        // Fully transparent blend pixels: skip without writing.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0x00); --count, ++bl, ++s)
        {
            ++d;
        }
    }
}

// BlendLinear_sRGB_565_MMX
//
// Same run-based algorithm as BlendLinear_sRGB_565, except that the blend
// and dither steps call Blend_sRGB64_sRGB64_MMX and Dither_sRGB_565_MMX.
//
//   - translucent blend pixels (per isTranslucent): source expanded with
//     Convert_565_sRGB, both scans converted to sRGB64 and blended, result
//     converted back to sRGB and dithered to 'dst';
//   - alpha byte == 0xFF: the blend pixels are dithered straight to 'dst';
//   - alpha byte == 0x00: the destination pixel is not written.
//
// NOTE(review): the dither routine is called on sub-runs with the original
// otherParams; if dithering depends on absolute pixel position, confirm
// that this is intended.

VOID FASTCALL
ScanOperation::BlendLinear_sRGB_565_MMX(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(UINT16,UINT16)
    DEFINE_BLEND_POINTER(ARGB)
    using namespace sRGB;

    void *blendWork  = otherParams->TempBuffers[0];
    void *resultWork = otherParams->TempBuffers[1];
    void *sRGBWork   = otherParams->TempBuffers[2];

    // Private copy of the parameters, so that BlendingScan can be pointed
    // at the converted blend pixels without modifying the caller's struct.
    OtherParams blendParams = *otherParams;

    while (count > 0)
    {
        // Measure the run of translucent blend pixels starting here.
        int runLength = 0;
        while (isTranslucent(((ARGB *) bl)[runLength]))
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength != 0)
        {
            // Source scan: 565 -> sRGB -> sRGB64.
            Convert_565_sRGB(sRGBWork, s, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(resultWork, sRGBWork, runLength, otherParams);

            // Blend scan: AlphaDivide, convert to sRGB64, then AlphaMultiply.
            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(sRGBWork, blendWork, runLength, otherParams);
            AlphaMultiply_sRGB64(blendWork, sRGBWork, runLength, otherParams);

            // Blend in place in sRGB64, convert back to sRGB, then dither.
            blendParams.BlendingScan = blendWork;
            Blend_sRGB64_sRGB64_MMX(resultWork, resultWork, runLength, &blendParams);
            GammaConvert_sRGB64_sRGB(sRGBWork, resultWork, runLength, otherParams);

            Dither_sRGB_565_MMX(d, sRGBWork, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
            continue;
        }

        // Measure the run of opaque blend pixels (runLength is 0 here).
        while (((((DWORD *) bl)[runLength]) >> 24) == 0xFF)
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength > 0)
        {
            // Opaque pixels replace the destination outright.
            Dither_sRGB_565_MMX(d, bl, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
        }

        // Fully transparent blend pixels: skip without writing.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0x00); --count, ++bl, ++s)
        {
            ++d;
        }
    }
}

// BlendLinear_sRGB_555
//
// Blends a 32bpp blending scan (otherParams->BlendingScan) over a 16bpp
// 'src' scan and writes 16bpp pixels to 'dst' via Dither_sRGB_555. The
// scan is consumed in runs, classified by the blend pixel:
//
//   - translucent (per isTranslucent): the source is expanded with
//     Convert_555_sRGB, both scans are converted to sRGB64 and blended,
//     and the result is converted back to sRGB and dithered to 'dst';
//   - alpha byte == 0xFF: the blend pixels are dithered straight to 'dst';
//   - alpha byte == 0x00: the destination pixel is not written.
//
// NOTE(review): the dither routine is called on sub-runs with the original
// otherParams; if dithering depends on absolute pixel position, confirm
// that this is intended.

VOID FASTCALL
ScanOperation::BlendLinear_sRGB_555(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(UINT16,UINT16)
    DEFINE_BLEND_POINTER(ARGB)
    using namespace sRGB;

    void *blendWork  = otherParams->TempBuffers[0];
    void *resultWork = otherParams->TempBuffers[1];
    void *sRGBWork   = otherParams->TempBuffers[2];

    // Private copy of the parameters, so that BlendingScan can be pointed
    // at the converted blend pixels without modifying the caller's struct.
    OtherParams blendParams = *otherParams;

    while (count > 0)
    {
        // Measure the run of translucent blend pixels starting here.
        int runLength = 0;
        while (isTranslucent(((ARGB *) bl)[runLength]))
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength != 0)
        {
            // Source scan: 555 -> sRGB -> sRGB64.
            Convert_555_sRGB(sRGBWork, s, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(resultWork, sRGBWork, runLength, otherParams);

            // Blend scan: AlphaDivide, convert to sRGB64, then AlphaMultiply.
            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(sRGBWork, blendWork, runLength, otherParams);
            AlphaMultiply_sRGB64(blendWork, sRGBWork, runLength, otherParams);

            // Blend in place in sRGB64, convert back to sRGB, then dither.
            blendParams.BlendingScan = blendWork;
            Blend_sRGB64_sRGB64(resultWork, resultWork, runLength, &blendParams);
            GammaConvert_sRGB64_sRGB(sRGBWork, resultWork, runLength, otherParams);

            Dither_sRGB_555(d, sRGBWork, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
            continue;
        }

        // Measure the run of opaque blend pixels (runLength is 0 here).
        while (((((DWORD *) bl)[runLength]) >> 24) == 0xFF)
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength > 0)
        {
            // Opaque pixels replace the destination outright.
            Dither_sRGB_555(d, bl, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
        }

        // Fully transparent blend pixels: skip without writing.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0x00); --count, ++bl, ++s)
        {
            ++d;
        }
    }
}

// BlendLinear_sRGB_555_MMX
//
// Same run-based algorithm as BlendLinear_sRGB_555, except that the blend
// and dither steps call Blend_sRGB64_sRGB64_MMX and Dither_sRGB_555_MMX.
//
//   - translucent blend pixels (per isTranslucent): source expanded with
//     Convert_555_sRGB, both scans converted to sRGB64 and blended, result
//     converted back to sRGB and dithered to 'dst';
//   - alpha byte == 0xFF: the blend pixels are dithered straight to 'dst';
//   - alpha byte == 0x00: the destination pixel is not written.
//
// NOTE(review): the dither routine is called on sub-runs with the original
// otherParams; if dithering depends on absolute pixel position, confirm
// that this is intended.

VOID FASTCALL
ScanOperation::BlendLinear_sRGB_555_MMX(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(UINT16,UINT16)
    DEFINE_BLEND_POINTER(ARGB)
    using namespace sRGB;

    void *blendWork  = otherParams->TempBuffers[0];
    void *resultWork = otherParams->TempBuffers[1];
    void *sRGBWork   = otherParams->TempBuffers[2];

    // Private copy of the parameters, so that BlendingScan can be pointed
    // at the converted blend pixels without modifying the caller's struct.
    OtherParams blendParams = *otherParams;

    while (count > 0)
    {
        // Measure the run of translucent blend pixels starting here.
        int runLength = 0;
        while (isTranslucent(((ARGB *) bl)[runLength]))
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength != 0)
        {
            // Source scan: 555 -> sRGB -> sRGB64.
            Convert_555_sRGB(sRGBWork, s, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(resultWork, sRGBWork, runLength, otherParams);

            // Blend scan: AlphaDivide, convert to sRGB64, then AlphaMultiply.
            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
            GammaConvert_sRGB_sRGB64(sRGBWork, blendWork, runLength, otherParams);
            AlphaMultiply_sRGB64(blendWork, sRGBWork, runLength, otherParams);

            // Blend in place in sRGB64, convert back to sRGB, then dither.
            blendParams.BlendingScan = blendWork;
            Blend_sRGB64_sRGB64_MMX(resultWork, resultWork, runLength, &blendParams);
            GammaConvert_sRGB64_sRGB(sRGBWork, resultWork, runLength, otherParams);

            Dither_sRGB_555_MMX(d, sRGBWork, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
            continue;
        }

        // Measure the run of opaque blend pixels (runLength is 0 here).
        while (((((DWORD *) bl)[runLength]) >> 24) == 0xFF)
        {
            if (++runLength == count)
            {
                break;
            }
        }

        if (runLength > 0)
        {
            // Opaque pixels replace the destination outright.
            Dither_sRGB_555_MMX(d, bl, runLength, otherParams);

            count -= runLength;
            d     += runLength;
            bl    += runLength;
            s     += runLength;
        }

        // Fully transparent blend pixels: skip without writing.
        for (; (count > 0) && (((*((DWORD *) bl)) >> 24) == 0x00); --count, ++bl, ++s)
        {
            ++d;
        }
    }
}

// Blend sRGB over sRGB, ignoring the non-linear gamma.
//
// For each pixel, with 'alpha' taken from the top byte of the blend pixel:
//   alpha == 0    - nothing is written to 'dst' (needed for the RMW
//                   optimization);
//   alpha == 255  - the blend pixel is copied to 'dst';
//   otherwise     - dst = blend + (255 - alpha) * src / 255, computed on
//                   two channels at a time.
//
// count must be greater than zero: the loop is bottom-tested.

VOID FASTCALL
ScanOperation::Blend_sRGB_sRGB(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(ARGB, ARGB)
    DEFINE_BLEND_POINTER(ARGB)

    ASSERT(count>0);

    do {
        const UINT32 blendPixel = *bl;
        const UINT32 alpha = blendPixel >> 24;

        if (alpha == 255)
        {
            // Opaque: the blend pixel replaces the destination.
            *d = blendPixel;
        }
        else if (alpha != 0)
        {
            const UINT32 srcPixel = *s;
            const ULONG invAlpha = 255 - alpha;

            // Split into alpha/green and red/blue pairs, each channel in
            // its own 16-bit lane, and scale both lanes with one multiply.
            // 0x0080 per lane is the rounding term.
            const ULONG alphaGreen = (srcPixel & 0xff00ff00) >> 8;
            const ULONG redBlue    = (srcPixel & 0x00ff00ff);

            ULONG scaledAG = alphaGreen * invAlpha + 0x00800080;
            ULONG scaledRB = redBlue * invAlpha + 0x00800080;

            // Divide by 255 via (x + (x >> 8)) >> 8; the final shift is
            // folded into the masks below.
            scaledAG += (scaledAG & 0xff00ff00) >> 8;
            scaledRB += (scaledRB & 0xff00ff00) >> 8;

            const ULONG partAG = scaledAG & 0xFF00FF00;
            const ULONG partRB = (scaledRB & 0xFF00FF00) >> 8;

            *d = blendPixel + partAG + partRB;
        }

        bl++;
        s++;
        d++;
    } while (--count != 0);
}

// Blend sRGB over sRGB using MMX, ignoring the non-linear gamma.
//
// Per pixel, with 'alpha' taken from the top byte of the 32-bit blend
// pixel:
//   alpha == 0     - the destination pixel is not written;
//   alpha == 0xFF  - the blend pixel is copied to 'dst';
//   otherwise      - each source channel is multiplied by (255 - alpha),
//                    rounded with 0x0080, divided by 255 using
//                    (x + (x >> 8)) >> 8, and the blend pixel is added.
//
// The loop is bottom-tested (dec ecx / jg), so one pixel is always
// processed; presumably callers pass count > 0, as the C version asserts.
//
// When _X86_ is not defined the body is empty and the function does
// nothing.

VOID FASTCALL
ScanOperation::Blend_sRGB_sRGB_MMX(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
#if defined(_X86_)
    using namespace sRGB;
    // NOTE(review): the pointers are declared as ARGB64 although the asm
    // below steps through 4-byte pixels. Only the pointer values are used
    // (loaded into esi/edi), so the element type has no effect here.
    DEFINE_POINTERS(ARGB64, ARGB64)
    const void *pbl=otherParams->BlendingScan;
    // Rounding term: 0x0080 in each of the four 16-bit lanes.
    static ULONGLONG halfMask=0x0080008000800080;
    // NOTE(review): dwBlendPixel is never referenced below.
    DWORD dwBlendPixel;

    _asm {
        mov        ecx,count                   ; ecx=pixel counter
        mov        ebx,pbl                     ; ebx=blend pixel pointer
        mov        esi,s                       ; esi=source pixel pointer
        mov        edi,d                       ; edi=dest pixel pointer
        pxor       mm7,mm7                     ; mm7=[0|0|0|0]
        movq       mm3,halfMask

main_loop:
        mov        eax,DWORD ptr [ebx]
        mov        edx,eax                     ; eax=blend pixel
        shr        edx,24                      ; edx=alpha
        cmp        edx,0                       ; For some reason, doing a jz right after a shr stalls
        jz         alpha_blend_done            ; if alpha=0, no blending

        cmp        edx,0xFF
        jne        alpha_blend
        mov        [edi],eax                   ; if alpha=0xFF, copy bl to dest
        jmp        alpha_blend_done

alpha_blend:
        movd       mm4,eax

        mov        eax,[esi]                   ; eax=source
        movd       mm0,eax                     ; mm0=[0|0|AR|GB]
        punpcklbw  mm0,mm7                     ; mm0=[A|R|G|B]

        xor        edx,0xFF                    ; C=255-Alpha
        movd       mm2,edx                     ; mm2=[0|0|0|C]
        punpcklwd  mm2,mm2                     ; mm2=[0|0|C|C]
        punpckldq  mm2,mm2                     ; mm2=[C|C|C|C]

        pmullw     mm0,mm2
        paddw      mm0,mm3                     ; mm0=[AA|RR|GG|BB]
        movq       mm2,mm0                     ; mm2=[AA|RR|GG|BB]

        psrlw      mm0,8                       ; mm0=[A|R|G|B]
        paddw      mm0,mm2                     ; mm0=[AA|RR|GG|BB]
        psrlw      mm0,8                       ; mm0=[A|R|G|B]

        packuswb   mm0,mm0                     ; mm0=[AR|GB|AR|GB]
        paddd      mm0,mm4                     ; Add the blend pixel
        movd       edx,mm0                     ; edx=[ARGB] -> result pixel
        mov        [edi],edx

alpha_blend_done:
        add        edi,4
        add        esi,4
        add        ebx,4
        dec        ecx
        jg         main_loop

        emms
    }
#endif
}

// Blend from sRGB64 to sRGB64.
//
// For each pixel, with 'alpha' taken from the blend pixel's 'a' field:
//   alpha == 0         - nothing is written to 'dst' (needed for the RMW
//                        optimization);
//   alpha == SRGB_ONE  - the blend pixel is copied to 'dst';
//   otherwise          - every channel, alpha included, becomes
//                        blend + ((src * (SRGB_ONE - alpha) + SRGB_HALF)
//                                 >> SRGB_FRACTIONBITS).
//
// Unlike the do/while-based blends, this loop handles count == 0.

VOID FASTCALL
ScanOperation::Blend_sRGB64_sRGB64(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(ARGB64, ARGB64)
    DEFINE_BLEND_POINTER(ARGB64)
    using namespace sRGB;

    for (; count--; bl++, s++, d++)
    {
        sRGB64Color blendPixel;
        blendPixel.argb = *bl;
        const INT16 alpha = blendPixel.a;

        if (alpha == 0)
        {
            // Transparent: leave the destination pixel unwritten.
            continue;
        }

        if (alpha == SRGB_ONE)
        {
            // Opaque: the blend pixel replaces the destination.
            *d = blendPixel.argb;
            continue;
        }

        //
        // Dst = Blend + (1-Alpha) * Src
        //

        const INT invAlpha = SRGB_ONE - alpha;

        sRGB64Color result;
        result.argb = *s;

        result.r = ((result.r * invAlpha + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.r;
        result.g = ((result.g * invAlpha + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.g;
        result.b = ((result.b * invAlpha + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.b;
        result.a = ((result.a * invAlpha + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.a;

        *d = result.argb;
    }
}

// Blend from sRGB64 to sRGB64 MMX.
//
// Per 8-byte pixel, with 'alpha' read as the signed 16-bit word at byte
// offset 6 of the blend pixel:
//   alpha == 0         - the destination pixel is not written;
//   alpha == SRGB_ONE  - the blend pixel is copied to 'dst';
//   otherwise          - each source channel is multiplied by
//                        C = SRGB_ONE - alpha, the scaled value is rebuilt
//                        from the low and high halves of the product, and
//                        the blend pixel is added.
//
// The rounding term is added to the low product word only (paddw), so a
// carry out of that word is not propagated into the high half.
//
// The loop is bottom-tested (dec ecx / jg), so one pixel is always
// processed, whereas the C version (while (count--)) handles count == 0.
//
// When _X86_ is not defined the body is empty and the function does
// nothing.

VOID FASTCALL
ScanOperation::Blend_sRGB64_sRGB64_MMX(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
#if defined(_X86_)
    using namespace sRGB;
    DEFINE_POINTERS(ARGB64, ARGB64)
    const void *pbl=otherParams->BlendingScan;
    // Rounding term: 0x1000 in each of the four 16-bit lanes.
    // NOTE(review): presumably equal to SRGB_HALF - confirm against the
    // sRGB namespace constants.
    static ULONGLONG ullSRGBHalfMask=0x1000100010001000;

    _asm {
        mov        ecx,count                   ; ecx=pixel counter
        mov        ebx,pbl                     ; ebx=blend pixel pointer
        mov        esi,s                       ; esi=source pixel pointer
        mov        edi,d                       ; edi=dest pixel pointer
        movq       mm4,ullSRGBHalfMask         ; mm4=mask with srgb half

main_loop:
        movsx      eax,word ptr [ebx+3*2]      ; eax=alpha
        or         eax,eax                     ; eax==0?
        jz         alpha_blend_done            ; if alpha=0, no blending

        movq       mm0,[ebx]                   ; mm0=blend pixel
        cmp        eax,SRGB_ONE                ; if alpha=SRGB_ONE, dest=blend
        jne        alpha_blend
        movq       [edi],mm0                   ; copy blend pixel to dest
        jmp        alpha_blend_done

alpha_blend:
        ; Get SRGB_ONE-Alpha
        neg        eax
        add        eax,SRGB_ONE                ; C=SRGB_ONE-Alpha
        movd       mm2, eax                    ; mm2=[0|0|0|C]
        punpcklwd  mm2, mm2
        punpckldq  mm2, mm2                    ; mm2=[C|C|C|C]

        ; Blend pixels
        movq       mm1,[esi]                   ; mm1=[A|R|G|B] source pixel
        movq       mm3,mm1                     ; mm3=[A|R|G|B] source pixel
        pmullw     mm1,mm2                     ; low word of source*C
        paddw      mm1,mm4                     ; add an srgb half for rounding
        psrlw      mm1,SRGB_FRACTIONBITS       ; truncate low SRGB_FRACTIONBITS
        pmulhw     mm3,mm2                     ; high word of source*C
        psllw      mm3,SRGB_INTEGERBITS        ; truncate high SRGB_INTEGERBITS
        por        mm1,mm3                     ; mm1=[A|R|G|B]
        paddw      mm1,mm0                     ; add blend pixel
        movq       [edi],mm1                   ; copy result to dest

alpha_blend_done:
        add        edi,8
        add        esi,8
        add        ebx,8

        dec        ecx
        jg         main_loop
        emms
    }
#endif
}


// Blend from sRGB to 16bpp 565, ignoring sRGB's non-linear gamma.
//
// Only the top 5 bits of the blend alpha are used (blendPixel >> 27), so
// blend alphas 0..7 are treated as fully transparent (nothing is written)
// and 248..255 as fully opaque (the converted blend pixel is stored).
// Otherwise the result is
//
//     blend565 + (31 - alpha5) * src / 31
//
// count must be greater than zero: the loop is bottom-tested.

VOID FASTCALL
ScanOperation::Blend_sRGB_565(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(UINT16, UINT16)
    DEFINE_BLEND_POINTER(ARGB)

    ASSERT(count>0);

    do {
        const UINT32 argb = *bl;
        const UINT32 alpha5 = argb >> 27;

        if (alpha5 != 0)
        {
            // Truncate the 32bpp blend pixel to 5-6-5. No rounding is
            // applied on this conversion.
            const UINT32 blend565 = ((argb >> 8) & 0xf800) |
                                    ((argb >> 5) & 0x07e0) |
                                    ((argb >> 3) & 0x001f);

            UINT32 result = blend565;

            if (alpha5 != 31)
            {
                const UINT32 srcPixel = (UINT32) *s;
                const UINT32 invAlpha = 31 - alpha5;

                // Red and blue share one multiply; 0x8010 adds the rounding
                // term (16) at the base of each field. Division by 31 is
                // approximated as (x + (x >> 5)) >> 5.
                UINT32 scaledRB = (srcPixel & 0xf81f) * invAlpha + 0x00008010;
                scaledRB += (scaledRB & 0x001f03e0) >> 5;
                const UINT32 fieldRB = (scaledRB >> 5) & 0xf81f;

                // Green has 6 bits, so the 5-bit factor is doubled and the
                // division uses shifts of 6; the last shift moves the
                // result back into bits 5..10.
                UINT32 scaledG = ((srcPixel & 0x7e0) >> 5) * 2 * invAlpha + 0x00000020;
                scaledG += (scaledG & 0x00000fc0) >> 6;
                const UINT32 fieldG = (scaledG & 0x0fc0) >> 1;

                result = (UINT16) ((fieldRB | fieldG) + blend565);
            }

            *d = (UINT16) result;
        }

        bl++;
        s++;
        d++;
    } while (--count != 0);
}

// Blend from sRGB to 16bpp 555, ignoring sRGB's non-linear gamma.
//
// Only the top 5 bits of the blend alpha are used (blendPixel >> 27), so
// blend alphas 0..7 are treated as fully transparent (nothing is written)
// and 248..255 as fully opaque (the converted blend pixel is stored).
// Otherwise the result is
//
//     blend555 + (31 - alpha5) * src / 31
//
// count must be greater than zero: the loop is bottom-tested.

VOID FASTCALL
ScanOperation::Blend_sRGB_555(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(UINT16, UINT16)
    DEFINE_BLEND_POINTER(ARGB)

    ASSERT(count>0);

    do {
        const UINT32 argb = *bl;
        const UINT32 alpha5 = argb >> 27;

        if (alpha5 != 0)
        {
            // Truncate the 32bpp blend pixel to 5-5-5. No rounding is
            // applied on this conversion.
            const UINT32 blend555 = ((argb & 0x00f80000) >> 9) |
                                    ((argb & 0x0000f800) >> 6) |
                                    ((argb & 0x000000f8) >> 3);

            UINT32 result = blend555;

            if (alpha5 != 31)
            {
                const UINT32 srcPixel = (UINT32) *s;
                const UINT32 invAlpha = 31 - alpha5;

                // Red and blue share one multiply; 0x4010 adds the rounding
                // term (16) at the base of each field. Division by 31 is
                // approximated as (x + (x >> 5)) >> 5.
                UINT32 scaledRB = (srcPixel & 0x7c1f) * invAlpha + 0x00004010;
                scaledRB += (scaledRB & 0x000f83e0) >> 5;
                const UINT32 fieldRB = (scaledRB >> 5) & 0x7c1f;

                // Green: same scheme on a single field. The final mask
                // leaves the result already positioned in bits 5..9.
                UINT32 scaledG = ((srcPixel & 0x3e0) >> 5) * invAlpha + 0x00000010;
                scaledG += (scaledG & 0x000003e0) >> 5;
                const UINT32 fieldG = scaledG & 0x03e0;

                result = (UINT16) ((fieldRB | fieldG) + blend555);
            }

            *d = (UINT16) result;
        }

        bl++;
        s++;
        d++;
    } while (--count != 0);
}

// Blend from sRGB to RGB24, ignoring sRGB's non-linear gamma.
//
// Destination byte order per pixel: d[0] = low byte of the ARGB value
// (blue), d[1] = green, d[2] = red.
//
// Fast path: while 'd' is 4-byte aligned and the next four blend pixels all
// have alpha 0xFF, their colour bytes are packed into three 32-bit stores.
// Otherwise pixels are handled one at a time:
//   alpha == 0    - nothing is written;
//   alpha == 255  - the blend pixel's colour bytes are stored;
//   otherwise     - dst = blend + (255 - alpha) * src / 255.
//
// count must be greater than zero: the outer loop is bottom-tested.

VOID FASTCALL
ScanOperation::Blend_sRGB_24(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(BYTE, BYTE)
    DEFINE_BLEND_POINTER(ARGB)

    ASSERT(count>0);

    do {

        if (((UINT_PTR) d & 0x3) == 0)
        {
            // Four opaque pixels (16 blend bytes) become 12 output bytes.
            for (; count >= 4; count -= 4, bl += 4, d += 12, s += 12)
            {
                const BYTE *quad = (const BYTE *) bl;

                if ((quad[3] & quad[7] & quad[11] & quad[15]) != 0xFF)
                {
                    break;
                }

                UINT32 *packed = (UINT32 *) d;

                packed[0] = (quad[4] << 24)  | (quad[2] << 16)  | (quad[1] << 8)  | quad[0];
                packed[1] = (quad[9] << 24)  | (quad[8] << 16)  | (quad[6] << 8)  | quad[5];
                packed[2] = (quad[14] << 24) | (quad[13] << 16) | (quad[12] << 8) | quad[10];
            }
        }

        // The fast path may have consumed the rest of the scan.
        if (count == 0)
        {
            break;
        }

        const UINT32 blendPixel = *bl;
        const UINT32 alpha = blendPixel >> 24;

        if (alpha != 0)
        {
            UINT32 result = blendPixel;

            if (alpha != 255)
            {
                // Dst = Blend + (1-Alpha) * Src

                const UINT32 invAlpha = 255 - alpha;

                // Green on its own; division by 255 is approximated as
                // (x + (x >> 8)) >> 8, with the last shift folded into the
                // mask so the value stays in bits 8..15.
                const UINT32 green = s[1];
                UINT32 scaledG = green * invAlpha + 0x00800080;
                scaledG += (scaledG & 0xff00ff00) >> 8;
                const UINT32 fieldG = scaledG & 0xFF00FF00;

                // Red (s[2]) and blue (s[0]) share one multiply.
                const UINT32 redBlue = s[0] | ((ULONG) s[2] << 16);
                UINT32 scaledRB = redBlue * invAlpha + 0x00800080;
                scaledRB += (scaledRB & 0xff00ff00) >> 8;
                const UINT32 fieldRB = (scaledRB & 0xFF00FF00) >> 8;

                result = (fieldG | fieldRB) + blendPixel;
            }

            d[0] = (BYTE) (result);
            d[1] = (BYTE) (result >> 8);
            d[2] = (BYTE) (result >> 16);
        }

        bl++;
        d += 3;
        s += 3;
    } while (--count != 0);
}

// Blend from sRGB to BGR24, ignoring sRGB's non-linear gamma.
//
// Byte order per pixel in this format, as established by the stores below:
// byte 0 = red (bits 16..23 of the ARGB value), byte 1 = green,
// byte 2 = blue (bits 0..7). 'src' uses the same format as 'dst'.
//
// Per pixel, with 'alpha' taken from the top byte of the blend pixel:
//   alpha == 0    - nothing is written (needed for the RMW optimization);
//   alpha == 255  - the blend pixel's colour bytes are stored;
//   otherwise     - dst = blend + (255 - alpha) * src / 255.
//
// count must be greater than zero: the loop is bottom-tested.

VOID FASTCALL
ScanOperation::Blend_sRGB_24BGR(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(BYTE, BYTE)
    DEFINE_BLEND_POINTER(ARGB)
    
    ASSERT(count>0);
    
    do {
        UINT32 blendPixel = *bl;
        UINT32 alpha = blendPixel >> 24;

        if (alpha != 0)
        {
            UINT32 dstPixel;

            if (alpha == 255)
            {
                dstPixel = blendPixel;
            }
            else
            {
                // Dst = Blend + (1-Alpha) * Src

                UINT32 multA = 255 - alpha;

                // Division by 255 is approximated as (x + (x >> 8)) >> 8;
                // 0x0080 per 16-bit lane is the rounding term.

                UINT32 D1_000000GG = *(s + 1);
                UINT32 D2_0000GGGG = D1_000000GG * multA + 0x00800080;
                UINT32 D3_000000GG = (D2_0000GGGG & 0xff00ff00) >> 8;
                UINT32 D4_0000GG00 = (D2_0000GGGG + D3_000000GG) & 0xFF00FF00;

                // Red is at byte 0 and blue at byte 2 in this format, so
                // red goes into the high lane and blue into the low lane,
                // matching the ARGB layout of blendPixel and the stores
                // below. (Reading them the other way round would swap the
                // attenuated source red and blue.)

                UINT32 D1_00RR00BB = *(s + 2) | (ULONG) *(s) << 16;
                UINT32 D2_RRRRBBBB = D1_00RR00BB * multA + 0x00800080;
                UINT32 D3_00RR00BB = (D2_RRRRBBBB & 0xff00ff00) >> 8;
                UINT32 D4_00RR00BB = ((D2_RRRRBBBB + D3_00RR00BB) & 0xFF00FF00) >> 8;

                dstPixel = (D4_0000GG00 | D4_00RR00BB) + blendPixel;
            }

            *(d)     = (BYTE) (dstPixel >> 16);
            *(d + 1) = (BYTE) (dstPixel >> 8);
            *(d + 2) = (BYTE) (dstPixel);
        }

        bl++;
        d += 3;
        s += 3;
    } while (--count != 0);
}

/*

!!![agodfrey]
So we're going to move to standardizing on non-premultiplied alpha.
When we do, the above routines will all have to change - but we may
want to keep the above versions around too.

Below, I've implemented the sRGB and sRGB64 versions for a non-premultiplied
source. Now, these really blend from a non-premultiplied source, 
to a pre-multiplied destination. You can see this from the fact that they 
are equivalent to combining the above pre-multiplied Blends with an
AlphaMultiply step on the source data.

Since pre-multiplied and non-premultiplied formats are identical for alpha==1,
the functions below work fine when the destination has no alpha (i.e. alpha==1).

Otherwise, we can use them when the destination is in premultiplied format.
If we somehow let the user draw to such a destination, they can use an off-screen
premultiplied buffer to accumulate drawing, and then using a
pre-multiplied blend, draw that to the final destination. This gives them
the same functionality that standardizing on pre-multiplied alpha is supposed
to give.

// Blend sRGB over sRGB, ignoring the non-linear gamma.
//
// NOTE: this routine lives inside a disabled (block-commented) region of the
// file, so only '//' comments may be used within it.
//
// Non-premultiplied variant of the SrcOver blend. For each pixel:
//
//     Dst.rgb   = Dst.rgb   * (1-Alpha) + Blend.rgb * Alpha
//     Dst.alpha = Dst.alpha * (1-Alpha) + Alpha
//
// where Alpha is the alpha channel of the blending pixel.
//
// Arguments:
//
//   dst         - The destination scan
//   src         - The source scan (usually equal to dst)
//   count       - The length of the scan, in pixels; must be > 0
//   otherParams - Additional data. (We use BlendingScan.)
//
// Notes:
//
//   If the blending alpha is zero, the destination pixel is not written.
//   's', 'd' and 'bl' are presumably declared by DEFINE_POINTERS and
//   DEFINE_BLEND_POINTER - confirm against the macro definitions.

VOID FASTCALL
ScanOperation::Blend_sRGB_sRGB(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(ARGB, ARGB)
    DEFINE_BLEND_POINTER(ARGB)
    
    // The do/while below always executes at least once.
    
    ASSERT(count>0);

    do {
        UINT32 blendPixel = *bl;
        UINT32 alpha = blendPixel >> 24;

        // If alpha is zero, skip everything, including writing the
        // destination pixel. This is needed for the RMW optimization.
        
        if (alpha != 0)
        {
            UINT32 dstPixel;

            if (alpha == 255)
            {
                // Fully opaque: the blend pixel replaces the destination.
                
                dstPixel = blendPixel;
            }
            else
            {
                // Dst = Dst * (1-Alpha) + Src * Alpha
                //
                // Two channels are processed per 32-bit word, each channel
                // occupying its own 16-bit lane.
                
                dstPixel = *s;

                ULONG invalpha = 255 - alpha;
                
                ULONG _D1_00AA00GG = (dstPixel & 0xff00ff00) >> 8;
                ULONG _D1_00RR00BB = (dstPixel & 0x00ff00ff);
                
                // For the alpha channel, the result we want is this:
                //
                //     Dst = Dst * (1-Alpha) + Src.
                //
                // Or equivalently:
                //
                //     Dst = Dst * (1-Alpha) + Alpha.
                //                
                // We want to apply the same operations to the alpha channel as
                // we do to the others. So, to get the above result from
                //
                //     Dst = Dst * (1-Alpha) + Src * Alpha
                //
                // we fake a 'Src' value of 1 (represented by 255).
                //
                // Only the green channel is taken from the blend pixel here;
                // the alpha lane is forced to 0xff. The mask, shift and OR
                // are parenthesized explicitly because '+' binds tighter
                // than '>>'.
                
                ULONG _S1_00ff00GG = ((blendPixel & 0x0000ff00) >> 8) | 0x00ff0000;
                ULONG _S1_00RR00BB = (blendPixel & 0x00ff00ff);

                // Weighted sum per lane, plus 0x80 per lane for rounding.
                // Each lane is at most 255*255 + 128, so it fits in 16 bits.
                
                ULONG _D2_AAAAGGGG = _D1_00AA00GG * invalpha + 
                                     _S1_00ff00GG * alpha +
                                     0x00800080;
                ULONG _D2_RRRRBBBB = _D1_00RR00BB * invalpha + 
                                     _S1_00RR00BB * alpha + 
                                     0x00800080;

                // Divide each lane by 255 using (x + (x >> 8)) >> 8.
                
                ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
                ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;

                ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
                ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;

                // The two halves occupy disjoint bytes, so '+' acts as a merge.
                
                dstPixel = _D4_AA00GG00 + _D4_00RR00BB;
            }

            *d = dstPixel;
        }

        bl++;
        s++;
        d++;
    } while (--count != 0);
}

// Blend from sRGB64 to sRGB64.
//
// NOTE: this routine lives inside a disabled (block-commented) region of the
// file, so only '//' comments may be used within it.
//
// For each pixel of the scan, the blending pixel is combined with the
// source pixel and the result is stored to the destination:
//
//     Dst.rgb   = Src.rgb   * (1-Alpha) + Blend.rgb * Alpha
//     Dst.alpha = Src.alpha * (1-Alpha) + Alpha
//
// Arguments:
//
//   dst         - The destination scan
//   src         - The source scan (usually equal to dst)
//   count       - The length of the scan, in pixels
//   otherParams - Additional data. (We use BlendingScan.)
//
// Notes:
//
//   A blending pixel with zero alpha leaves the destination untouched.
//   's', 'd' and 'bl' are presumably declared by DEFINE_POINTERS and
//   DEFINE_BLEND_POINTER - confirm against the macro definitions.

VOID FASTCALL
ScanOperation::Blend_sRGB64_sRGB64(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
    DEFINE_POINTERS(ARGB64, ARGB64)
    DEFINE_BLEND_POINTER(ARGB64)
    using namespace sRGB;

    // All three scan pointers advance together, once per pixel, regardless
    // of which path the pixel takes below.

    for (; count != 0; --count, ++bl, ++s, ++d)
    {
        sRGB64Color over;
        over.argb = *bl;

        const INT coverage = over.a;

        // Transparent blend pixel: write nothing at all. The RMW
        // optimization depends on the destination not being touched.

        if (coverage == 0)
        {
            continue;
        }

        // Opaque blend pixel: it simply replaces the destination.

        if (coverage == SRGB_ONE)
        {
            *d = over.argb;
            continue;
        }

        // Partial coverage: Dst = Dst * (1-Alpha) + Src * Alpha

        sRGB64Color under;
        under.argb = *s;

        const INT remainder = SRGB_ONE - coverage;

        under.r = ((under.r * remainder) + (over.r * coverage) + SRGB_HALF)
                  >> SRGB_FRACTIONBITS;
        under.g = ((under.g * remainder) + (over.g * coverage) + SRGB_HALF)
                  >> SRGB_FRACTIONBITS;
        under.b = ((under.b * remainder) + (over.b * coverage) + SRGB_HALF)
                  >> SRGB_FRACTIONBITS;

        // The alpha channel adds the blend alpha itself rather than
        // alpha * alpha.

        under.a = (((under.a * remainder) + SRGB_HALF) >> SRGB_FRACTIONBITS)
                  + over.a;

        *d = under.argb;
    }
}

*/