windows-server-2003/windows/advcore/gdiplus/engine/render/soblend.cpp


								/**************************************************************************\

								*

								* Copyright (c) 1999-2000  Microsoft Corporation

								*

								* Module name:

								*

								*   The "Blend" scan operation.

								*

								* Abstract:

								*

								*   See Gdiplus\Specs\ScanOperation.doc for an overview.

								*

								* Notes:

								*

								* Revision History:

								*

								*   12/07/1999 agodfrey

								*       Created it.

								*

								\**************************************************************************/


								#include "precomp.hpp"


								/**************************************************************************\

								*

								* Operation Description:

								*

								*   Blend: Does a SrcOver alpha-blend operation.

								*

								* Arguments:

								*

								*   dst         - The destination scan

								*   src         - The source scan (usually equal to dst).

								*   count       - The length of the scan, in pixels

								*   otherParams - Additional data. (We use BlendingScan.)

								*

								* Return Value:

								*

								*   None

								*

								* Notes:

								*

								*   This is a ternary operation. We take pixels from 'src', blend pixels

								*   from 'otherParams->BlendingScan' over them, and write the result to 'dst'.

								*

								*   Since the formats of the 'dst' and 'src' scans are the same for all

								*   the blend functions we implement, the naming is simplified to list just

								*   the format of BlendingScan, then the format of 'dst'.

								*

								*   src and dst may be equal; otherwise, they must point to scans which do

								*   not overlap in memory.

								*

								*   The blend operation adheres to the following rule:

								*   "If the blending alpha value is zero, do not write the destination pixel."

								*

								*   In other words, it is also a 'WriteRMW' operation. This allows us to

								*   avoid a separate 'WriteRMW' step in some cases. See SOReadRMW.cpp and

								*   SOWriteRMW.cpp.

								*

								*   The impact of this is that you have to be careful if you want 'blend'

								*   to be a true ternary operation. Remember, if a blend pixel

								*   is transparent, NOTHING gets written to the corresponding destination

								*   pixel. One way to solve this is to make sure that the final operation in

								*   your pipeline is a WriteRMW operation.

								*

								* History:

								*

								*   04/04/1999 andrewgo

								*       Created it.

								*   12/07/1999 agodfrey

								*       Included the 32bpp blend (moved from Ddi/scan.cpp)

								*   01/06/2000 agodfrey

								*       Added AndrewGo's code for 565, 555, RGB24 and BGR24. Changed the

								*       blends to be 'almost' ternary operations.

								*

								\**************************************************************************/


								// BlendLinear_sRGB_32RGB
								//
								// Blends otherParams->BlendingScan over 'src' and writes the result to
								// 'dst', working through the scan one run of pixels at a time:
								//
								//   * a run of blend pixels for which isTranslucent() is true is blended
								//     in sRGB64 (via the GammaConvert / AlphaDivide / AlphaMultiply /
								//     Blend_sRGB64_sRGB64 helpers) and converted back into 'dst';
								//   * blend pixels whose alpha byte is 0xFF are copied to 'dst' as-is;
								//   * blend pixels whose alpha byte is 0x00 are skipped, leaving 'dst'
								//     untouched.
								//
								// Arguments:
								//   dst         - destination scan (ARGB pixels)
								//   src         - source scan (ARGB pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan and the scratch buffers
								//                 TempBuffers[0..2]
								VOID FASTCALL
								ScanOperation::BlendLinear_sRGB_32RGB(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(ARGB, ARGB)
								    DEFINE_BLEND_POINTER(ARGB)
								    using namespace sRGB;

								    void *blendWork  = otherParams->TempBuffers[0];
								    void *sourceWork = otherParams->TempBuffers[1];
								    void *gammaWork  = otherParams->TempBuffers[2];

								    // Private copy of the parameters, so that BlendingScan can be pointed
								    // at a scratch buffer without touching the caller's structure.
								    OtherParams scratchParams = *otherParams;

								    while (count > 0)
								    {
								        // Measure the run of translucent blend pixels that starts here.
								        int runLength = 0;
								        while ((runLength < count) &&
								               isTranslucent(*((ARGB*)(bl + runLength))))
								        {
								            runLength++;
								        }

								        if (runLength > 0)
								        {
								            // Source pixels, converted to sRGB64.
								            GammaConvert_sRGB_sRGB64(sourceWork, s, runLength, otherParams);

								            // Blend pixels: alpha-divide, convert to sRGB64, alpha-multiply.
								            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
								            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

								            // Blend in place, with the converted pixels acting as the
								            // blending scan, then convert the result into the destination.
								            scratchParams.BlendingScan = blendWork;
								            Blend_sRGB64_sRGB64(sourceWork, sourceWork, runLength, &scratchParams);
								            GammaConvert_sRGB64_sRGB(d, sourceWork, runLength, otherParams);

								            count -= runLength;
								            d     += runLength;
								            bl    += runLength;
								            s     += runLength;
								            continue;
								        }

								        // Alpha byte 0xFF: copy the blend pixel straight through.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0xFF); count--, s++)
								        {
								            *d++ = *bl++;
								        }

								        // Alpha byte 0x00: step over the pixel without writing to it.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0x00); count--)
								        {
								            d++;
								            bl++;
								            s++;
								        }
								    }
								}


								// BlendLinear_sRGB_32RGB_MMX
								//
								// Same run-by-run structure as BlendLinear_sRGB_32RGB, except that the
								// sRGB64 blend step is done by Blend_sRGB64_sRGB64_MMX:
								//
								//   * a run of blend pixels for which isTranslucent() is true is blended
								//     in sRGB64 and converted back into 'dst';
								//   * blend pixels whose alpha byte is 0xFF are copied to 'dst' as-is;
								//   * blend pixels whose alpha byte is 0x00 are skipped, leaving 'dst'
								//     untouched.
								//
								// Arguments:
								//   dst         - destination scan (ARGB pixels)
								//   src         - source scan (ARGB pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan and the scratch buffers
								//                 TempBuffers[0..2]
								VOID FASTCALL
								ScanOperation::BlendLinear_sRGB_32RGB_MMX(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(ARGB, ARGB)
								    DEFINE_BLEND_POINTER(ARGB)
								    using namespace sRGB;

								    void *blendWork  = otherParams->TempBuffers[0];
								    void *sourceWork = otherParams->TempBuffers[1];
								    void *gammaWork  = otherParams->TempBuffers[2];

								    // Private copy of the parameters, so that BlendingScan can be pointed
								    // at a scratch buffer without touching the caller's structure.
								    OtherParams scratchParams = *otherParams;

								    while (count > 0)
								    {
								        // Measure the run of translucent blend pixels that starts here.
								        int runLength = 0;
								        while ((runLength < count) &&
								               isTranslucent(*((ARGB*)(bl + runLength))))
								        {
								            runLength++;
								        }

								        if (runLength > 0)
								        {
								            // Source pixels, converted to sRGB64.
								            GammaConvert_sRGB_sRGB64(sourceWork, s, runLength, otherParams);

								            // Blend pixels: alpha-divide, convert to sRGB64, alpha-multiply.
								            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
								            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

								            // Blend in place, with the converted pixels acting as the
								            // blending scan, then convert the result into the destination.
								            scratchParams.BlendingScan = blendWork;
								            Blend_sRGB64_sRGB64_MMX(sourceWork, sourceWork, runLength, &scratchParams);
								            GammaConvert_sRGB64_sRGB(d, sourceWork, runLength, otherParams);

								            count -= runLength;
								            d     += runLength;
								            bl    += runLength;
								            s     += runLength;
								            continue;
								        }

								        // Alpha byte 0xFF: copy the blend pixel straight through.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0xFF); count--, s++)
								        {
								            *d++ = *bl++;
								        }

								        // Alpha byte 0x00: step over the pixel without writing to it.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0x00); count--)
								        {
								            d++;
								            bl++;
								            s++;
								        }
								    }
								}


								// BlendLinear_sRGB_565
								//
								// Blends otherParams->BlendingScan (ARGB pixels) over a 16bpp 'src' scan
								// and writes 16bpp pixels to 'dst', one run at a time:
								//
								//   * a run of blend pixels for which isTranslucent() is true has its
								//     source converted with Convert_565_sRGB, is blended in sRGB64, and
								//     is written back through Dither_sRGB_565;
								//   * a run of blend pixels whose alpha byte is 0xFF is written directly
								//     with Dither_sRGB_565;
								//   * blend pixels whose alpha byte is 0x00 are skipped, leaving 'dst'
								//     untouched.
								//
								// Arguments:
								//   dst         - destination scan (UINT16 pixels)
								//   src         - source scan (UINT16 pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan and the scratch buffers
								//                 TempBuffers[0..2]
								VOID FASTCALL
								ScanOperation::BlendLinear_sRGB_565(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(UINT16,UINT16)
								    DEFINE_BLEND_POINTER(ARGB)
								    using namespace sRGB;

								    void *blendWork  = otherParams->TempBuffers[0];
								    void *sourceWork = otherParams->TempBuffers[1];
								    void *gammaWork  = otherParams->TempBuffers[2];

								    // Private copy of the parameters, so that BlendingScan can be pointed
								    // at a scratch buffer without touching the caller's structure.
								    OtherParams scratchParams = *otherParams;

								    while (count > 0)
								    {
								        // Measure the run of translucent blend pixels that starts here.
								        int runLength = 0;
								        while ((runLength < count) &&
								               isTranslucent(*((ARGB*)(bl + runLength))))
								        {
								            runLength++;
								        }

								        if (runLength > 0)
								        {
								            // Source pixels: 565 -> sRGB -> sRGB64.
								            Convert_565_sRGB(gammaWork, s, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(sourceWork, gammaWork, runLength, otherParams);

								            // Blend pixels: alpha-divide, convert to sRGB64, alpha-multiply.
								            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
								            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

								            // Blend in place, convert back to sRGB, then dither to 565.
								            scratchParams.BlendingScan = blendWork;
								            Blend_sRGB64_sRGB64(sourceWork, sourceWork, runLength, &scratchParams);
								            GammaConvert_sRGB64_sRGB(gammaWork, sourceWork, runLength, otherParams);

								            Dither_sRGB_565(d, gammaWork, runLength, otherParams);

								            count -= runLength;
								            d     += runLength;
								            bl    += runLength;
								            s     += runLength;
								            continue;
								        }

								        // Alpha byte 0xFF: gather the whole run and dither it in one call.
								        int opaqueLength = 0;
								        while ((opaqueLength < count) &&
								               (((*((DWORD*)bl + opaqueLength)) >> 24) == 0xFF))
								        {
								            opaqueLength++;
								        }

								        if (opaqueLength > 0)
								        {
								            Dither_sRGB_565(d, bl, opaqueLength, otherParams);

								            count -= opaqueLength;
								            d     += opaqueLength;
								            bl    += opaqueLength;
								            s     += opaqueLength;
								        }

								        // Alpha byte 0x00: step over the pixel without writing to it.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0x00); count--)
								        {
								            d++;
								            bl++;
								            s++;
								        }
								    }
								}


								// BlendLinear_sRGB_565_MMX
								//
								// Same run-by-run structure as BlendLinear_sRGB_565, except that the
								// blend and dither steps use Blend_sRGB64_sRGB64_MMX and
								// Dither_sRGB_565_MMX:
								//
								//   * a run of blend pixels for which isTranslucent() is true has its
								//     source converted with Convert_565_sRGB, is blended in sRGB64, and
								//     is written back through the dither routine;
								//   * a run of blend pixels whose alpha byte is 0xFF is dithered
								//     directly into 'dst';
								//   * blend pixels whose alpha byte is 0x00 are skipped, leaving 'dst'
								//     untouched.
								//
								// Arguments:
								//   dst         - destination scan (UINT16 pixels)
								//   src         - source scan (UINT16 pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan and the scratch buffers
								//                 TempBuffers[0..2]
								VOID FASTCALL
								ScanOperation::BlendLinear_sRGB_565_MMX(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(UINT16,UINT16)
								    DEFINE_BLEND_POINTER(ARGB)
								    using namespace sRGB;

								    void *blendWork  = otherParams->TempBuffers[0];
								    void *sourceWork = otherParams->TempBuffers[1];
								    void *gammaWork  = otherParams->TempBuffers[2];

								    // Private copy of the parameters, so that BlendingScan can be pointed
								    // at a scratch buffer without touching the caller's structure.
								    OtherParams scratchParams = *otherParams;

								    while (count > 0)
								    {
								        // Measure the run of translucent blend pixels that starts here.
								        int runLength = 0;
								        while ((runLength < count) &&
								               isTranslucent(*((ARGB*)(bl + runLength))))
								        {
								            runLength++;
								        }

								        if (runLength > 0)
								        {
								            // Source pixels: 565 -> sRGB -> sRGB64.
								            Convert_565_sRGB(gammaWork, s, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(sourceWork, gammaWork, runLength, otherParams);

								            // Blend pixels: alpha-divide, convert to sRGB64, alpha-multiply.
								            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
								            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

								            // Blend in place, convert back to sRGB, then dither to 565.
								            scratchParams.BlendingScan = blendWork;
								            Blend_sRGB64_sRGB64_MMX(sourceWork, sourceWork, runLength, &scratchParams);
								            GammaConvert_sRGB64_sRGB(gammaWork, sourceWork, runLength, otherParams);

								            Dither_sRGB_565_MMX(d, gammaWork, runLength, otherParams);

								            count -= runLength;
								            d     += runLength;
								            bl    += runLength;
								            s     += runLength;
								            continue;
								        }

								        // Alpha byte 0xFF: gather the whole run and dither it in one call.
								        int opaqueLength = 0;
								        while ((opaqueLength < count) &&
								               (((*((DWORD*)bl + opaqueLength)) >> 24) == 0xFF))
								        {
								            opaqueLength++;
								        }

								        if (opaqueLength > 0)
								        {
								            Dither_sRGB_565_MMX(d, bl, opaqueLength, otherParams);

								            count -= opaqueLength;
								            d     += opaqueLength;
								            bl    += opaqueLength;
								            s     += opaqueLength;
								        }

								        // Alpha byte 0x00: step over the pixel without writing to it.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0x00); count--)
								        {
								            d++;
								            bl++;
								            s++;
								        }
								    }
								}


								// BlendLinear_sRGB_555
								//
								// Blends otherParams->BlendingScan (ARGB pixels) over a 16bpp 'src' scan
								// and writes 16bpp pixels to 'dst', one run at a time:
								//
								//   * a run of blend pixels for which isTranslucent() is true has its
								//     source converted with Convert_555_sRGB, is blended in sRGB64, and
								//     is written back through Dither_sRGB_555;
								//   * a run of blend pixels whose alpha byte is 0xFF is written directly
								//     with Dither_sRGB_555;
								//   * blend pixels whose alpha byte is 0x00 are skipped, leaving 'dst'
								//     untouched.
								//
								// Arguments:
								//   dst         - destination scan (UINT16 pixels)
								//   src         - source scan (UINT16 pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan and the scratch buffers
								//                 TempBuffers[0..2]
								VOID FASTCALL
								ScanOperation::BlendLinear_sRGB_555(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(UINT16,UINT16)
								    DEFINE_BLEND_POINTER(ARGB)
								    using namespace sRGB;

								    void *blendWork  = otherParams->TempBuffers[0];
								    void *sourceWork = otherParams->TempBuffers[1];
								    void *gammaWork  = otherParams->TempBuffers[2];

								    // Private copy of the parameters, so that BlendingScan can be pointed
								    // at a scratch buffer without touching the caller's structure.
								    OtherParams scratchParams = *otherParams;

								    while (count > 0)
								    {
								        // Measure the run of translucent blend pixels that starts here.
								        int runLength = 0;
								        while ((runLength < count) &&
								               isTranslucent(*((ARGB*)(bl + runLength))))
								        {
								            runLength++;
								        }

								        if (runLength > 0)
								        {
								            // Source pixels: 555 -> sRGB -> sRGB64.
								            Convert_555_sRGB(gammaWork, s, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(sourceWork, gammaWork, runLength, otherParams);

								            // Blend pixels: alpha-divide, convert to sRGB64, alpha-multiply.
								            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
								            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

								            // Blend in place, convert back to sRGB, then dither to 555.
								            scratchParams.BlendingScan = blendWork;
								            Blend_sRGB64_sRGB64(sourceWork, sourceWork, runLength, &scratchParams);
								            GammaConvert_sRGB64_sRGB(gammaWork, sourceWork, runLength, otherParams);

								            Dither_sRGB_555(d, gammaWork, runLength, otherParams);

								            count -= runLength;
								            d     += runLength;
								            bl    += runLength;
								            s     += runLength;
								            continue;
								        }

								        // Alpha byte 0xFF: gather the whole run and dither it in one call.
								        int opaqueLength = 0;
								        while ((opaqueLength < count) &&
								               (((*((DWORD*)bl + opaqueLength)) >> 24) == 0xFF))
								        {
								            opaqueLength++;
								        }

								        if (opaqueLength > 0)
								        {
								            Dither_sRGB_555(d, bl, opaqueLength, otherParams);

								            count -= opaqueLength;
								            d     += opaqueLength;
								            bl    += opaqueLength;
								            s     += opaqueLength;
								        }

								        // Alpha byte 0x00: step over the pixel without writing to it.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0x00); count--)
								        {
								            d++;
								            bl++;
								            s++;
								        }
								    }
								}


								// BlendLinear_sRGB_555_MMX
								//
								// Same run-by-run structure as BlendLinear_sRGB_555, except that the
								// blend and dither steps use Blend_sRGB64_sRGB64_MMX and
								// Dither_sRGB_555_MMX:
								//
								//   * a run of blend pixels for which isTranslucent() is true has its
								//     source converted with Convert_555_sRGB, is blended in sRGB64, and
								//     is written back through the dither routine;
								//   * a run of blend pixels whose alpha byte is 0xFF is dithered
								//     directly into 'dst';
								//   * blend pixels whose alpha byte is 0x00 are skipped, leaving 'dst'
								//     untouched.
								//
								// Arguments:
								//   dst         - destination scan (UINT16 pixels)
								//   src         - source scan (UINT16 pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan and the scratch buffers
								//                 TempBuffers[0..2]
								VOID FASTCALL
								ScanOperation::BlendLinear_sRGB_555_MMX(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(UINT16,UINT16)
								    DEFINE_BLEND_POINTER(ARGB)
								    using namespace sRGB;

								    void *blendWork  = otherParams->TempBuffers[0];
								    void *sourceWork = otherParams->TempBuffers[1];
								    void *gammaWork  = otherParams->TempBuffers[2];

								    // Private copy of the parameters, so that BlendingScan can be pointed
								    // at a scratch buffer without touching the caller's structure.
								    OtherParams scratchParams = *otherParams;

								    while (count > 0)
								    {
								        // Measure the run of translucent blend pixels that starts here.
								        int runLength = 0;
								        while ((runLength < count) &&
								               isTranslucent(*((ARGB*)(bl + runLength))))
								        {
								            runLength++;
								        }

								        if (runLength > 0)
								        {
								            // Source pixels: 555 -> sRGB -> sRGB64.
								            Convert_555_sRGB(gammaWork, s, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(sourceWork, gammaWork, runLength, otherParams);

								            // Blend pixels: alpha-divide, convert to sRGB64, alpha-multiply.
								            AlphaDivide_sRGB(blendWork, bl, runLength, otherParams);
								            GammaConvert_sRGB_sRGB64(gammaWork, blendWork, runLength, otherParams);
								            AlphaMultiply_sRGB64(blendWork, gammaWork, runLength, otherParams);

								            // Blend in place, convert back to sRGB, then dither to 555.
								            scratchParams.BlendingScan = blendWork;
								            Blend_sRGB64_sRGB64_MMX(sourceWork, sourceWork, runLength, &scratchParams);
								            GammaConvert_sRGB64_sRGB(gammaWork, sourceWork, runLength, otherParams);

								            Dither_sRGB_555_MMX(d, gammaWork, runLength, otherParams);

								            count -= runLength;
								            d     += runLength;
								            bl    += runLength;
								            s     += runLength;
								            continue;
								        }

								        // Alpha byte 0xFF: gather the whole run and dither it in one call.
								        int opaqueLength = 0;
								        while ((opaqueLength < count) &&
								               (((*((DWORD*)bl + opaqueLength)) >> 24) == 0xFF))
								        {
								            opaqueLength++;
								        }

								        if (opaqueLength > 0)
								        {
								            Dither_sRGB_555_MMX(d, bl, opaqueLength, otherParams);

								            count -= opaqueLength;
								            d     += opaqueLength;
								            bl    += opaqueLength;
								            s     += opaqueLength;
								        }

								        // Alpha byte 0x00: step over the pixel without writing to it.
								        for (; (count > 0) && (((*((DWORD*)bl)) >> 24) == 0x00); count--)
								        {
								            d++;
								            bl++;
								            s++;
								        }
								    }
								}


								// Blend sRGB over sRGB, ignoring the non-linear gamma.
								//
								// For every pixel of otherParams->BlendingScan:
								//   alpha == 0   : nothing is written to 'dst' (needed for the RMW
								//                  optimization);
								//   alpha == 255 : the blend pixel is copied to 'dst';
								//   otherwise    : dst = blend + (255 - alpha) * src / 255, computed per
								//                  8-bit channel with rounding.
								//
								// Arguments:
								//   dst         - destination scan (ARGB pixels)
								//   src         - source scan (ARGB pixels)
								//   count       - scan length in pixels; expected to be > 0
								//   otherParams - supplies BlendingScan

								VOID FASTCALL
								ScanOperation::Blend_sRGB_sRGB(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(ARGB, ARGB)
								    DEFINE_BLEND_POINTER(ARGB)

								    ASSERT(count>0);

								    // The loop is bounded by 'count > 0' rather than a do/while on
								    // '--count != 0', so that a zero or negative count does no work (and
								    // touches no memory) even if the ASSERT above is compiled out.

								    for (; count > 0; count--)
								    {
								        UINT32 blendPixel = *bl;
								        UINT32 alpha = blendPixel >> 24;

								        // If alpha is zero, skip everything, including writing the
								        // destination pixel. This is needed for the RMW optimization.

								        if (alpha != 0)
								        {
								            UINT32 dstPixel;

								            if (alpha == 255)
								            {
								                dstPixel = blendPixel;
								            }
								            else
								            {
								                //
								                // Dst = B + (1-Alpha) * S
								                //
								                // Two channels are processed per 32-bit multiply: alpha
								                // and green in one word, red and blue in the other. Each
								                // product t = channel * Multa + 0x80 is turned into
								                // (t + (t >> 8)) >> 8, i.e. a rounded divide by 255.
								                //

								                dstPixel = *s;

								                ULONG Multa = 255 - alpha;
								                ULONG _D1_00AA00GG = (dstPixel & 0xff00ff00) >> 8;
								                ULONG _D1_00RR00BB = (dstPixel & 0x00ff00ff);

								                ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080;
								                ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080;

								                ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
								                ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;

								                ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
								                ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;

								                dstPixel = blendPixel + _D4_AA00GG00 + _D4_00RR00BB;
								            }

								            *d = dstPixel;
								        }

								        bl++;
								        s++;
								        d++;
								    }
								}


								// Blend sRGB over sRGB using MMX, ignoring the non-linear gamma.
								//
								// Per pixel of otherParams->BlendingScan:
								//   alpha == 0    : the destination pixel is not written;
								//   alpha == 0xFF : the blend pixel is copied to 'dst';
								//   otherwise     : each 8-bit source channel is multiplied by
								//                   (255 - alpha), rounded, and added to the blend pixel.
								//
								// Arguments:
								//   dst         - destination scan (32 bits per pixel)
								//   src         - source scan (32 bits per pixel)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan
								//
								// The body only exists when _X86_ is defined; on other builds the
								// function compiles to an empty body.
								VOID FASTCALL
								ScanOperation::Blend_sRGB_sRGB_MMX(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								#if defined(_X86_)
								    using namespace sRGB;
								    // The scans hold 32bpp pixels (the loop below steps by 4 bytes), so
								    // the pointers are declared as ARGB. Only their addresses are used.
								    DEFINE_POINTERS(ARGB, ARGB)
								    const void *pbl=otherParams->BlendingScan;
								    static ULONGLONG halfMask=0x0080008000800080;

								    // The assembly loop tests its counter after processing a pixel, so
								    // it must never be entered with an empty scan.
								    if (count <= 0)
								    {
								        return;
								    }

								    _asm {
								        mov        ecx,count                   ; ecx=pixel counter
								        mov        ebx,pbl                     ; ebx=blend pixel pointer
								        mov        esi,s                       ; esi=source pixel pointer
								        mov        edi,d                       ; edi=dest pixel pointer
								        pxor       mm7,mm7                     ; mm7=[0|0|0|0]
								        movq       mm3,halfMask

								main_loop:
								        mov        eax,DWORD ptr [ebx]
								        mov        edx,eax                     ; eax=blend pixel
								        shr        edx,24                      ; edx=alpha
								        cmp        edx,0                       ; For some reason, doing a jz right after a shr stalls
								        jz         alpha_blend_done            ; if alpha=0, no blending

								        cmp        edx,0xFF
								        jne        alpha_blend
								        mov        [edi],eax                   ; if alpha=0xFF, copy bl to dest
								        jmp        alpha_blend_done

								alpha_blend:
								        movd       mm4,eax

								        mov        eax,[esi]                   ; eax=source
								        movd       mm0,eax                     ; mm0=[0|0|AR|GB]
								        punpcklbw  mm0,mm7                     ; mm0=[A|R|G|B]

								        xor        edx,0xFF                    ; C=255-Alpha
								        movd       mm2,edx                     ; mm2=[0|0|0|C]
								        punpcklwd  mm2,mm2                     ; mm2=[0|0|C|C]
								        punpckldq  mm2,mm2                     ; mm2=[C|C|C|C]

								        pmullw     mm0,mm2
								        paddw      mm0,mm3                     ; mm0=[AA|RR|GG|BB]
								        movq       mm2,mm0                     ; mm2=[AA|RR|GG|BB]

								        psrlw      mm0,8                       ; mm0=[A|R|G|B]
								        paddw      mm0,mm2                     ; mm0=[AA|RR|GG|BB]
								        psrlw      mm0,8                       ; mm0=[A|R|G|B]

								        packuswb   mm0,mm0                     ; mm0=[AR|GB|AR|GB]
								        paddd      mm0,mm4                     ; Add the blend pixel
								        movd       edx,mm0                     ; edx=[ARGB] -> result pixel
								        mov        [edi],edx

								alpha_blend_done:
								        add        edi,4
								        add        esi,4
								        add        ebx,4
								        dec        ecx
								        jg         main_loop

								        emms
								    }
								#endif
								}


								// Blend from sRGB64 to sRGB64.
								//
								// For every pixel of otherParams->BlendingScan:
								//   alpha == 0        : nothing is written to 'dst' (needed for the RMW
								//                       optimization);
								//   alpha == SRGB_ONE : the blend pixel is copied to 'dst';
								//   otherwise         : each channel becomes
								//                       blend + ((src * (SRGB_ONE - alpha) + SRGB_HALF)
								//                                >> SRGB_FRACTIONBITS).
								//
								// Arguments:
								//   dst         - destination scan (ARGB64 pixels)
								//   src         - source scan (ARGB64 pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan

								VOID FASTCALL
								ScanOperation::Blend_sRGB64_sRGB64(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(ARGB64, ARGB64)
								    DEFINE_BLEND_POINTER(ARGB64)
								    using namespace sRGB;

								    // Bounded by 'count > 0' (not 'while (count--)') so that a negative
								    // count cannot send the loop running through memory.

								    for (; count > 0; count--)
								    {
								        sRGB64Color blendPixel;
								        blendPixel.argb = *bl;
								        INT16 alpha = blendPixel.a;

								        // If alpha is zero, skip everything, including writing the
								        // destination pixel. This is needed for the RMW optimization.

								        if (alpha != 0)
								        {
								            sRGB64Color dstPixel;

								            if (alpha == SRGB_ONE)
								            {
								                dstPixel.argb = blendPixel.argb;
								            }
								            else
								            {
								                //
								                // Dst = Blend + (1-Alpha) * Src
								                //

								                dstPixel.argb = *s;

								                INT Multa = SRGB_ONE - alpha;

								                dstPixel.r = ((dstPixel.r * Multa + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.r;
								                dstPixel.g = ((dstPixel.g * Multa + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.g;
								                dstPixel.b = ((dstPixel.b * Multa + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.b;
								                dstPixel.a = ((dstPixel.a * Multa + SRGB_HALF) >> SRGB_FRACTIONBITS) + blendPixel.a;
								            }

								            *d = dstPixel.argb;
								        }

								        bl++;
								        s++;
								        d++;
								    }
								}


								// Blend from sRGB64 to sRGB64 MMX.
								//
								// Per 64-bit pixel of otherParams->BlendingScan:
								//   alpha == 0        : the destination pixel is not written;
								//   alpha == SRGB_ONE : the blend pixel is copied to 'dst';
								//   otherwise         : each 16-bit source channel is multiplied by
								//                       (SRGB_ONE - alpha), shifted back down, and added
								//                       to the blend pixel.
								//
								// Arguments:
								//   dst         - destination scan (ARGB64 pixels)
								//   src         - source scan (ARGB64 pixels)
								//   count       - scan length in pixels; nothing is done if count <= 0
								//   otherParams - supplies BlendingScan
								//
								// The body only exists when _X86_ is defined; on other builds the
								// function compiles to an empty body.

								VOID FASTCALL
								ScanOperation::Blend_sRGB64_sRGB64_MMX(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								#if defined(_X86_)
								    using namespace sRGB;
								    DEFINE_POINTERS(ARGB64, ARGB64)
								    const void *pbl=otherParams->BlendingScan;
								    static ULONGLONG ullSRGBHalfMask=0x1000100010001000;

								    // The assembly loop tests its counter after processing a pixel, so
								    // it must never be entered with an empty scan. This matches
								    // Blend_sRGB64_sRGB64, which does nothing for a zero count.
								    if (count <= 0)
								    {
								        return;
								    }

								    // NOTE(review): the 32-bit product is rebuilt from pmullw (low word)
								    // and pmulhw (high word). The rounding constant is added to the low
								    // word only, with paddw, which wraps; a carry out of the low word is
								    // therefore not propagated into the high-word part, unlike the
								    // rounding in Blend_sRGB64_sRGB64. Presumably an accepted
								    // approximation - confirm before relying on bit-exact agreement.

								    _asm {
								        mov        ecx,count                   ; ecx=pixel counter
								        mov        ebx,pbl                     ; ebx=blend pixel pointer
								        mov        esi,s                       ; esi=source pixel pointer
								        mov        edi,d                       ; edi=dest pixel pointer
								        movq       mm4,ullSRGBHalfMask         ; mm4=mask with srgb half

								main_loop:
								        movsx      eax,word ptr [ebx+3*2]      ; eax=alpha
								        or         eax,eax                     ; eax==0?
								        jz         alpha_blend_done            ; if alpha=0, no blending

								        movq       mm0,[ebx]                   ; mm0=blend pixel
								        cmp        eax,SRGB_ONE                ; if alpha=SRGB_ONE, dest=blend
								        jne        alpha_blend
								        movq       [edi],mm0                   ; copy blend pixel to dest
								        jmp        alpha_blend_done

								alpha_blend:
								        ; Get SRGB_ONE-Alpha
								        neg        eax
								        add        eax,SRGB_ONE                ; C=SRGB_ONE-Alpha
								        movd       mm2, eax                    ; mm2=[0|0|0|C]
								        punpcklwd  mm2, mm2
								        punpckldq  mm2, mm2                    ; mm2=[C|C|C|C]

								        ; Blend pixels
								        movq       mm1,[esi]                   ; mm1=[A|R|G|B] source pixel
								        movq       mm3,mm1                     ; mm3=[A|R|G|B] source pixel
								        pmullw     mm1,mm2                     ; low word of source*C
								        paddw      mm1,mm4                     ; add an srgb half for rounding
								        psrlw      mm1,SRGB_FRACTIONBITS       ; truncate low SRGB_FRACTIONBITS
								        pmulhw     mm3,mm2                     ; high word of source*C
								        psllw      mm3,SRGB_INTEGERBITS        ; truncate high SRGB_INTEGERBITS
								        por        mm1,mm3                     ; mm1=[A|R|G|B]
								        paddw      mm1,mm0                     ; add blend pixel
								        movq       [edi],mm1                   ; copy result to dest

								alpha_blend_done:
								        add        edi,8
								        add        esi,8
								        add        ebx,8

								        dec        ecx
								        jg         main_loop
								        emms
								    }
								#endif
								}


								// Blend from sRGB to 16bpp 565, ignoring sRGB's non-linear gamma.
								//
								// The blend pixel's alpha is reduced to its top 5 bits. Per pixel:
								//   alpha == 0  : nothing is written to 'dst';
								//   alpha == 31 : the blend pixel, converted to 565, is written to 'dst';
								//   otherwise   : the 565 source pixel is scaled by (31 - alpha) per
								//                 channel, with rounding, and added to the converted
								//                 blend pixel.
								//
								// Arguments:
								//   dst         - destination scan (UINT16 pixels)
								//   src         - source scan (UINT16 pixels)
								//   count       - scan length in pixels; expected to be > 0
								//   otherParams - supplies BlendingScan (ARGB pixels)

								VOID FASTCALL
								ScanOperation::Blend_sRGB_565(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(UINT16, UINT16)
								    DEFINE_BLEND_POINTER(ARGB)

								    ASSERT(count>0);

								    // The loop is bounded by 'count > 0' rather than a do/while on
								    // '--count != 0', so that a zero or negative count does no work (and
								    // touches no memory) even if the ASSERT above is compiled out.

								    for (; count > 0; count--)
								    {
								        UINT32 blendPixel = *bl;
								        UINT32 alpha = blendPixel >> 27;

								        if (alpha != 0)
								        {
								            UINT32 dstPixel;

								            // Blend: S + [ (255 - sA) * D ] / 255

								            // First, convert the source pixel from 32bpp BGRA to
								            // 5-6-5 16bpp, pre-multiplied.
								            //
								            // Note: No rounding needs to be done on this conversion!

								            blendPixel = ((blendPixel >> 8) & 0xf800) |
								                         ((blendPixel >> 5) & 0x07e0) |
								                         ((blendPixel >> 3) & 0x001f);

								            if (alpha == 31)
								            {
								                dstPixel = blendPixel;
								            }
								            else
								            {
								                dstPixel = (UINT32) *s;

								                UINT32 multA = 31 - alpha;

								                // Red (bits 11-15) and blue (bits 0-4) are scaled
								                // together in one multiply; 0x8010 adds the rounding
								                // half to both fields.

								                UINT32 D1_00rr00bb = (dstPixel & 0xf81f);
								                UINT32 D2_rrrrbbbb = D1_00rr00bb * multA + 0x00008010;
								                UINT32 D3_00rr00bb = (D2_rrrrbbbb & 0x001f03e0) >> 5;
								                UINT32 D4_rrxxbbxx = ((D2_rrrrbbbb + D3_00rr00bb) >> 5) & 0xf81f;

								                // Green has 6 bits, so the 5-bit multiplier is doubled
								                // and the shifts are one bit wider.

								                UINT32 D1_000000gg = (dstPixel & 0x7e0) >> 5;
								                UINT32 D2_0000gggg = D1_000000gg * 2 * multA + 0x00000020;
								                UINT32 D3_000000gg = (D2_0000gggg & 0x00000fc0) >> 6;
								                UINT32 D4_0000ggxx = ((D2_0000gggg + D3_000000gg) & 0x0fc0) >> 1;

								                dstPixel = (UINT16) ((D4_rrxxbbxx | D4_0000ggxx) + blendPixel);
								            }

								            *d = (UINT16) dstPixel;
								        }

								        bl++;
								        s++;
								        d++;
								    }
								}


								// Blend from sRGB to 16bpp 555, ignoring sRGB's non-linear gamma.
								//
								// The blend pixel's alpha is reduced to its top 5 bits. Per pixel:
								//   alpha == 0  : nothing is written to 'dst';
								//   alpha == 31 : the blend pixel, converted to 555, is written to 'dst';
								//   otherwise   : the 555 source pixel is scaled by (31 - alpha) per
								//                 channel, with rounding, and added to the converted
								//                 blend pixel.
								//
								// Arguments:
								//   dst         - destination scan (UINT16 pixels)
								//   src         - source scan (UINT16 pixels)
								//   count       - scan length in pixels; expected to be > 0
								//   otherParams - supplies BlendingScan (ARGB pixels)

								VOID FASTCALL
								ScanOperation::Blend_sRGB_555(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(UINT16, UINT16)
								    DEFINE_BLEND_POINTER(ARGB)

								    ASSERT(count>0);

								    // The loop is bounded by 'count > 0' rather than a do/while on
								    // '--count != 0', so that a zero or negative count does no work (and
								    // touches no memory) even if the ASSERT above is compiled out.

								    for (; count > 0; count--)
								    {
								        UINT32 blendPixel = *bl;
								        UINT32 alpha = blendPixel >> 27;

								        if (alpha != 0)
								        {
								            UINT32 dstPixel;

								            // Blend: S + [ (255 - sA) * D ] / 255

								            // First, convert the source pixel from 32bpp BGRA to
								            // 5-5-5 16bpp, pre-multiplied.
								            //
								            // Note: No rounding needs to be done on this conversion!

								            blendPixel = ((blendPixel & 0x00f80000) >> 9) |
								                         ((blendPixel & 0x0000f800) >> 6) |
								                         ((blendPixel & 0x000000f8) >> 3);

								            if (alpha == 31)
								            {
								                dstPixel = blendPixel;
								            }
								            else
								            {
								                dstPixel = (UINT32) *s;

								                UINT32 multA = 31 - alpha;

								                // Red (bits 10-14) and blue (bits 0-4) are scaled
								                // together in one multiply; 0x4010 adds the rounding
								                // half to both fields.

								                UINT32 D1_00rr00bb = (dstPixel & 0x7c1f);
								                UINT32 D2_rrrrbbbb = D1_00rr00bb * multA + 0x00004010;
								                UINT32 D3_00rr00bb = (D2_rrrrbbbb & 0x000f83e0) >> 5;
								                UINT32 D4_rrxxbbxx = ((D2_rrrrbbbb + D3_00rr00bb) >> 5) & 0x7c1f;

								                // Green (bits 5-9) is scaled on its own; the result is
								                // left in place at bits 5-9.

								                UINT32 D1_000000gg = (dstPixel & 0x3e0) >> 5;
								                UINT32 D2_0000gggg = D1_000000gg * multA + 0x00000010;
								                UINT32 D3_000000gg = (D2_0000gggg & 0x000003e0) >> 5;
								                UINT32 D4_0000ggxx = (D2_0000gggg + D3_000000gg) & 0x03e0;

								                dstPixel = (UINT16) ((D4_rrxxbbxx | D4_0000ggxx) + blendPixel);
								            }

								            *d = (UINT16) dstPixel;
								        }

								        bl++;
								        s++;
								        d++;
								    }
								}


								// Blend from sRGB to RGB24, ignoring sRGB's non-linear gamma.
								//
								// Walks 'count' pixels. Each 32bpp blend pixel (read through 'bl') is
								// composited over the 3-byte pixel read through 's', and the 3-byte result
								// is stored through 'd', low byte of the packed value first.
								//
								//   - alpha == 0   : the destination pixel is not written at all.
								//   - alpha == 255 : the blend pixel's three colour bytes are stored as-is.
								//   - otherwise    : result = S + D * (255 - alpha) / 255, per channel.
								//
								// Whenever 'd' is 4-byte aligned, runs of four fully opaque blend pixels are
								// copied as three 32-bit stores instead of twelve byte stores.
								//
								// NOTE(review): 'd', 's' and 'bl' are not declared in this function; they
								// presumably come from DEFINE_POINTERS / DEFINE_BLEND_POINTER - confirm
								// against the macro definitions.

								VOID FASTCALL
								ScanOperation::Blend_sRGB_24(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(BYTE, BYTE)
								    DEFINE_BLEND_POINTER(ARGB)

								    ASSERT(count>0);

								    do {

								        // Fast path: four opaque pixels at a time into an aligned 'd'.

								        if (((UINT_PTR) d & 0x3) == 0)
								        {
								            while (count >= 4)
								            {
								                const BYTE *quad = (const BYTE *) bl;

								                // Bytes 3, 7, 11 and 15 are the four alpha values; the
								                // AND is 0xFF only when every one of them is 0xFF.

								                if ((quad[3] & quad[7] & quad[11] & quad[15]) != 0xFF)
								                {
								                    break;
								                }

								                // Drop every fourth (alpha) byte and repack the twelve
								                // remaining colour bytes into three 32-bit words.

								                UINT32 *packed = (UINT32 *) d;

								                packed[0] = quad[0]  | (quad[1]  << 8) | (quad[2]  << 16) | (quad[4]  << 24);
								                packed[1] = quad[5]  | (quad[6]  << 8) | (quad[8]  << 16) | (quad[9]  << 24);
								                packed[2] = quad[10] | (quad[12] << 8) | (quad[13] << 16) | (quad[14] << 24);

								                bl += 4;
								                s += 12;
								                d += 12;
								                count -= 4;
								            }
								        }

								        // The fast path may have consumed the whole scan.

								        if (count == 0)
								        {
								            break;
								        }

								        const UINT32 argb  = *bl;
								        const UINT32 alpha = argb >> 24;

								        if (alpha != 0)
								        {
								            UINT32 result = argb;

								            if (alpha != 255)
								            {
								                // Dst = Src + (1-Alpha) * Dst
								                //
								                // t = x + 0x80; (t + (t >> 8)) >> 8 stands in for the
								                // division by 255. All three destination bytes are
								                // read before anything is stored.

								                const UINT32 weight = 255 - alpha;

								                // Middle byte (green); ends up in bits 8-15.

								                UINT32 green = (UINT32) s[1] * weight + 0x00800080;
								                green += (green & 0xff00ff00) >> 8;
								                green &= 0xFF00FF00;

								                // Bytes 0 and 2 are scaled together, one per 16-bit lane.

								                const UINT32 outerPair = s[0] | ((ULONG) s[2] << 16);

								                UINT32 outer = outerPair * weight + 0x00800080;
								                outer += (outer & 0xff00ff00) >> 8;
								                outer = (outer & 0xFF00FF00) >> 8;

								                result = (green | outer) + argb;
								            }

								            // The alpha byte of 'result' is simply not stored.

								            d[0] = (BYTE) (result);
								            d[1] = (BYTE) (result >> 8);
								            d[2] = (BYTE) (result >> 16);
								        }

								        bl++;
								        s += 3;
								        d += 3;

								    } while (--count != 0);
								}


								// Blend from sRGB to BGR24, ignoring sRGB's non-linear gamma.
								//
								// Walks 'count' pixels. Each 32bpp blend pixel (read through 'bl') is
								// composited over the 3-byte pixel read through 's', and the 3-byte result
								// is stored through 'd'. In this format byte 0 holds the channel taken
								// from bits 16-23 of the blend pixel (red) and byte 2 holds the channel
								// taken from bits 0-7 (blue).
								//
								//   - alpha == 0   : the destination pixel is not written at all.
								//   - alpha == 255 : the blend pixel's three colour bytes are stored.
								//   - otherwise    : result = S + D * (255 - alpha) / 255, per channel.
								//
								// NOTE(review): 'd', 's' and 'bl' are not declared in this function; they
								// presumably come from DEFINE_POINTERS / DEFINE_BLEND_POINTER - confirm
								// against the macro definitions.

								VOID FASTCALL
								ScanOperation::Blend_sRGB_24BGR(
								    VOID *dst,
								    const VOID *src,
								    INT count,
								    const OtherParams *otherParams
								    )
								{
								    DEFINE_POINTERS(BYTE, BYTE)
								    DEFINE_BLEND_POINTER(ARGB)

								    ASSERT(count>0);

								    do {
								        UINT32 blendPixel = *bl;
								        UINT32 alpha = blendPixel >> 24;

								        if (alpha != 0)
								        {
								            UINT32 dstPixel;

								            if (alpha == 255)
								            {
								                dstPixel = blendPixel;
								            }
								            else
								            {
								                // Dst = Src + (1-Alpha) * Dst
								                //
								                // t = x + 0x80; (t + (t >> 8)) >> 8 stands in for the
								                // division by 255.

								                UINT32 multA = 255 - alpha;

								                // Green is the middle byte; it ends up in bits 8-15.

								                UINT32 D1_000000GG = *(s + 1);
								                UINT32 D2_0000GGGG = D1_000000GG * multA + 0x00800080;
								                UINT32 D3_000000GG = (D2_0000GGGG & 0xff00ff00) >> 8;
								                UINT32 D4_0000GG00 = (D2_0000GGGG + D3_000000GG) & 0xFF00FF00;

								                // Load the destination into the same lanes the blend
								                // pixel uses: byte 2 (blue) into bits 0-7 and byte 0
								                // (red) into bits 16-23. This mirrors the stores at the
								                // bottom of the loop. Loading byte 0 into the low lane
								                // instead would add the destination's red to the
								                // source's blue (and vice versa) on every partially
								                // transparent pixel.

								                UINT32 D1_00RR00BB = *(s + 2) | (ULONG) *(s) << 16;
								                UINT32 D2_RRRRBBBB = D1_00RR00BB * multA + 0x00800080;
								                UINT32 D3_00RR00BB = (D2_RRRRBBBB & 0xff00ff00) >> 8;
								                UINT32 D4_00RR00BB = ((D2_RRRRBBBB + D3_00RR00BB) & 0xFF00FF00) >> 8;

								                dstPixel = (D4_0000GG00 | D4_00RR00BB) + blendPixel;
								            }

								            // Store high colour byte first; the alpha byte is dropped.

								            *(d)     = (BYTE) (dstPixel >> 16);
								            *(d + 1) = (BYTE) (dstPixel >> 8);
								            *(d + 2) = (BYTE) (dstPixel);
								        }

								        bl++;
								        d += 3;
								        s += 3;
								    } while (--count != 0);
								}


								/*


								!!![agodfrey]

								So we're going to move to standardizing on non-premultiplied alpha.

								When we do, the above routines will all have to change - but we may

								want to keep the above versions around too.


								Below, I've implemented the sRGB and sRGB64 versions for a non-premultiplied

								source. Now, these really blend from a non-premultiplied source,

								to a pre-multiplied destination. You can see this from the fact that they

								are equivalent to combining the above pre-multiplied Blends with an

								AlphaMultiply step on the source data.


								Since pre-multiplied and non-premultiplied formats are identical for alpha==1,

								the functions below work fine when the destination has no alpha (i.e. alpha==1).


								Otherwise, we can use them when the destination is in premultiplied format.

								If we somehow let the user draw to such a destination, they can use an off-screen

								premultiplied buffer to accumulate drawing, and then, using a

								pre-multiplied blend, draw that to the final destination. This gives them

								the same functionality that standardizing on pre-multiplied alpha is supposed

								to give.


								// Blend sRGB over sRGB, ignoring the non-linear gamma.


								VOID FASTCALL

								ScanOperation::Blend_sRGB_sRGB(

								    VOID *dst,

								    const VOID *src,

								    INT count,

								    const OtherParams *otherParams

								    )

								{

								    DEFINE_POINTERS(ARGB, ARGB)

								    DEFINE_BLEND_POINTER(ARGB)


								    ASSERT(count>0);


								    do {

								        UINT32 blendPixel = *bl;

								        UINT32 alpha = blendPixel >> 24;


								        // If alpha is zero, skip everything, including writing the

								        // destination pixel. This is needed for the RMW optimization.


								        if (alpha != 0)

								        {

								            UINT32 dstPixel;


								            if (alpha == 255)

								            {

								                dstPixel = blendPixel;

								            }

								            else

								            {

								                // Dst = Dst * (1-Alpha) + Src * Alpha


								                dstPixel = *s;


								                ULONG invalpha = 255 - alpha;


								                ULONG _D1_00AA00GG = (dstPixel & 0xff00ff00) >> 8;

								                ULONG _D1_00RR00BB = (dstPixel & 0x00ff00ff);


								                // For the alpha channel, the result we want is this:

								                //

								                //     Dst = Dst * (1-Alpha) + Src.

								                //

								                // Or equivalently:

								                //

								                //     Dst = Dst * (1-Alpha) + Alpha.

								                //

								                // We want to apply the same operations to the alpha channel as

								                // we do to the others. So, to get the above result from

								                //

								                //     Dst = Dst * (1-Alpha) + Src * Alpha

								                //

								                // we fake a 'Src' value of 1 (represented by 255).


								                ULONG _S1_00ff00GG = ((blendPixel & 0x0000ff00) >> 8) | 0x00ff0000;

								                ULONG _S1_00RR00BB = (blendPixel & 0x00ff00ff);


								                ULONG _D2_AAAAGGGG = _D1_00AA00GG * invalpha +

								                                     _S1_00ff00GG * alpha +

								                                     0x00800080;

								                ULONG _D2_RRRRBBBB = _D1_00RR00BB * invalpha +

								                                     _S1_00RR00BB * alpha +

								                                     0x00800080;


								                ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;

								                ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;


								                ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;

								                ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;


								                dstPixel = _D4_AA00GG00 + _D4_00RR00BB;

								            }


								            *d = dstPixel;

								        }


								        bl++;

								        s++;

								        d++;

								    } while (--count != 0);

								}


								// Blend from sRGB64 to sRGB64.


								VOID FASTCALL

								ScanOperation::Blend_sRGB64_sRGB64(

								    VOID *dst,

								    const VOID *src,

								    INT count,

								    const OtherParams *otherParams

								    )

								{

								    DEFINE_POINTERS(ARGB64, ARGB64)

								    DEFINE_BLEND_POINTER(ARGB64)

								    using namespace sRGB;


								    while (count--)

								    {

								        sRGB64Color blendPixel;

								        blendPixel.argb = *bl;

								        INT alpha = blendPixel.a;


								        // If alpha is zero, skip everything, including writing the

								        // destination pixel. This is needed for the RMW optimization.


								        if (alpha != 0)

								        {

								            sRGB64Color dstPixel;


								            if (alpha == SRGB_ONE)

								            {

								                dstPixel.argb = blendPixel.argb;

								            }

								            else

								            {

								                // Dst = Dst * (1-Alpha) + Src * Alpha


								                dstPixel.argb = *s;


								                INT invalpha = SRGB_ONE - alpha;


								                dstPixel.r = ((dstPixel.r * invalpha) +

								                              (blendPixel.r * alpha) +

								                              SRGB_HALF) >>

								                              SRGB_FRACTIONBITS;

								                dstPixel.g = ((dstPixel.g * invalpha) +

								                              (blendPixel.g * alpha) +

								                              SRGB_HALF) >>

								                              SRGB_FRACTIONBITS;

								                dstPixel.b = ((dstPixel.b * invalpha) +

								                              (blendPixel.b * alpha) +

								                              SRGB_HALF) >>

								                              SRGB_FRACTIONBITS;

								                dstPixel.a = (((dstPixel.a * invalpha) + SRGB_HALF) >>

								                              SRGB_FRACTIONBITS) +

								                             blendPixel.a;

								            }


								            *d = dstPixel.argb;

								        }


								        bl++;

								        s++;

								        d++;

								    }

								}


								*/