/******************************Module*Header*******************************\
* Module Name: tranblt.cxx
*
* Transparent BLT
*
* Created: 21-Jun-1996
* Author: Mark Enstrom [marke]
*
* Copyright (c) 1996-1999 Microsoft Corporation
\**************************************************************************/
|
|
#include "precomp.hxx"
|
|
#pragma hdrstop
|
|
|
|
#if !(_WIN32_WINNT >= 0x500)
|
|
|
|
//
|
|
// global memory DC with single scan line 32 bpp DIBSection,
|
|
// use protected by semLocal
|
|
//
|
|
|
|
HDC ghdc32Tmp;
|
|
HDC ghdc32;
|
|
PULONG gpulDIB32;
|
|
|
|
/**************************************************************************\
|
|
* bInitAlpha
|
|
*
|
|
* Init global scan line DC
|
|
*
|
|
* Arguments:
|
|
*
|
|
* none
|
|
*
|
|
* Return Value:
|
|
*
|
|
* status
|
|
*
|
|
* History:
|
|
*
|
|
* 4/30/1997 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
BOOL
|
|
bInitAlpha()
|
|
{
|
|
BOOL bRet = TRUE;
|
|
BITMAPINFO bmi32;
|
|
|
|
bmi32.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
|
|
bmi32.bmiHeader.biWidth = SCAN_LINE_DC_WIDTH;
|
|
bmi32.bmiHeader.biHeight = 1;
|
|
bmi32.bmiHeader.biPlanes = 1;
|
|
bmi32.bmiHeader.biBitCount = 32;
|
|
bmi32.bmiHeader.biCompression = BI_RGB;
|
|
bmi32.bmiHeader.biSizeImage = 0;
|
|
bmi32.bmiHeader.biXPelsPerMeter = 0;
|
|
bmi32.bmiHeader.biYPelsPerMeter = 0;
|
|
bmi32.bmiHeader.biClrUsed = 0;
|
|
bmi32.bmiHeader.biClrImportant = 0;
|
|
|
|
HDC hdc32 = CreateCompatibleDC(NULL);
|
|
|
|
if (hdc32 != NULL)
|
|
{
|
|
PULONG pulDIBSrc;
|
|
HBITMAP hbmSrc = CreateDIBSection(hdc32,&bmi32,DIB_RGB_COLORS,(PVOID *)&pulDIBSrc,NULL,0);
|
|
|
|
if (hbmSrc)
|
|
{
|
|
HBITMAP hbmOld = (HBITMAP)SelectObject(hdc32,hbmSrc);
|
|
|
|
if (hbmOld != NULL)
|
|
{
|
|
ghdc32 = hdc32;
|
|
ghdc32Tmp = hdc32;
|
|
gpulDIB32 = pulDIBSrc;
|
|
}
|
|
else
|
|
{
|
|
DeleteDC(hdc32);
|
|
DeleteObject(hbmSrc);
|
|
bRet = FALSE;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
DeleteDC(hdc32);
|
|
bRet = FALSE;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
bRet = FALSE;
|
|
}
|
|
|
|
return(bRet);
|
|
}
|
|
|
|
|
|
|
|
/**************************************************************************\
|
|
* CleanupGlobals
|
|
*
|
|
* Free any global DIBsections, DCs, etc. from initialization.
|
|
*
|
|
* Arguments:
|
|
* none.
|
|
*
|
|
* Return Value:
|
|
* none.
|
|
*
|
|
* History:
|
|
*
|
|
* 1/19/2000 Donald Chinn [DChinn]
|
|
*
|
|
\**************************************************************************/
|
|
VOID CleanupGlobals()
|
|
{
|
|
ASSERTGDI(ghdc32 == ghdc32Tmp, "ghdc32Tmp is still being used.");
|
|
if (!ghdc32)
|
|
{
|
|
DeleteDC(ghdc32);
|
|
}
|
|
if (!gpulDIB32)
|
|
{
|
|
DeleteObject(gpulDIB32);
|
|
}
|
|
return;
|
|
}
|
|
|
|
|
|
/**************************************************************************\
|
|
* hdcAllocateScanLineDC
|
|
*
|
|
* allocate tmp scan line DC. try to use fast allocator.
|
|
*
|
|
* Arguments:
|
|
*
|
|
* hdcComp - hdc for compatible bitmap
|
|
* width - width of scan line
|
|
* pulScanLine - return pointer to temp scan line
|
|
*
|
|
* Return Value:
|
|
*
|
|
*
|
|
*
|
|
* History:
|
|
*
|
|
* 4/30/1997 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
HDC
|
|
hdcAllocateScanLineDC(
|
|
LONG width,
|
|
PULONG *pulScanLine
|
|
)
|
|
{
|
|
ASSERTGDI(pulScanLine != NULL,"Scan line pointer must not be NULL");
|
|
|
|
HDC hdcRet = NULL;
|
|
|
|
//
|
|
// try to acquire global scan line DC
|
|
//
|
|
|
|
if (width <= SCAN_LINE_DC_WIDTH)
|
|
{
|
|
hdcRet = (HDC)InterlockedExchange((PLONG)&ghdc32Tmp, 0);
|
|
}
|
|
|
|
if (hdcRet != NULL)
|
|
{
|
|
//
|
|
// allocation succeded, assign pointer
|
|
//
|
|
|
|
*pulScanLine = gpulDIB32;
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// if acquire failed, allocate
|
|
//
|
|
|
|
BITMAPINFO bmi32;
|
|
|
|
bmi32.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
|
|
bmi32.bmiHeader.biWidth = width;
|
|
bmi32.bmiHeader.biHeight = 1;
|
|
bmi32.bmiHeader.biPlanes = 1;
|
|
bmi32.bmiHeader.biBitCount = 32;
|
|
bmi32.bmiHeader.biCompression = BI_RGB;
|
|
bmi32.bmiHeader.biSizeImage = 0;
|
|
bmi32.bmiHeader.biXPelsPerMeter = 0;
|
|
bmi32.bmiHeader.biYPelsPerMeter = 0;
|
|
bmi32.bmiHeader.biClrUsed = 0;
|
|
bmi32.bmiHeader.biClrImportant = 0;
|
|
|
|
HDC hdc32 = CreateCompatibleDC(NULL);
|
|
|
|
if (hdc32 != NULL)
|
|
{
|
|
PULONG pulDIBSrc;
|
|
HBITMAP hbmSrc = CreateDIBSection(hdc32,&bmi32,DIB_RGB_COLORS,(PVOID *)&pulDIBSrc,NULL,0);
|
|
|
|
if (hbmSrc)
|
|
{
|
|
HBITMAP hbmOld = (HBITMAP)SelectObject(hdc32,hbmSrc);
|
|
|
|
if (hbmOld != NULL)
|
|
{
|
|
hdcRet = hdc32;
|
|
*pulScanLine = pulDIBSrc;
|
|
}
|
|
else
|
|
{
|
|
DeleteDC(hdc32);
|
|
DeleteObject(hbmSrc);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
DeleteDC(hdc32);
|
|
}
|
|
}
|
|
}
|
|
|
|
return(hdcRet);
|
|
}
|
|
|
|
/**************************************************************************\
|
|
* vFreeScanLineDC
|
|
*
|
|
* free tmp scan line dc and dibsection
|
|
*
|
|
* Arguments:
|
|
*
|
|
* hdcFree - scan line DC
|
|
*
|
|
* Return Value:
|
|
*
|
|
* none
|
|
*
|
|
* History:
|
|
*
|
|
* 4/30/1997 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
VOID
|
|
vFreeScanLineDC(
|
|
HDC hdcFree
|
|
)
|
|
{
|
|
ASSERTGDI(hdcFree != NULL,"vFreeScanLineDC: DC can't be NULL");
|
|
|
|
if (hdcFree == ghdc32)
|
|
{
|
|
//
|
|
// release global hdc
|
|
//
|
|
|
|
ghdc32Tmp = ghdc32;
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// free allocated DC and bitmap
|
|
//
|
|
|
|
HBITMAP hbmOld = (HBITMAP)GetCurrentObject(hdcFree,OBJ_BITMAP);
|
|
|
|
DeleteDC(hdcFree);
|
|
|
|
if (hbmOld)
|
|
{
|
|
DeleteObject(hbmOld);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**************************************************************************\
|
|
* vPixelOver
|
|
*
|
|
* optimized routine used when the blend function is SRC_OVER and the
|
|
* SourceConstantAlpha is 255.
|
|
*
|
|
* Dst = Src + (1-SrcAlpha) * Dst
|
|
*
|
|
* Arguments:
|
|
*
|
|
* ppixDst - address of dst pixel
|
|
* ppixSrc - address of src pixel
|
|
* cx - number of pixels in scan line
|
|
* BlendFunction - blend to be done on each pixel
|
|
* pwrMask - set each byte to 0 for pixel that doesn't need
|
|
* to be written to dst
|
|
*
|
|
* Return Value:
|
|
*
|
|
* none
|
|
*
|
|
* History:
|
|
*
|
|
* 1/23/1997 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
#if !defined(_X86_)
|
|
|
|
VOID
|
|
vPixelOver(
|
|
ALPHAPIX *ppixDst,
|
|
ALPHAPIX *ppixSrc,
|
|
LONG cx,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
ALPHAPIX pixSrc;
|
|
ALPHAPIX pixDst;
|
|
|
|
while (cx--)
|
|
{
|
|
pixSrc = *ppixSrc;
|
|
|
|
if (pixSrc.pix.a != 0)
|
|
{
|
|
pixDst = *ppixDst;
|
|
|
|
if (pixSrc.pix.a == 255)
|
|
{
|
|
pixDst = pixSrc;
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Dst = Src + (1-SrcAlpha) * Dst
|
|
//
|
|
|
|
ULONG Multa = 255 - pixSrc.pix.a;
|
|
ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8;
|
|
ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff);
|
|
|
|
ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080;
|
|
ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080;
|
|
|
|
ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
|
|
ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;
|
|
|
|
|
|
ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
|
|
ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;
|
|
|
|
pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB;
|
|
}
|
|
|
|
*ppixDst = pixDst;
|
|
}
|
|
else
|
|
{
|
|
*pwrMask = 0;
|
|
}
|
|
|
|
pwrMask++;
|
|
ppixSrc++;
|
|
ppixDst++;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/**************************************************************************\
|
|
* vPixelBlendOrDissolveOver
|
|
*
|
|
* Blend routine when the blend function is SRC_OVER, but when
|
|
* SourceConstantAlpah != 255 and The source bitmap does have alpha values
|
|
*
|
|
* if SrcAlpha == 255 then
|
|
* (Blend)
|
|
* Dst = Dst + ConstAlpha * (Src - Dst)
|
|
*
|
|
* else
|
|
* (Dissolve)
|
|
* Src = Src * ConstAlpha
|
|
* (Over)
|
|
* Dst = Src + (1 - SrcAlpha) Dst
|
|
*
|
|
* Arguments:
|
|
*
|
|
* ppixDst - address of dst pixel
|
|
* ppixSrc - address of src pixel
|
|
* cx - number of pixels in scan line
|
|
* BlendFunction - blend to be done on each pixel
|
|
* pwrMask - set each byte to 0 for pixel that doesn't need
|
|
* to be written to dst
|
|
*
|
|
* Return Value:
|
|
*
|
|
* None
|
|
*
|
|
* History:
|
|
*
|
|
* 3/12/1997 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
VOID
|
|
vPixelBlendOrDissolveOver(
|
|
ALPHAPIX *ppixDst,
|
|
ALPHAPIX *ppixSrc,
|
|
LONG cx,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
ALPHAPIX pixSrc;
|
|
ALPHAPIX pixDst;
|
|
BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
|
|
|
|
while (cx--)
|
|
{
|
|
pixSrc = *ppixSrc;
|
|
|
|
if (pixSrc.pix.a != 0)
|
|
{
|
|
pixDst = *ppixDst;
|
|
|
|
if (pixSrc.pix.a == 255)
|
|
{
|
|
//
|
|
// Blend: D = sA * S + (1-sA) * D
|
|
//
|
|
// red and blue
|
|
//
|
|
|
|
ULONG uB00rr00bb = pixDst.ul & 0x00ff00ff;
|
|
ULONG uF00rr00bb = pixSrc.ul & 0x00ff00ff;
|
|
|
|
ULONG uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
|
|
(ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
|
|
|
|
ULONG uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
|
|
|
|
ULONG uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;
|
|
|
|
//
|
|
// alpha and green
|
|
//
|
|
|
|
ULONG uB00aa00gg = (pixDst.ul >> 8) & 0xff00ff;
|
|
ULONG uF00aa00gg = (pixSrc.ul >> 8) & 0xff00ff;
|
|
|
|
ULONG uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) +
|
|
(ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080;
|
|
|
|
ULONG uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8;
|
|
|
|
ULONG uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00;
|
|
|
|
pixDst.ul = uD00rr00bb + uDaa00gg00;
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// disolve
|
|
//
|
|
|
|
ULONG ul_B_00AA00GG = (pixSrc.ul & 0xff00ff00) >> 8;
|
|
ULONG ul_B_00RR00BB = (pixSrc.ul & 0x00ff00ff);
|
|
|
|
ULONG ul_T_AAAAGGGG = ul_B_00AA00GG * ConstAlpha + 0x00800080;
|
|
ULONG ul_T_RRRRBBBB = ul_B_00RR00BB * ConstAlpha + 0x00800080;
|
|
|
|
ULONG ul_T_00AA00GG = (ul_T_AAAAGGGG & 0xFF00FF00) >> 8;
|
|
ULONG ul_T_00RR00BB = (ul_T_RRRRBBBB & 0xFF00FF00) >> 8;
|
|
|
|
ULONG ul_C_AA00GG00 = ((ul_T_AAAAGGGG + ul_T_00AA00GG) & 0xFF00FF00);
|
|
ULONG ul_C_00RR00BB = ((ul_T_RRRRBBBB + ul_T_00RR00BB) & 0xFF00FF00) >> 8;
|
|
|
|
pixSrc.ul = (ul_C_AA00GG00 | ul_C_00RR00BB);
|
|
|
|
//
|
|
// over
|
|
//
|
|
|
|
|
|
ULONG Multa = 255 - pixSrc.pix.a;
|
|
ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8;
|
|
ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff);
|
|
|
|
ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080;
|
|
ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080;
|
|
|
|
ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
|
|
ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;
|
|
|
|
|
|
ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
|
|
ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;
|
|
|
|
pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB;
|
|
}
|
|
|
|
*ppixDst = pixDst;
|
|
}
|
|
else
|
|
{
|
|
*pwrMask = 0;
|
|
}
|
|
|
|
pwrMask++;
|
|
ppixSrc++;
|
|
ppixDst++;
|
|
}
|
|
}
|
|
|
|
#if !defined(_X86_)
|
|
|
|
/******************************Public*Routine******************************\
|
|
* vPixelBlend
|
|
*
|
|
* Blend function used then BlendFunction is SRC_OVER and
|
|
* SourceConstantAlpha != 255, and Src image does NOT have
|
|
* it's own alpha channel. (assume 255)
|
|
*
|
|
* Dst = Dst + ConstAlpha * (Src - Dst)
|
|
*
|
|
* Arguments:
|
|
*
|
|
* ppixDst - address of dst pixel
|
|
* ppixSrc - address of src pixel
|
|
* cx - number of pixels in scan line
|
|
* BlendFunction - blend to be done on each pixel
|
|
* pwrMask - set each byte to 0 for pixel that doesn't need
|
|
* to be written to dst
|
|
*
|
|
* Return Value:
|
|
*
|
|
* None
|
|
*
|
|
* History:
|
|
*
|
|
* 12/2/1996 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
VOID
|
|
vPixelBlend(
|
|
ALPHAPIX *ppixDst,
|
|
ALPHAPIX *ppixSrc,
|
|
LONG cx,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
PULONG pulSrc = (PULONG)ppixSrc;
|
|
PULONG pulDst = (PULONG)ppixDst;
|
|
PULONG pulSrcEnd = pulSrc + cx;
|
|
BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
|
|
|
|
//
|
|
// Blend: D = sA * S + (1-sA) * D
|
|
//
|
|
|
|
while (pulSrc != pulSrcEnd)
|
|
{
|
|
ULONG ulDst = *pulDst;
|
|
ULONG ulSrc = *pulSrc;
|
|
ULONG uB00rr00bb = ulDst & 0x00ff00ff;
|
|
ULONG uF00rr00bb = ulSrc & 0x00ff00ff;
|
|
|
|
ULONG uMrrrrbbbb;
|
|
ULONG uM00rr00bb;
|
|
ULONG uD00rr00bb;
|
|
ULONG uB00aa00gg;
|
|
ULONG uF00aa00gg;
|
|
ULONG uMaaaagggg;
|
|
ULONG uM00aa00gg;
|
|
ULONG uDaa00gg00;
|
|
|
|
//
|
|
// red and blue
|
|
//
|
|
|
|
uB00rr00bb = ulDst & 0x00ff00ff;
|
|
uF00rr00bb = ulSrc & 0x00ff00ff;
|
|
|
|
uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
|
|
(ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
|
|
|
|
uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
|
|
|
|
uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;
|
|
|
|
//
|
|
// alpha and green
|
|
//
|
|
|
|
uB00aa00gg = (ulDst >> 8) & 0xff00ff;
|
|
uF00aa00gg = (ulSrc >> 8) & 0xff00ff;
|
|
|
|
uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) +
|
|
(ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080;
|
|
|
|
uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8;
|
|
|
|
uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00;
|
|
|
|
*pulDst = uD00rr00bb + uDaa00gg00;
|
|
|
|
pulSrc++;
|
|
pulDst++;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/******************************Public*Routine******************************\
|
|
* vPixelBlend24
|
|
*
|
|
* Blend two 24 bpp images with a constant alpha value
|
|
*
|
|
* Arguments:
|
|
*
|
|
* pixDst,
|
|
* pixSrc,
|
|
* cx,
|
|
* BlendFunction
|
|
* pwrMask
|
|
*
|
|
* Return Value:
|
|
*
|
|
*
|
|
*
|
|
* History:
|
|
*
|
|
* 12/2/1996 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
VOID
|
|
vPixelBlend24(
|
|
ALPHAPIX *ppixDst,
|
|
ALPHAPIX *ppixSrc,
|
|
LONG cx,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
|
|
PBYTE pjSrc = (PBYTE)ppixSrc;
|
|
PBYTE pjDst = (PBYTE)ppixDst;
|
|
PBYTE pjSrcEnd = pjSrc + 3*cx;
|
|
|
|
while (pjSrc != pjSrcEnd)
|
|
{
|
|
ULONG ulDst = (*pjDst) << 16;
|
|
ULONG ulSrc = (*pjSrc) << 16;
|
|
|
|
ULONG uB00rr00bb;
|
|
ULONG uF00rr00bb;
|
|
ULONG uMrrrrbbbb;
|
|
ULONG uM00rr00bb;
|
|
ULONG uD00rr00bb;
|
|
ULONG uB000000gg;
|
|
ULONG uF000000gg;
|
|
ULONG uM0000gggg;
|
|
ULONG uM000000gg;
|
|
ULONG uD000000gg;
|
|
|
|
//
|
|
// red and blue
|
|
//
|
|
|
|
uB00rr00bb = uB00rr00bb = ulDst | (*(pjDst+1));
|
|
uF00rr00bb = uF00rr00bb = ulSrc | (*(pjSrc+1));
|
|
|
|
uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
|
|
(ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
|
|
|
|
uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
|
|
|
|
uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;
|
|
|
|
//
|
|
// green
|
|
//
|
|
|
|
uB000000gg = *(pjDst+2);
|
|
uF000000gg = *(pjSrc+2);
|
|
|
|
uM0000gggg = ((uB000000gg <<8)-uB000000gg) +
|
|
(ConstAlpha * (uF000000gg-uB000000gg)) + 0x00000080;
|
|
|
|
uM000000gg = (uM0000gggg & 0x0000ff00)>>8;
|
|
|
|
uD000000gg = ((uM0000gggg + uM000000gg) & 0x0000ff00) >> 8;
|
|
|
|
*pjDst = (BYTE)(uD00rr00bb >> 16);
|
|
*(pjDst+1) = (BYTE)(uD00rr00bb);
|
|
*(pjDst+2) = (BYTE)(uD000000gg);
|
|
|
|
pjSrc+=3;
|
|
pjDst+=3;
|
|
}
|
|
}
|
|
|
|
|
|
#if defined(_X86_)
|
|
|
|
// 64 bit unsigned integer; the MMX routines below use it for the static
// constants they load with movq.
typedef unsigned __int64 QWORD;
|
|
|
|
/**************************************************************************
|
|
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
|
|
DO NOT CALL THIS FUNCTION WITH WIDTH == 0
|
|
|
|
This function operates on 32 bit pixels (BGRA) in a row of a bitmap.
|
|
This function performs the following:
|
|
|
|
SrcTran = 255 - pixSrc.a
|
|
pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255);
|
|
pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255);
|
|
pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255);
|
|
pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255);
|
|
|
|
pDst is assumed to be aligned to a DWORD boundary when passed to this function.
|
|
Step 1:
|
|
Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
|
|
as a DWORD, then do Step 2.
|
|
Step 2:
|
|
QuadAligned
|
|
pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
|
|
pixel left, do as a DWORD.
|
|
Step 3:
|
|
Load two source pixels, S1 and S2. Get (255 - alpha value) for each source pixel, 255-S1a and 255-S2a.
|
|
Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register.
|
|
Load two destination pixels, D1 and D2. Expand each byte in D1 into four words
|
|
of an MMX register. If at least four pixels can be done, do Step 4. If not, jump over
|
|
FourPixelsPerPass and finish doing two pixels at TwoPixelsLeft, Step 5.
|
|
Step 4:
|
|
FourPixelsPerPass
|
|
Expand each byte in D2 into four words of an MMX register. Multiply each byte
|
|
of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate result
|
|
of both pixels. Copy the results of each pixel into an MMX register. Shift each result of
|
|
both pixels by 8. Add the shifted results to the copied results. Shift these results by 8.
|
|
Pack the results into one MMX register. Add the packed results to the source pixels. Store result
|
|
over destination pixels. Stay in FourPixelsPerPass loop until there are less than four pixels to do.
|
|
Step 5:
|
|
TwoPixelsLeft
|
|
Do same as Step 4 above; but do not loop.
|
|
Step 6:
|
|
OnePixelLeft
|
|
If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
|
|
**************************************************************************/
|
|
VOID
|
|
mmxPixelOver(
|
|
ALPHAPIX *pDst,
|
|
ALPHAPIX *pSrc,
|
|
LONG Width,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask)
|
|
{
|
|
static QWORD W128 = 0x0080008000800080;
|
|
static QWORD AlphaMask = 0x000000FF000000FF;
|
|
|
|
_asm
|
|
{
|
|
mov esi, pSrc
|
|
mov edi, pDst
|
|
|
|
movq mm7, W128 // | 0 | 128 | 0 | 128 | 0 | 128 | 0 | 128 |
|
|
// This register never changes
|
|
pxor mm6, mm6 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
|
|
// This register never changes
|
|
|
|
mov ecx, Width
|
|
// Step 1:
|
|
test edi, 7 // Test first pixel for QWORD alignment
|
|
jz QuadAligned // if unaligned,
|
|
|
|
jmp Do1Pixel // do first pixel only
|
|
|
|
QuadAligned: // Step 2:
|
|
mov eax, ecx // Save the width in eax for later (see OnePixelLeft:)
|
|
shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even
|
|
test ecx, ecx // Make sure there is at least 1 quad to do
|
|
jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned)
|
|
|
|
// Step 3:
|
|
movq mm0, [esi] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
|
|
psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
|
|
pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
|
|
movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
|
|
|
|
punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a |
|
|
movq mm2, [edi] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
|
|
punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a |
|
|
movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
|
|
|
|
punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
|
|
punpckldq mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
|
|
punpcklbw mm2, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
|
|
|
|
dec ecx
|
|
jz TwoPixelsLeft
|
|
|
|
FourPixelsPerPass: // Step 4:
|
|
// Indenting indicates operations on the next set of pixels
|
|
// Within this loop, instructions will pair as shown for the Pentium processor
|
|
// T1 = 255-S1a T2 = 255-S2a
|
|
punpckhbw mm3, mm6 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
|
|
pmullw mm2, mm0 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
|
|
|
|
movq mm0, [esi+8] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
|
|
pmullw mm3, mm1 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
|
|
|
|
psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
|
|
add esi, 8 // pSrc++;
|
|
|
|
pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
|
|
paddusw mm2, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
|
|
|
|
paddusw mm3, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
|
|
movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
|
|
|
|
movq mm4, mm2 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
|
|
punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a |
|
|
|
|
movq mm5, mm3 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
|
|
punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a |
|
|
// TDXx' = TX*DXx+128
|
|
psrlw mm2, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
|
|
|
|
// TDXx" = (TX*DXx+128)+(TDXx'>>8)
|
|
psrlw mm3, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
|
|
paddusw mm4, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" |
|
|
|
|
paddusw mm5, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" |
|
|
psrlw mm4, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
|
|
|
|
movq mm2, [edi+8] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
|
|
psrlw mm5, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
|
|
|
|
movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
|
|
packuswb mm4, mm5 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
|
|
|
|
paddusb mm4, [esi-8]
|
|
punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
|
|
|
|
movq [edi], mm4
|
|
punpckldq mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
|
|
|
|
punpcklbw mm2, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
|
|
add edi, 8 // pDst++;
|
|
|
|
dec ecx
|
|
jnz FourPixelsPerPass
|
|
|
|
TwoPixelsLeft: // Step 5:
|
|
punpckhbw mm3, mm6 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
|
|
pmullw mm2, mm0 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
|
|
pmullw mm3, mm1 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
|
|
|
|
paddusw mm2, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
|
|
paddusw mm3, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
|
|
|
|
movq mm4, mm2 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
|
|
movq mm5, mm3 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
|
|
|
|
psrlw mm2, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
|
|
psrlw mm3, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
|
|
|
|
paddusw mm4, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" |
|
|
paddusw mm5, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" |
|
|
|
|
psrlw mm4, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
|
|
psrlw mm5, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
|
|
|
|
packuswb mm4, mm5 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
|
|
|
|
paddusb mm4, [esi]
|
|
|
|
movq [edi], mm4
|
|
|
|
add edi, 8
|
|
add esi, 8
|
|
|
|
OnePixelLeft: // Step 6:
|
|
// This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
|
|
// If 0, there were an even number of pixels and we're done
|
|
// If 1, there is an odd number of pixels and we need to do one more
|
|
test eax, 1
|
|
jz Done
|
|
|
|
Do1Pixel: // make as a macro if used in asm file
|
|
// T = 255-S1x
|
|
movd mm0, DWORD PTR[esi] // | 0 | 0 | 0 | 0 | S1a | S1r | S1g | S1b |
|
|
psrld mm0, 24 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | S1a |
|
|
pxor mm0, AlphaMask // | 0 | 0 | 0 | 255 | 0 | 0 | 0 |255-S1a|
|
|
punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a |
|
|
punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
|
|
|
|
movd mm1, [edi] // | 0 | 0 | 0 | 0 | D1a | D1r | D1g | D1b |
|
|
punpcklbw mm1, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
|
|
pmullw mm0, mm1 // | T*D1a | T*D1r | T*D1g | T*D1b |
|
|
paddusw mm0, mm7 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
|
|
movq mm1, mm0 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
|
|
psrlw mm0, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
|
|
paddusw mm0, mm1 // | TD1a" | TD1r" | TD1g" | TD1b" |
|
|
psrlw mm0, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
|
|
movd mm1, [esi]
|
|
packuswb mm0, mm0 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
|
|
paddusb mm0, mm1
|
|
movd [edi], mm0
|
|
add edi, 4 // pDst++;
|
|
add esi, 4 // pSrc++;
|
|
|
|
test ecx, ecx
|
|
jz Done // just processed the last pixel of the row
|
|
dec ecx
|
|
jmp QuadAligned // just processed the first pixel of the row
|
|
|
|
Done:
|
|
emms // remove for optimizations, have calling function do emms
|
|
}
|
|
}
|
|
|
|
/**************************************************************************\
|
|
* mmxPixelBlendOrDissolveOver
|
|
*
|
|
* Blend routine when the blend function is SRC_OVER, but when
|
|
* SourceConstantAlpha != 255 and the source bitmap does have alpha values
|
|
*
|
|
* if SrcAlpha == 255 then
|
|
*
|
|
* Dst = Dst + ConstAlpha * (Src - Dst)
|
|
*
|
|
* else
|
|
*
|
|
* Src = Src * ConstAlpha
|
|
* Dst = Src + (1 - SrcAlpha) Dst
|
|
*
|
|
* Arguments:
|
|
*
|
|
* ppixDst - address of dst pixel
|
|
* ppixSrc - address of src pixel
|
|
* cx - number of pixels in scan line
|
|
* BlendFunction - blend to be done on each pixel
|
|
* pwrMask - set each byte to 0 for pixel that doesn't need
|
|
* to be written to dst
|
|
*
|
|
* Return Value:
|
|
*
|
|
* None
|
|
*
|
|
* History:
|
|
*
|
|
* 3/12/1997 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
|
|
|
|
/**************************************************************************
|
|
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
|
|
DO NOT CALL THIS FUNCTION WITH WIDTH == 0
|
|
|
|
This function operates on 32 bit pixels (BGRA) in a row of a bitmap.
|
|
This function performs the following:
|
|
first,
|
|
pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
|
|
pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
|
|
pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
|
|
pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
|
|
then,
|
|
SrcTran = 255 - pixSrc.a
|
|
pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255);
|
|
pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255);
|
|
pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255);
|
|
pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255);
|
|
|
|
pDst is assumed to be aligned to a DWORD boundary when passed to this function.
|
|
Step 1:
|
|
Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
|
|
as a DWORD, then do Step 2.
|
|
Step 2:
|
|
QuadAligned
|
|
pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
|
|
pixel left, do as a DWORD.
|
|
Step 3:
|
|
Load two source pixels, S1 and S2, as one QWORD. Expand S1 and S2 as four words into two MMX registers.
|
|
Multiply each word in S1 and S2 by ConstAlpha. Add 128 to each result of both pixels. Copy the results
|
|
of each pixel into an MMX register. Shift each result of both pixels by 8. Add the shifted results
|
|
to the copied results. Shift these results by 8. Pack the results into one MMX register...this will
|
|
be used later.
|
|
Shift the packed results by 24 to get only the alpha value for each pixel.
|
|
Step 4:
|
|
Get (255 - new alpha value) for each pixel, 255-S1a and 255-S2a.
|
|
Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register.
|
|
Load two destination pixels, D1 and D2. Expand D1 and D2 as four words into two MMX registers.
|
|
Multiply each byte of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate
|
|
result of both pixels. Copy the results of each pixel into an MMX register. Shift each result of
|
|
both pixels by 8. Add the shifted results to the copied results. Shift these results by 8.
|
|
Pack the results into one MMX register. Add the packed results to the new source pixels saved from
|
|
above. Store result over destination pixels. Stay in TwoPixelsAtOnceLoop loop until there is less than
|
|
two pixels to do.
|
|
Step 5:
|
|
OnePixelLeft
|
|
If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
|
|
**************************************************************************/
|
|
VOID
|
|
mmxPixelBlendOrDissolveOver(
|
|
ALPHAPIX *pDst,
|
|
ALPHAPIX *pSrc,
|
|
LONG Width,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
|
|
static QWORD W128 = 0x0080008000800080;
|
|
static QWORD AlphaMask = 0x000000FF000000FF;
|
|
static QWORD Zeros = 0;
|
|
_asm
|
|
{
|
|
mov esi, pSrc
|
|
mov edi, pDst
|
|
|
|
movq mm7, W128 // This register never changes
|
|
pxor mm4, mm4 // This register never changes
|
|
|
|
xor eax, eax
|
|
mov al, ConstAlpha
|
|
movd mm5, eax // | | | | CA |
|
|
punpcklwd mm5, mm5 // | | | CA | CA |
|
|
punpcklwd mm5, mm5 // | CA | CA | CA | CA |
|
|
// This register never changes
|
|
|
|
mov ecx, Width
|
|
// Step 1:
|
|
test edi, 7 // Test first pixel for QWORD alignment
|
|
jz QuadAligned // if unaligned,
|
|
|
|
jmp Do1Pixel // do first pixel only
|
|
|
|
QuadAligned: // Step 2:
|
|
mov eax, ecx // Save the width in eax for later (see OnePixelLeft:)
|
|
shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even
|
|
test ecx, ecx // Make sure there is at least 1 quad to do
|
|
jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned)
|
|
|
|
TwoPixelsAtOnceLoop: // Step 3:
|
|
// Within this loop, instructions will pair as shown for the Pentium processor
|
|
|
|
/* Dissolve
|
|
pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
|
|
pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
|
|
pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
|
|
pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
|
|
*/
|
|
|
|
movq mm0, [esi] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
|
|
|
|
movq mm1, mm0 // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
|
|
punpcklbw mm0, mm4 // | 0 | S1a | 0 | S1r | 0 | S1g | 0 | S1b |
|
|
|
|
punpckhbw mm1, mm4 // | 0 | S2a | 0 | S2r | 0 | S2g | 0 | S2b |
|
|
pmullw mm0, mm5 // | CA*S1a | CA*S1r | CA*S1g | CA*S1b |
|
|
|
|
add esi, 8 // pSrc++;
|
|
pmullw mm1, mm5 // | CA*S2a | CA*S2r | CA*S2g | CA*S2b |
|
|
|
|
paddusw mm1, mm7 // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 |
|
|
paddusw mm0, mm7 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
|
|
|
|
movq mm2, mm0 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
|
|
psrlw mm0, 8 // | S1a'>>8 | S1r'>>8 | S1g'>>8 | S1b'>>8 |
|
|
|
|
// S1x' = CA*S1x+128 S2x' = CA*S2x+128
|
|
movq mm3, mm1 // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 |
|
|
psrlw mm1, 8 // | S2a'>>8 | S2r'>>8 | S2g'>>8 | S2b'>>8 |
|
|
|
|
// S1x" = (CA*S1x+128)>>8 S2x" = (CA*S2x+128)>>8
|
|
paddusw mm0, mm2 // | S1a" | S1r" | S1g" | S1b" |
|
|
paddusw mm1, mm3 // | S2a" | S2r" | S2g" | S2b" |
|
|
|
|
psrlw mm0, 8 // | S1a">>8 | S1r">>8 | S1g">>8 | S1b">>8 |
|
|
|
|
// SXx'" = ((CA*SXx+128)>>8)>>8)
|
|
psrlw mm1, 8 // | S2a">>8 | S2r">>8 | S2g">>8 | S2b">>8 |
|
|
packuswb mm0, mm1 // |S2a'"|S2r'"|S2g'"|S2b'"|S1a'"|S1r'"|S1g'"|S1b'"|
|
|
|
|
movq mm6, mm0
|
|
psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
|
|
|
|
/* Over
|
|
SrcTran = 255 - pixSrc.a
|
|
pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255);
|
|
pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255);
|
|
pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255);
|
|
pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255);
|
|
*/
|
|
// Step 4:
|
|
pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
|
|
|
|
movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
|
|
punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a |
|
|
|
|
movq mm2, [edi] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
|
|
punpcklwd mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
|
|
|
|
movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
|
|
punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a |
|
|
|
|
punpcklwd mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
|
|
|
|
punpckhbw mm3, mm4 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
|
|
|
|
// T1 = 255-S1a T2 = 255-S2a
|
|
punpcklbw mm2, mm4 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
|
|
pmullw mm1, mm3 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
|
|
|
|
add edi, 8 // pDst++;
|
|
pmullw mm0, mm2 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
|
|
|
|
paddusw mm0, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
|
|
paddusw mm1, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
|
|
|
|
movq mm3, mm1 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
|
|
// TDXx' = TX*DXx+128
|
|
psrlw mm1, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
|
|
|
|
movq mm2, mm0 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
|
|
psrlw mm0, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
|
|
// TDXx" = (TX*DXx+128)+(TDXx'>>8)
|
|
paddusw mm1, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" |
|
|
paddusw mm0, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" |
|
|
|
|
psrlw mm1, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
|
|
|
|
psrlw mm0, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
|
|
|
|
packuswb mm0, mm1 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
|
|
// SXx = SXx'" TDXx = TDXx'"
|
|
paddusb mm0, mm6// |S2a+TD2a|S2r+TD2r|S2g+TD2g|S2b+TD2b|S1a+TD1a|S1r+TD1r|S1g+TD1g|S1b+TD1b|
|
|
|
|
movq [edi-8], mm0
|
|
|
|
dec ecx
|
|
jnz TwoPixelsAtOnceLoop
|
|
|
|
OnePixelLeft: // Step 5:
|
|
// This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
|
|
// If 0, there were an even number of pixels and we're done
|
|
// If 1, there is an odd number of pixels and we need to do one more
|
|
test eax, 1
|
|
jz Done
|
|
|
|
Do1Pixel: // make as a macro if used in asm file
|
|
|
|
/* Dissolve
|
|
pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
|
|
pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
|
|
pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
|
|
pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
|
|
*/
|
|
|
|
movd mm0, [esi] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
|
|
punpcklbw mm0, mm4 // | 0 | S1a | 0 | S1r | 0 | S1g | 0 | S1b |
|
|
|
|
pmullw mm0, mm5 // | CA*S1a | CA*S1r | CA*S1g | CA*S1b |
|
|
paddusw mm0, mm7 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
|
|
movq mm2, mm0 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
|
|
|
|
// S1x' = CA*S1x+128 S2x' = CA*S2x+128
|
|
psrlw mm0, 8 // | S1a'>>8 | S1r'>>8 | S1g'>>8 | S1b'>>8 |
|
|
// S1x" = (CA*S1x+128)>>8 S2x" = (CA*S2x+128)>>8
|
|
paddusw mm0, mm2 // | S1a" | S1r" | S1g" | S1b" |
|
|
psrlw mm0, 8 // | S1a">>8 | S1r">>8 | S1g">>8 | S1b">>8 |
|
|
packuswb mm0, mm0 // |S2a'"|S2r'"|S2g'"|S2b'"|S1a'"|S1r'"|S1g'"|S1b'"|
|
|
movq mm6, mm0
|
|
psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
|
|
|
|
/* Over
|
|
SrcTran = 255 - pixSrc.a
|
|
pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255);
|
|
pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255);
|
|
pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255);
|
|
pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255);
|
|
*/
|
|
|
|
pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
|
|
punpcklwd mm0, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 |255-S1a|255-S1a|
|
|
punpckldq mm0, mm0 // | 255-S1a| 255-S1a| 255-S1a| 255-S1a|
|
|
movd mm2, [edi] // | 0 | 0 | 0 | 0 | D1a | D1r | D1g | D1b |
|
|
punpcklbw mm2, mm4 // | D1a | D1r | D1g | D1b |
|
|
// T = 255-S1x
|
|
pmullw mm0, mm2 // | T*D1a | T*D1r | T*D1g | T*D1b |
|
|
paddusw mm0, mm7 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
|
|
movq mm1, mm0 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
|
|
psrlw mm0, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
|
|
paddusw mm0, mm1 // | TD1a" | TD1r" | TD1g" | TD1b" |
|
|
psrlw mm0, 8
|
|
packuswb mm0, mm0 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
|
|
paddusb mm0, mm6
|
|
movd [edi], mm0
|
|
add edi, 4 // pDst++;
|
|
add esi, 4 // pSrc++;
|
|
|
|
test ecx, ecx
|
|
jz Done // just processed the last pixel of the row
|
|
dec ecx
|
|
jmp QuadAligned // just processed the first pixel of the row
|
|
|
|
Done:
|
|
emms // remove for optimizations, have calling function do emms
|
|
}
|
|
}
|
|
|
|
/**************************************************************************
|
|
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
|
|
|
|
This function operates on 16 bit pixels (5 for Red, 5 for Green, and 5 for Blue) in a row of a bitmap.
|
|
It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
|
|
The function performs the following on each byte:
|
|
|
|
tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31)
|
|
|
|
tmp2 = tmp1 AND 3E0h (mask off low 5 bits)
|
|
tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
tmp2 = tmp2 + tmp1
|
|
tmp2 = tmp2 AND 3E0h (mask off low 5 bits)
|
|
tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
Dst = tmp2
|
|
|
|
pDst is assumed to be aligned to a DWORD boundary when passed to this function.
|
|
|
|
Red and blue are processed together in the same register. Green is processed separately.
|
|
For two pixels at once, the reds and blues for both pixels are processed in the same register; and the
|
|
greens are processed together in a separate register.
|
|
|
|
The loop structure is as follows:
|
|
Step 1:
|
|
Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
|
|
as a DWORD (OnePixelLeft:), then do Step 2.
|
|
Step 2:
|
|
(QuadAligned:)
|
|
pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
|
|
pixel left, do as a DWORD.
|
|
Step 3:
|
|
(TwoPixelsAtOnceLoop:)
|
|
Perform the above function, using MMX instructions, on two pixels per pass of the loop.
|
|
Step 4:
|
|
(OnePixelLeft:)
|
|
If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
|
|
**************************************************************************/
|
|
VOID
|
|
mmxPixelBlend16_555(
|
|
PALPHAPIX pDst,
|
|
PALPHAPIX pSrc,
|
|
LONG Width,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
static QWORD RMask = 0x007C0000007C0000;
|
|
static QWORD GMask = 0x0000000003E003E0;
|
|
static QWORD BMask = 0x0000001F0000001F;
|
|
static QWORD RBConst = 0x0010001000100010;
|
|
static QWORD GConst = 0x0000000000100010;
|
|
static QWORD RGBMask = 0x03E003E003E003E0;
|
|
static QWORD RedMask = 0x001F0000001F0000;
|
|
static QWORD CA; // ConstAlpha in 4 words of a qword
|
|
BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
|
|
|
|
_asm
|
|
{
|
|
mov ecx, Width // Make sure there is at least one pixel to do
|
|
test ecx, ecx
|
|
jz Done
|
|
|
|
mov esi, pSrc
|
|
mov edi, pDst
|
|
|
|
xor eax, eax
|
|
mov al, ConstAlpha
|
|
movd mm5, eax // | | | | CA |
|
|
punpcklwd mm5, mm5 // | | | CA | CA |
|
|
punpcklwd mm5, mm5 // | CA | CA | CA | CA |
|
|
movq CA, mm5
|
|
// Step 1:
|
|
test edi, 7 // Test first pixel for QWORD alignment
|
|
jz QuadAligned // if unaligned,
|
|
|
|
jmp Do1Pixel // do first pixel only
|
|
|
|
QuadAligned: // Step 2:
|
|
mov eax, ecx // Save the width in eax for later (see OnePixelLeft:)
|
|
shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even
|
|
test ecx, ecx // Make sure there is at least 1 quad to do
|
|
jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned)
|
|
|
|
TwoPixelsAtOnceLoop: // Step 3:
|
|
movd mm0, [edi] // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
|
|
pxor mm7, mm7
|
|
|
|
movd mm1, [esi] // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
|
|
movq mm2, mm0 // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
|
|
|
|
movq mm3, mm1 // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
|
|
punpcklbw mm0, mm7 // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
|
|
|
|
punpcklbw mm1, mm7 // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
|
|
movq mm4, mm0 // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
|
|
|
|
pand mm0, RMask // | D20rrrrr00 | 0 | D10rrrrr00 | 0 |
|
|
movq mm5, mm1 // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
|
|
|
|
pand mm4, BMask // | 0 | D2000bbbbb | 0 | D1000bbbbb |
|
|
psrlw mm0, 2 // | D2rrrrr | 0 | D1rrrrr | 0 |
|
|
|
|
pand mm1, RMask // | S20rrrrr00 | 0 | S10rrrrr00 | 0 |
|
|
por mm0, mm4 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm5, BMask // | 0 | S2bbbbb | 0 | S1bbbbb |
|
|
movq mm4, mm0 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm2, GMask // | 0 | 0 |D2ggggg00000|D1ggggg00000|
|
|
psllw mm4, 5 // |D2rrrrr00000|D2bbbbb00000|D1rrrrr00000|D1bbbbb00000|
|
|
|
|
pand mm3, GMask // | 0 | 0 |S2ggggg00000|S1ggggg00000|
|
|
psrlw mm1, 2 // | S2rrrrr | 0 | S1rrrrr | 0 |
|
|
|
|
por mm5, mm1 // | S2rrrrr | S2bbbbb | S1rrrrr | S1bbbbb |
|
|
movq mm6, mm2 // | 0 | 0 |D2ggggg00000|D1ggggg00000|
|
|
|
|
psubw mm5, mm0 // | S2r-D2r | S2b-D2b | S1r-D1r | S1b-D1b |
|
|
psrlw mm2, 5 // | 0 | 0 | D2ggggg | D1ggggg |
|
|
|
|
pmullw mm5, CA // | CA2r | CA2b | CA1r | CA1b |
|
|
psubw mm4, mm0 // | D2r*31 | D2b*31 | D1r*31 | D1b*31 |
|
|
|
|
paddw mm4, RBConst// | CA2r+c | CA2b+c | CA1r+c | CA1b+c |
|
|
psrlw mm3, 5 // | 0 | 0 | S2ggggg | S1ggggg |
|
|
|
|
psubw mm3, mm2 // | 0 | 0 | S2g-D2g | S1g-D1g |
|
|
add esi, 4 // pSrc++;
|
|
|
|
pmullw mm3, CA // | 0 | 0 | CA2g | CA1g |
|
|
paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
|
|
psubw mm6, mm2 // | 0 | 0 | D2g*31 | D2g*31 |
|
|
add edi, 4 // pDst++;
|
|
|
|
paddw mm6, GConst // | 0 | 0 | CA2g+c | CA1g+c |
|
|
movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
|
|
pand mm4, RGBMask// RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
|
|
paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
|
|
|
|
movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
|
|
psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm6, RGBMask// Gtmp2 = Gtmp1 AND 3E0h (mask off low 5 bits)
|
|
paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1
|
|
|
|
pand mm1, RGBMask// RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
|
|
psrlw mm6, 5 // Gtmp2 = Gtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1
|
|
psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm5, RGBMask// Gtmp2 = Gtmp2 AND 3E0h (mask off low 5 bits)
|
|
movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm4, RedMask// Mask to get red
|
|
|
|
pand mm1, BMask // Mask to get blue
|
|
psllw mm4, 2 // Line up the red
|
|
|
|
por mm4, mm1 // Combine reds and blues in proper bit location
|
|
|
|
packuswb mm4, mm7 // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb |
|
|
|
|
por mm4, mm5 // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb |
|
|
|
|
movd [edi-4], mm4
|
|
|
|
dec ecx
|
|
jnz TwoPixelsAtOnceLoop
|
|
|
|
OnePixelLeft: // Step 4:
|
|
// This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
|
|
// If 0, there was an even number of pixels and we're done
|
|
// If 1, there is an odd number of pixels and we need to do one more
|
|
test eax, 1
|
|
jz Done
|
|
|
|
Do1Pixel: // make as a macro if used in asm file
|
|
|
|
movzx edx,WORD PTR[edi] ; edx = D 0000 0000 0rrr rrgg gggb bbbb
|
|
movzx ebx,WORD PTR[esi] ; ebx = S 0000 0000 0rrr rrgg gggb bbbb
|
|
|
|
movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
|
|
pxor mm7, mm7
|
|
|
|
movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
|
|
movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
|
|
|
|
movq mm3, mm1 // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
|
|
punpcklbw mm0, mm7 // | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
|
|
|
|
punpcklbw mm1, mm7 // | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
|
|
movq mm4, mm0 // | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
|
|
|
|
pand mm0, RMask // | 0 | 0 | D10rrrrr00 | 0 |
|
|
movq mm5, mm1 // | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
|
|
|
|
pand mm4, BMask // | 0 | 0 | 0 | D1000bbbbb |
|
|
psrlw mm0, 2 // | 0 | 0 | D1rrrrr | 0 |
|
|
|
|
pand mm1, RMask // | 0 | 0 | S10rrrrr00 | 0 |
|
|
por mm0, mm4 // | 0 | 0 | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm5, BMask // | 0 | 0 | 0 | S1bbbbb |
|
|
movq mm4, mm0 // | 0 | 0 | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm2, GMask // | 0 | 0 | 0 |D1ggggg00000|
|
|
psllw mm4, 5 // | 0 | 0 |D1rrrrr00000|D1bbbbb00000|
|
|
|
|
pand mm3, GMask // | 0 | 0 | 0 |S1ggggg00000|
|
|
psrlw mm1, 2 // | 0 | 0 | S1rrrrr | 0 |
|
|
|
|
por mm5, mm1 // | 0 | 0 | S1rrrrr | S1bbbbb |
|
|
movq mm6, mm2 // | 0 | 0 | 0 |D1ggggg00000|
|
|
// mm1 is free
|
|
psubw mm5, mm0 // | 0 | 0 | S1r-D1r | S1b-D1b |
|
|
psrlw mm2, 5 // | 0 | 0 | 0 | D1ggggg |
|
|
|
|
pmullw mm5, CA // | 0 | 0 | CA1r | CA1b |
|
|
psubw mm4, mm0 // | 0 | 0 | D1r*31 | D1b*31 |
|
|
|
|
paddw mm4, RBConst// | 0 | 0 | CA1r+c | CA1b+c |
|
|
psrlw mm3, 5 // | 0 | 0 | 0 | S1ggggg |
|
|
|
|
psubw mm3, mm2 // | 0 | 0 | 0 | S1g-D1g |
|
|
add esi, 2 // pSrc++;
|
|
|
|
pmullw mm3, CA // | 0 | 0 | 0 | CA1g |
|
|
paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
|
|
psubw mm6, mm2 // | 0 | 0 | 0 |D1ggggg00000-D1ggggg|
|
|
add edi, 2 // pDst++;
|
|
|
|
paddw mm6, GConst // | 0 | 0 | 0 | CA1g+c |
|
|
movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
|
|
pand mm4, RGBMask// RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
|
|
paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
|
|
|
|
movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
|
|
psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm6, RGBMask// Gtmp2 = Gtmp1 AND 3E0h (mask off low 5 bits)
|
|
paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1
|
|
|
|
pand mm1, RGBMask// RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
|
|
psrlw mm6, 5 // Gtmp2 = Gtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1
|
|
psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm5, RGBMask// Gtmp2 = Gtmp2 AND 3E0h (mask off low 5 bits)
|
|
movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm4, RedMask// Mask to get red
|
|
|
|
pand mm1, BMask // Mask to get blue
|
|
psllw mm4, 2 // Line up the red
|
|
|
|
por mm4, mm1 // Combine reds and blues in proper bit location
|
|
|
|
packsswb mm4, mm7 // | 0 | 0 | D10rrrrr00 | D1000bbbbb |
|
|
|
|
por mm4, mm5 // | 0 | 0 | D10rrrrrgg | D1gggbbbbb |
|
|
|
|
movd edx, mm4
|
|
|
|
mov [edi-2], dx
|
|
|
|
test ecx, ecx
|
|
jz Done // just processed the last pixel of the row
|
|
dec ecx
|
|
jmp QuadAligned // just processed the first pixel of the row
|
|
|
|
Done:
|
|
emms // remove for optimizations, have calling function do emms
|
|
}
|
|
}
|
|
|
|
/**************************************************************************
|
|
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
|
|
|
|
This function operates on 16 bit pixels (5 for Red, 6 for Green, and 5 for Blue) in a row of a bitmap.
|
|
It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
|
|
The function performs the following:
|
|
|
|
For red and blue:
|
|
tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31)
|
|
|
|
tmp2 = tmp1 AND 3E0h (mask off low 5 bits)
|
|
tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
tmp2 = tmp2 + tmp1
|
|
tmp2 = tmp2 AND 3E0h (mask off low 5 bits)
|
|
tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
Dst = tmp2
|
|
|
|
For green:
|
|
tmp1 = Alpha(Src - Dst) + 32 + (Dst * 63)
|
|
|
|
tmp2 = tmp1 AND FC0h (mask off low 6 bits)
|
|
tmp2 = tmp2 shr 6 (move high 6 bits to low 6 bits)
|
|
tmp2 = tmp2 + tmp1
|
|
tmp2 = tmp2 AND FC0h (mask off low 6 bits)
|
|
tmp2 = tmp2 shr 6 (move high 6 bits to low 6 bits)
|
|
Dst = tmp2
|
|
|
|
pDst is assumed to be aligned to a DWORD boundary when passed to this function.
|
|
|
|
Red and blue are processed together in the same register. Green is processed separately.
|
|
For two pixels at once, the reds and blues for both pixels are processed in the same register; and the
|
|
greens are processed together in a separate register.
|
|
|
|
The loop structure is as follows:
|
|
Step 1:
|
|
Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
|
|
as a DWORD (OnePixelLeft:), then do Step 2.
|
|
Step 2:
|
|
(QuadAligned:)
|
|
pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
|
|
pixel left, do as a DWORD.
|
|
Step 3:
|
|
(TwoPixelsAtOnceLoop:)
|
|
Perform the above function, using MMX instructions, on two pixels per pass of the loop.
|
|
Step 4:
|
|
(OnePixelLeft:)
|
|
If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
|
|
**************************************************************************/
|
|
VOID
|
|
mmxPixelBlend16_565(
|
|
PALPHAPIX pDst,
|
|
PALPHAPIX pSrc,
|
|
LONG Width,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
static QWORD RMask = 0x00FF000000FF0000;
|
|
static QWORD GMask = 0x0000000007E007E0;
|
|
static QWORD BMask = 0x0000001F0000001F;
|
|
static QWORD RBConst = 0x0010001000100010;
|
|
static QWORD GConst = 0x0000000000200020;
|
|
static QWORD RBMask = 0x03E003E003E003E0;
|
|
static QWORD GreenMask = 0x000000000FC00FC0;
|
|
static QWORD CA; // ConstAlpha in 4 words of a qword
|
|
BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
|
|
|
|
_asm
|
|
{
|
|
mov ecx, Width // Make sure there is at least one pixel to do
|
|
test ecx, ecx
|
|
jz Done
|
|
|
|
mov esi, pSrc
|
|
mov edi, pDst
|
|
|
|
xor eax, eax
|
|
mov al, ConstAlpha
|
|
movd mm5, eax // | | | | CA |
|
|
punpcklwd mm5, mm5 // | | | CA | CA |
|
|
punpcklwd mm5, mm5 // | CA | CA | CA | CA |
|
|
movq CA, mm5
|
|
// Step 1:
|
|
test edi, 7 // Test first pixel for QWORD alignment
|
|
jz QuadAligned // if unaligned,
|
|
|
|
jmp Do1Pixel // do first pixel only
|
|
|
|
QuadAligned: // Step 2:
|
|
mov eax, ecx // Save the width in eax for later (see OnePixelLeft:)
|
|
shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even
|
|
test ecx, ecx // Make sure there is at least 1 quad to do
|
|
jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned)
|
|
|
|
TwoPixelsAtOnceLoop: // Step 3:
|
|
movd mm0, [edi] // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
|
|
pxor mm7, mm7
|
|
|
|
movd mm1, [esi] // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
|
|
movq mm2, mm0 // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
movq mm3, mm1 // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
|
|
punpcklbw mm0, mm7 // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
punpcklbw mm1, mm7 // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
|
|
movq mm4, mm0 // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
pand mm0, RMask // | D2rrrrr000 | 0 | D1rrrrr000 | 0 |
|
|
movq mm5, mm1 // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
|
|
|
|
pand mm4, BMask // | 0 | D2000bbbbb | 0 | D1000bbbbb |
|
|
psrlw mm0, 3 // | D2rrrrr | 0 | D1rrrrr | 0 |
|
|
|
|
pand mm1, RMask // | S2rrrrr000 | 0 | S1rrrrr000 | 0 |
|
|
por mm0, mm4 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm5, BMask // | 0 | S2bbbbb | 0 | S1bbbbb |
|
|
movq mm4, mm0 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm2, GMask // | 0 | 0 |D2gggggg00000|D1gggggg00000|
|
|
psllw mm4, 5 // |D2rrrrr00000|D2bbbbb00000|D1rrrrr00000|D1bbbbb00000|
|
|
|
|
pand mm3, GMask // | 0 | 0 |S2gggggg00000|S1gggggg00000|
|
|
psrlw mm1, 3 // | S2rrrrr | 0 | S1rrrrr | 0 |
|
|
|
|
por mm5, mm1 // | S2rrrrr | S2bbbbb | S1rrrrr | S1bbbbb |
|
|
movq mm6, mm2 // | 0 | 0 |D2gggggg00000|D1gggggg00000|
|
|
|
|
psubw mm5, mm0 // | S2r-D2r | S2b-D2b | S1r-D1r | S1b-D1b |
|
|
psrlw mm2, 5 // | 0 | 0 | D2gggggg | D1gggggg |
|
|
|
|
pmullw mm5, CA // | CA2r | CA2b | CA1r | CA1b |
|
|
psubw mm4, mm0 // | D2r*31 | D2b*31 | D1r*31 | D1b*31 |
|
|
|
|
paddw mm4, RBConst // | CA2r+c | CA2b+c | CA1r+c | CA1b+c |
|
|
psrlw mm3, 5 // | 0 | 0 | S2gggggg | S1gggggg |
|
|
|
|
psubw mm3, mm2 // | 0 | 0 | S2g-D2g | S1g-D1g |
|
|
add esi, 4 // pSrc++;
|
|
|
|
pmullw mm3, CA // | 0 | 0 | CA2g | CA1g |
|
|
psllw mm6, 1 // | 0 | 0 |D2gggggg000000|D1gggggg000000|
|
|
|
|
paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
psubw mm6, mm2 // | 0 | 0 | D2g*63 | D1g*63 |
|
|
|
|
paddw mm6, GConst // | 0 | 0 | CA2g+c | CA1g+c |
|
|
movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
|
|
add edi, 4 // pDst++;
|
|
psllw mm3, 1 // | 0 | 0 | CA2g*2 | CA1g*2 |
|
|
|
|
pand mm4, RBMask // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
|
|
paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
|
|
|
|
movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
|
|
psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm6, GreenMask // Gtmp2 = Gtmp1 AND FC0h (mask off low 6 bits)
|
|
paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1
|
|
|
|
pand mm1, RBMask // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
|
|
psrlw mm6, 6 // Gtmp2 = Gtmp2 shr 6 (move high 6 bits to low 6 bits)
|
|
|
|
paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1
|
|
psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm5, GreenMask // Gtmp2 = Gtmp2 AND FC0h (mask off low 6 bits)
|
|
movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm4, RMask // Mask to get red
|
|
psrlw mm5, 1 // Align the green
|
|
|
|
pand mm1, BMask // Mask to get blue
|
|
psllw mm4, 3 // Align the red
|
|
|
|
por mm4, mm1 // Combine reds and blues in proper bit location
|
|
|
|
packuswb mm4, mm7 // | 0 | 0 | 0 | 0 | D2rrrrr000 | D2000bbbbb | D1rrrrr000 | D1000bbbbb |
|
|
|
|
por mm4, mm5 // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
movd [edi-4], mm4
|
|
|
|
dec ecx
|
|
jnz TwoPixelsAtOnceLoop
|
|
|
|
OnePixelLeft: // Step 4:
|
|
// This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
|
|
// If 0, there were an even number of pixels and we're done
|
|
// If 1, there is an odd number of pixels and we need to do one more
|
|
test eax, 1
|
|
jz Done
|
|
|
|
Do1Pixel: // make as a macro if used in asm file
|
|
|
|
movzx edx,WORD PTR[edi] ; edx = D 0000 0000 rrrr rggg gggb bbbb
|
|
movzx ebx,WORD PTR[esi] ; ebx = S 0000 0000 rrrr rggg gggb bbbb
|
|
|
|
movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
|
|
pxor mm7, mm7
|
|
|
|
movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
|
|
movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
movq mm3, mm1 // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
|
|
punpcklbw mm0, mm7 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
punpcklbw mm1, mm7 // | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
|
|
movq mm4, mm0 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
pand mm0, RMask // | 0 | 0 | D1rrrrr000 | 0 |
|
|
movq mm5, mm1 // | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
|
|
|
|
pand mm4, BMask // | 0 | 0 | 0 | D1000bbbbb |
|
|
psrlw mm0, 3 // | 0 | 0 | D1rrrrr | 0 |
|
|
|
|
pand mm1, RMask // | 0 | 0 | S1rrrrr000 | 0 |
|
|
por mm0, mm4 // | 0 | 0 | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm5, BMask // | 0 | 0 | 0 | S1bbbbb |
|
|
movq mm4, mm0 // | 0 | 0 | D1rrrrr | D1bbbbb |
|
|
|
|
pand mm2, GMask // | 0 | 0 | 0 |D1gggggg00000|
|
|
psllw mm4, 5 // | 0 | 0 |D1rrrrr00000|D1bbbbb00000|
|
|
|
|
pand mm3, GMask // | 0 | 0 | 0 |S1gggggg00000|
|
|
psrlw mm1, 3 // | 0 | 0 | S1rrrrr | 0 |
|
|
|
|
por mm5, mm1 // | 0 | 0 | S1rrrrr | S1bbbbb |
|
|
movq mm6, mm2 // | 0 | 0 | 0 |D1gggggg00000|
|
|
|
|
psubw mm5, mm0 // | 0 | 0 | S1r-D1r | S1b-D1b |
|
|
psrlw mm2, 5 // | 0 | 0 | 0 | D1gggggg |
|
|
|
|
pmullw mm5, CA // | 0 | 0 | CA1r | CA1b |
|
|
psubw mm4, mm0 // | 0 | 0 | D1r*31 | D1b*31 |
|
|
|
|
paddw mm4, RBConst // | 0 | 0 | CA1r+c | CA1b+c |
|
|
psrlw mm3, 5 // | 0 | 0 | 0 | S1gggggg |
|
|
|
|
psubw mm3, mm2 // | 0 | 0 | 0 | S1g-D1g |
|
|
add esi, 2 // pSrc++;
|
|
|
|
pmullw mm3, CA // | 0 | 0 | 0 | CA1g |
|
|
paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
|
|
psllw mm6, 1 // | 0 | 0 | 0 |D1gggggg000000|
|
|
|
|
psubw mm6, mm2 // | 0 | 0 | 0 | D1g*63 |
|
|
add edi, 2 // pDst++;
|
|
|
|
paddw mm6, GConst // | 0 | 0 | 0 | CA1g+c |
|
|
movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
|
|
|
|
psllw mm3, 1 // | 0 | 0 | 0 | CA1g*2 |
|
|
|
|
pand mm4, RBMask // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
|
|
paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
|
|
|
|
movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
|
|
psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm6, GreenMask // Gtmp2 = Gtmp1 AND FC0h (mask off low 6 bits)
|
|
paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1
|
|
|
|
pand mm1, RBMask // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
|
|
psrlw mm6, 6 // Gtmp2 = Gtmp2 shr 6 (move high 6 bits to low 6 bits)
|
|
|
|
paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1
|
|
psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm5, GreenMask // Gtmp2 = Gtmp2 AND FC0h (mask off low 6 bits)
|
|
movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
|
|
|
|
pand mm4, RMask // Mask to get red
|
|
psrlw mm5, 1 // Align the green
|
|
|
|
pand mm1, BMask // Mask to get blue
|
|
psllw mm4, 3 // Align the red
|
|
|
|
por mm4, mm1 // Combine reds and blues in proper bit location
|
|
|
|
packuswb mm4, mm7 // | 0 | 0 | D1rrrrr000 | D1000bbbbb |
|
|
|
|
por mm4, mm5 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
|
|
|
|
movd edx, mm4
|
|
|
|
mov [edi-2], dx
|
|
|
|
test ecx, ecx
|
|
jz Done // just processed the last pixel of the row
|
|
dec ecx
|
|
jmp QuadAligned // just processed the first pixel of the row
|
|
|
|
Done:
|
|
emms // remove for optimizations, have calling function do emms
|
|
}
|
|
}
|
|
|
|
/**************************************************************************
|
|
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
|
|
|
|
This function operates on 24 bit pixels (8 bits each for Red, Green, and Blue) in a row of a bitmap.
|
|
It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
|
|
The function performs the following on each byte:
|
|
|
|
tmp1 = Alpha(Src - Dst) + 128 + (Dst * 255)
|
|
|
|
tmp2 = tmp1 AND FF00h (mask off low byte)
|
|
tmp2 = tmp2 shr 8 (move high byte to low byte)
|
|
tmp2 = tmp2 + tmp1
|
|
tmp2 = tmp2 AND FF00h (mask off low byte)
|
|
tmp2 = tmp2 shr 8 (move high byte to low byte)
|
|
Dst = tmp2
|
|
|
|
pDst is assumed to be aligned to a DWORD boundary when passed to this function.
|
|
The loop structure is as follows:
|
|
Step 1:
|
|
Multiply width in pixels by 3 to get width in bytes. Byte count is kept in ecx and eax.
|
|
ecx is used as the loop counter.
|
|
Step 2:
|
|
Check pDst for QWORD alignment. If aligned, do Step 3. If unaligned, test to see if there
|
|
are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:) and then do Step 3.
|
|
If no, there are only 3 bytes to do; so do them one at a time (OneToThreeBytesLeft:).
|
|
Step 3:
|
|
(QuadAligned:)
|
|
pDst is QWORD aligned. We want to do 8 bytes (1 quad) at once, so divide byte count by 8 to get loop
|
|
count. If ecx is 0 at this point, there are no more quads to do; so do 0 to 7 bytes (NoQuadsLeft:),
|
|
in Step 5.
|
|
Step 4:
|
|
(Do1QUAD:)
|
|
Perform the above function, using MMX instructions, on 8 bytes per pass of the loop.
|
|
Step 5:
|
|
(NoQuadsLeft:)
|
|
Mask eax with 7 to get the byte count modulo 8, 0 to 7 bytes left. Copy eax into ecx. Test to see
|
|
if there are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:); if no, there are
|
|
only 3 bytes to do, so do them one at a time (OneToThreeBytesLeft:).
|
|
Step 6:
|
|
(Do1DWORD:)
|
|
Perform the above function, using MMX instructions, on 4 bytes. Do Step 3 (QuadAligned:) to see if
|
|
there are more bytes to do.
|
|
Step 7:
|
|
(OneToThreeBytesLeft:)
|
|
Do one byte at a time. This will happen if there are less than 4 bytes left to do.
|
|
**************************************************************************/
|
|
VOID
|
|
mmxPixelBlend24(
|
|
PALPHAPIX pDst,
|
|
PALPHAPIX pSrc,
|
|
LONG Width,
|
|
BLENDFUNCTION BlendFunction,
|
|
PBYTE pwrMask
|
|
)
|
|
{
|
|
static QWORD WordConst = 0x0080008000800080;
|
|
static QWORD WordMask = 0xFF00FF00FF00FF00;
|
|
static QWORD ByteConst = 0x0000000000000080;
|
|
static QWORD ByteMask = 0x000000000000FF00;
|
|
static QWORD CA; // ConstAlpha in 4 words of a qword
|
|
BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
|
|
|
|
_asm
|
|
{
|
|
mov ecx, Width // Make sure there is at least one pixel to do
|
|
test ecx, ecx
|
|
jz Done
|
|
|
|
mov esi, pSrc
|
|
mov edi, pDst
|
|
|
|
xor eax, eax
|
|
mov al, ConstAlpha
|
|
movd mm5, eax // | | | | CA |
|
|
punpcklwd mm5, mm5 // | | | CA | CA |
|
|
punpcklwd mm5, mm5 // | CA | CA | CA | CA |
|
|
movq CA, mm5
|
|
|
|
// Step 1:
|
|
lea ecx, [2*ecx+ecx]// NumPixels * 3 bytes/pixel = NumBytes
|
|
|
|
// Step 2:
|
|
test edi, 7 // Test first pixel for QWORD alignment
|
|
jz QuadAligned // If unaligned,
|
|
|
|
cmp ecx, 4 // test to see if there are 4 bytes to do
|
|
jae Do1DWORD // if yes, do 4 bytes
|
|
jmp OneToThreeBytesLeft// if no, do 1 to 3 bytes
|
|
|
|
QuadAligned: // Step 3:
|
|
mov eax, ecx // Save the width in eax for later (see NoQuadsLeft:)
|
|
shr ecx, 3 // Want to do 8 bytes at once, so divide
|
|
// byte count by 8 to get loop count
|
|
test ecx, ecx // Make sure there is at least 1 QUAD (8 bytes) to do
|
|
jz NoQuadsLeft // If we take this jump, there are 0 to 7 bytes left
|
|
|
|
Do1QUAD: // Step 4:
|
|
// Instructions will pair as shown for the Pentium processor
|
|
movq mm0, [edi] // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
|
|
pxor mm7, mm7
|
|
|
|
movq mm1, [esi] // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 |
|
|
movq mm2, mm0 // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
|
|
|
|
movq mm3, mm1 // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 |
|
|
punpcklbw mm0, mm7 // | D4 | D3 | D2 | D1 |
|
|
|
|
movq mm4, mm0 // | D4 | D3 | D2 | D1 |
|
|
punpcklbw mm1, mm7 // | S4 | S3 | S2 | S1 |
|
|
|
|
punpckhbw mm2, mm7 // | D8 | D7 | D6 | D5 |
|
|
psubw mm1, mm0 // | S4-D4 | S3-D3 | S2-D2 | S1-D1 |
|
|
|
|
pmullw mm1, CA // | CA4 | CA3 | CA2 | CA1 |
|
|
punpckhbw mm3, mm7 // | S8 | S7 | S6 | S5 |
|
|
|
|
psubw mm3, mm2 // | S8-D8 | S7-D7 | S6-D6 | S5-D5 |
|
|
movq mm6, mm2 // | D8 | D7 | D6 | D5 |
|
|
|
|
pmullw mm3, CA // | CA8 | CA7 | CA6 | CA5 |
|
|
psllw mm4, 8 // | D4*128 | D3*128 | D2*128 | D1*128 |
|
|
|
|
psllw mm6, 8 // | D8*128 | D7*128 | D6*128 | D5*128 |
|
|
psubw mm4, mm0 // | D4*127 | D3*127 | D2*127 | D1*127 |
|
|
|
|
paddw mm4, WordConst // | D4*127+C| D3*127+C| D2*127+C| D1*127+C|
|
|
psubw mm6, mm2 // | D8*127 | D7*127 | D6*127 | D5*127 |
|
|
|
|
paddw mm6, WordConst // | D8*127+C| D7*127+C| D6*127+C| D5*127+C|
|
|
paddw mm4, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
|
|
|
|
paddw mm6, mm3 // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 127)
|
|
movq mm3, mm4 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
|
|
|
|
pand mm4, WordMask // tmp3 = tmp1 AND FF00h (mask off low bytes)
|
|
movq mm5, mm6 // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 127)
|
|
|
|
pand mm6, WordMask // tmp4 = tmp2 AND FF00h (mask off low bytes)
|
|
psrlw mm4, 8 // tmp3 = tmp3 shr 8 (move high byte to low byte)
|
|
|
|
psrlw mm6, 8 // tmp4 = tmp4 shr 8 (move high byte to low byte)
|
|
paddw mm4, mm3 // tmp3 = tmp3 + tmp1
|
|
|
|
pand mm4, WordMask // tmp3 = tmp3 AND FF00h (mask off low bytes)
|
|
paddw mm6, mm5 // tmp4 = tmp4 + tmp2
|
|
|
|
pand mm6, WordMask // tmp4 = tmp4 AND FF00h (mask off low bytes)
|
|
psrlw mm4, 8 // tmp3 = tmp3 shr 8 (move high byte to low byte)
|
|
|
|
psrlw mm6, 8 // tmp4 = tmp4 shr 8 (move high byte to low byte)
|
|
add edi, 8 // pDst++;
|
|
|
|
packuswb mm4, mm6 // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
|
|
add esi, 8 // pSrc++;
|
|
|
|
movq [edi-8], mm4
|
|
|
|
dec ecx
|
|
jnz Do1QUAD
|
|
|
|
NoQuadsLeft: // Step 5:
|
|
// This tests for 0 to 7 bytes left in row - eax contains initial byte count
|
|
and eax, 7 // 0 to 7 bytes left to do
|
|
jz Done
|
|
cmp eax, 4 // Test to see if there are 4 bytes to do
|
|
mov ecx, eax
|
|
jae Do1DWORD // if yes, do 4 bytes
|
|
jmp OneToThreeBytesLeft // if no, do 1 to 3 bytes
|
|
|
|
// Step 6:
|
|
Do1DWORD: // make as a macro if used in asm file
|
|
movd mm0, [edi] // | 0 | 0 | 0 | 0 | D4 | D3 | D2 | D1 |
|
|
pxor mm7, mm7
|
|
|
|
movd mm1, [esi] // | 0 | 0 | 0 | 0 | S4 | S3 | S2 | S1 |
|
|
punpcklbw mm0, mm7 // | D4 | D3 | D2 | D1 |
|
|
|
|
movq mm4, mm0 // | D4 | D3 | D2 | D1 |
|
|
punpcklbw mm1, mm7 // | S4 | S3 | S2 | S1 |
|
|
|
|
psllw mm4, 8 // | D4*128 | D3*128 | D2*128 | D1*128 |
|
|
psubw mm1, mm0 // | S4-D4 | S3-D3 | S2-D2 | S1-D1 |
|
|
|
|
pmullw mm1, CA // | CA4 | CA3 | CA2 | CA1 |
|
|
psubw mm4, mm0 // | D4*127 | D3*127 | D2*127 | D1*127 |
|
|
|
|
paddw mm4, WordConst // | D4*127+C| D3*127+C| D2*127+C| D1*127+C|
|
|
|
|
paddw mm4, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
|
|
|
|
movq mm3, mm4 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
|
|
|
|
pand mm4, WordMask // tmp2 = tmp1 AND FF00h (mask off low bytes)
|
|
|
|
psrlw mm4, 8 // tmp2 = tmp2 shr 8 (move high byte to low byte)
|
|
|
|
paddw mm4, mm3 // tmp2 = tmp2 + tmp1
|
|
|
|
pand mm4, WordMask // tmp2 = tmp2 AND FF00h (mask off low bytes)
|
|
|
|
psrlw mm4, 8 // tmp2 = tmp2 shr 8 (move high byte to low byte)
|
|
add edi, 4 // pDst++;
|
|
|
|
packuswb mm4, mm4 // | D4 | D3 | D2 | D1 | D4 | D3 | D2 | D1 |
|
|
add esi, 4 // pSrc++;
|
|
|
|
movd [edi-4], mm4
|
|
|
|
sub ecx, 4 // Just did 4 bytes at the beginning or end of a scan line
|
|
jmp QuadAligned // Jump to QuadAligned to determine if there are more bytes to do
|
|
|
|
OneToThreeBytesLeft: // Step 7:
|
|
|
|
movzx edx,BYTE PTR[edi] ; edx = Dest Byte
|
|
movzx ebx,BYTE PTR[esi] ; ebx = Src Byte
|
|
|
|
movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Db |
|
|
pxor mm7, mm7
|
|
|
|
movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Sb |
|
|
movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Db |
|
|
|
|
psllw mm2, 8 // | 0 | 0 | 0 | 0 | 0 | 0 | Db| 0 |
|
|
|
|
psubw mm1, mm0 // | 0 | 0 | 0 | Sb-Db |
|
|
|
|
pmullw mm1, CA // | 0 | 0 | 0 | CAb |
|
|
psubw mm2, mm0 // | 0 | 0 | 0 | Db*127|
|
|
|
|
paddw mm2, ByteConst // | 0 | 0 | 0 |Db*127+128|
|
|
|
|
paddw mm1, mm2 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
|
|
|
|
movq mm2, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
|
|
|
|
pand mm2, ByteMask // tmp2 = tmp1 AND FF00h
|
|
|
|
psrlw mm2, 8 // tmp2 = tmp2 shr 8
|
|
|
|
paddw mm2, mm1 // tmp2 = tmp2 + tmp1
|
|
|
|
pand mm2, ByteMask // tmp2 = tmp2 AND FF00h
|
|
|
|
psrlw mm2, 8 // tmp2 = tmp2 shr 8
|
|
|
|
movd edx, mm2
|
|
|
|
mov BYTE PTR[edi], dl
|
|
|
|
inc edi
|
|
inc esi
|
|
|
|
dec ecx
|
|
jnz OneToThreeBytesLeft
|
|
|
|
Done:
|
|
emms // remove for optimizations, have calling function do emms
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/******************************Public*Routine******************************\
|
|
* AlphaScanLineBlend
|
|
*
|
|
* Blends source and destionation surfaces one scan line at a time.
|
|
*
|
|
* Allocate a scan line buffer for xlate of src to 32BGRA if needed.
|
|
* Allocate a scan line buffer for xlate of dst to 32BGRA if needed.
|
|
* Blend scan line using blend function from pAlphaDispatch
|
|
* Write scan line back to dst (if needed)
|
|
*
|
|
* Arguments:
|
|
*
|
|
* pDst - pointer to dst surface
|
|
* pDstRect - Dst output rect
|
|
* DeltaDst - dst scan line delat
|
|
* pSrc - pointer to src surface
|
|
* DeltaSrc - src scan line delta
|
|
* pptlSrc - src offset
|
|
* pxloSrcTo32 - xlateobj from src to 32BGR
|
|
* pxlo32ToDst - xlateobj from 32BGR to dst
|
|
* palDst - destination palette
|
|
* palSrc - source palette
|
|
* pAlphaDispatch - blend data and function pointers
|
|
*
|
|
* Return Value:
|
|
*
|
|
* ALPHA_COMPLETE: success, written to destination
|
|
* ALPHA_SEND_TEMP: success, must write tmp bmp to dest
|
|
* ALPHA_FAIL: error
|
|
*
|
|
* History:
|
|
*
|
|
* 10/14/1996 Mark Enstrom [marke]
|
|
*
|
|
\**************************************************************************/
|
|
|
|
ULONG
|
|
AlphaScanLineBlend(
|
|
PBYTE pDst,
|
|
PRECTL pDstRect,
|
|
ULONG DeltaDst,
|
|
PBYTE pSrc,
|
|
ULONG DeltaSrc,
|
|
PPOINTL pptlSrc,
|
|
PALPHA_DISPATCH_FORMAT pAlphaDispatch,
|
|
PDIBINFO pDibInfoSrc,
|
|
PDIBINFO pDibInfoDst
|
|
)
|
|
{
|
|
//
|
|
// get two scanlines of RGBA data, blend pixels, store
|
|
//
|
|
|
|
LONG cx = pDstRect->right - pDstRect->left;
|
|
LONG cy = pDstRect->bottom - pDstRect->top;
|
|
LONG ScanBufferWidth = cx * 4;
|
|
LONG WriteMaskSize = cx;
|
|
LONG AllocationSize = 0;
|
|
ULONG ulSrcBytesPerPixel = pAlphaDispatch->ulSrcBitsPerPixel/8;
|
|
ULONG ulDstBytesPerPixel = pAlphaDispatch->ulDstBitsPerPixel/8;
|
|
PBYTE pjSrcTempScanBuffer = NULL;
|
|
PBYTE pjDstTempScanBuffer = NULL;
|
|
PBYTE pjAlloc = NULL;
|
|
PBYTE pjDstTmp;
|
|
PBYTE pjSrcTmp;
|
|
PBYTE pWriteMask;
|
|
LONG lRet = ALPHA_SEND_TEMP;
|
|
|
|
HDC hdc32 = NULL;
|
|
PULONG pulDIBSrc = NULL;
|
|
|
|
//
|
|
// if there is a temp dst needed, use dc allocator
|
|
//
|
|
|
|
if (pAlphaDispatch->pfnLoadDstAndConvert != NULL)
|
|
{
|
|
hdc32 = hdcAllocateScanLineDC(cx,&pulDIBSrc);
|
|
|
|
if (hdc32 == NULL)
|
|
{
|
|
return(ALPHA_FAIL);
|
|
}
|
|
|
|
//
|
|
// set temp scan line
|
|
//
|
|
|
|
pjDstTempScanBuffer = (PBYTE)pulDIBSrc;
|
|
}
|
|
|
|
//
|
|
// calculate destination starting address
|
|
//
|
|
|
|
if (ulDstBytesPerPixel)
|
|
{
|
|
pjDstTmp = pDst + ulDstBytesPerPixel * pDstRect->left + DeltaDst * pDstRect->top;
|
|
}
|
|
else if (pAlphaDispatch->ulDstBitsPerPixel == 1)
|
|
{
|
|
pjDstTmp = pDst + pDstRect->left/8 + DeltaDst * pDstRect->top;
|
|
}
|
|
else
|
|
{
|
|
pjDstTmp = pDst + pDstRect->left/2 + DeltaDst * pDstRect->top;
|
|
}
|
|
|
|
//
|
|
// calculate source starting address
|
|
//
|
|
|
|
if (ulSrcBytesPerPixel)
|
|
{
|
|
pjSrcTmp = pSrc + ulSrcBytesPerPixel * pptlSrc->x + DeltaSrc * pptlSrc->y;
|
|
}
|
|
else if (pAlphaDispatch->ulSrcBitsPerPixel == 1)
|
|
{
|
|
pjSrcTmp = pSrc + pptlSrc->x/8 + DeltaSrc * pptlSrc->y;
|
|
}
|
|
else
|
|
{
|
|
pjSrcTmp = pSrc + pptlSrc->x/2 + DeltaSrc * pptlSrc->y;
|
|
}
|
|
|
|
//
|
|
// calculate size of needed scan line buffer
|
|
//
|
|
|
|
if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL)
|
|
{
|
|
AllocationSize += ScanBufferWidth;
|
|
}
|
|
|
|
AllocationSize += WriteMaskSize;
|
|
|
|
//
|
|
// allocate scan line buffer memory
|
|
//
|
|
|
|
pWriteMask = (PBYTE)LOCALALLOC(AllocationSize);
|
|
|
|
if (pWriteMask != NULL)
|
|
{
|
|
//
|
|
// calc offsets
|
|
//
|
|
|
|
PBYTE pjTemp = pWriteMask + WriteMaskSize;
|
|
|
|
if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL)
|
|
{
|
|
pjSrcTempScanBuffer = pjTemp;
|
|
pjTemp += ScanBufferWidth;
|
|
|
|
}
|
|
|
|
//
|
|
// Blend scan lines
|
|
//
|
|
|
|
LONG yScan = 0;
|
|
|
|
while (cy--)
|
|
{
|
|
PBYTE pjSource = pjSrcTmp;
|
|
PBYTE pjDest = pjDstTmp;
|
|
|
|
//
|
|
// get src scan line if needed
|
|
//
|
|
|
|
if (pjSrcTempScanBuffer)
|
|
{
|
|
(*pAlphaDispatch->pfnLoadSrcAndConvert)(
|
|
(PULONG)pjSrcTempScanBuffer,
|
|
pjSrcTmp,
|
|
0,
|
|
cx,
|
|
(PVOID)pDibInfoSrc);
|
|
|
|
pjSource = pjSrcTempScanBuffer;
|
|
}
|
|
|
|
//
|
|
// get dst scan line if needed
|
|
//
|
|
|
|
if (pjDstTempScanBuffer)
|
|
{
|
|
(*pAlphaDispatch->pfnLoadDstAndConvert)(
|
|
(PULONG)pjDstTempScanBuffer,
|
|
pjDstTmp,
|
|
0,
|
|
cx,
|
|
(PVOID)pDibInfoDst);
|
|
|
|
pjDest = pjDstTempScanBuffer;
|
|
}
|
|
|
|
//
|
|
// blend
|
|
//
|
|
|
|
memset(pWriteMask,1,WriteMaskSize);
|
|
|
|
(*pAlphaDispatch->pfnGeneralBlend)(
|
|
(PALPHAPIX)pjDest,
|
|
(PALPHAPIX)pjSource,
|
|
cx,
|
|
pAlphaDispatch->BlendFunction,
|
|
pWriteMask
|
|
);
|
|
|
|
//
|
|
// write buffer back if needed
|
|
//
|
|
|
|
if (pjDstTempScanBuffer)
|
|
{
|
|
(*pAlphaDispatch->pfnConvertAndStore)(
|
|
pjDstTmp,
|
|
(PULONG)pjDstTempScanBuffer,
|
|
cx,
|
|
0,
|
|
yScan,
|
|
(PVOID)pDibInfoDst,
|
|
pWriteMask,
|
|
hdc32
|
|
);
|
|
}
|
|
|
|
pjDstTmp += DeltaDst;
|
|
pjSrcTmp += DeltaSrc;
|
|
yScan++;
|
|
}
|
|
|
|
//
|
|
// free any temp buffer memory
|
|
//
|
|
|
|
LOCALFREE(pWriteMask);
|
|
}
|
|
else
|
|
{
|
|
lRet = ALPHA_FAIL;
|
|
}
|
|
|
|
if (hdc32)
|
|
{
|
|
vFreeScanLineDC(hdc32);
|
|
}
|
|
|
|
if (
|
|
(lRet != ALPHA_FAIL) &&
|
|
(pAlphaDispatch->pfnConvertAndStore == vConvertAndSaveBGRAToDest)
|
|
)
|
|
{
|
|
lRet = ALPHA_COMPLETE;
|
|
}
|
|
|
|
return(lRet);
|
|
}
|
|
|
|
#endif
|