/******************************Module*Header*******************************\ * Module Name: tranblt.cxx * * Transparent BLT * * Created: 21-Jun-1996 * Author: Mark Enstrom [marke] * * Copyright (c) 1996-1999 Microsoft Corporation \**************************************************************************/ #include "precomp.hxx" #pragma hdrstop #if !(_WIN32_WINNT >= 0x500) // // global memory DC with single scan line 32 bpp DIBSection, // use protected by semLocal // HDC ghdc32Tmp; HDC ghdc32; PULONG gpulDIB32; /**************************************************************************\ * bInitAlpha * * Init global scan line DC * * Arguments: * * none * * Return Value: * * status * * History: * * 4/30/1997 Mark Enstrom [marke] * \**************************************************************************/ BOOL bInitAlpha() { BOOL bRet = TRUE; BITMAPINFO bmi32; bmi32.bmiHeader.biSize = sizeof(BITMAPINFOHEADER); bmi32.bmiHeader.biWidth = SCAN_LINE_DC_WIDTH; bmi32.bmiHeader.biHeight = 1; bmi32.bmiHeader.biPlanes = 1; bmi32.bmiHeader.biBitCount = 32; bmi32.bmiHeader.biCompression = BI_RGB; bmi32.bmiHeader.biSizeImage = 0; bmi32.bmiHeader.biXPelsPerMeter = 0; bmi32.bmiHeader.biYPelsPerMeter = 0; bmi32.bmiHeader.biClrUsed = 0; bmi32.bmiHeader.biClrImportant = 0; HDC hdc32 = CreateCompatibleDC(NULL); if (hdc32 != NULL) { PULONG pulDIBSrc; HBITMAP hbmSrc = CreateDIBSection(hdc32,&bmi32,DIB_RGB_COLORS,(PVOID *)&pulDIBSrc,NULL,0); if (hbmSrc) { HBITMAP hbmOld = (HBITMAP)SelectObject(hdc32,hbmSrc); if (hbmOld != NULL) { ghdc32 = hdc32; ghdc32Tmp = hdc32; gpulDIB32 = pulDIBSrc; } else { DeleteDC(hdc32); DeleteObject(hbmSrc); bRet = FALSE; } } else { DeleteDC(hdc32); bRet = FALSE; } } else { bRet = FALSE; } return(bRet); } /**************************************************************************\ * CleanupGlobals * * Free any global DIBsections, DCs, etc. from initialization. * * Arguments: * none. * * Return Value: * none. * * History: * * 1/19/2000 Donald Chinn [DChinn] * \**************************************************************************/ VOID CleanupGlobals() { ASSERTGDI(ghdc32 == ghdc32Tmp, "ghdc32Tmp is still being used."); if (!ghdc32) { DeleteDC(ghdc32); } if (!gpulDIB32) { DeleteObject(gpulDIB32); } return; } /**************************************************************************\ * hdcAllocateScanLineDC * * allocate tmp scan line DC. try to use fast allocator. * * Arguments: * * hdcComp - hdc for compatible bitmap * width - width of scan line * pulScanLine - return pointer to temp scan line * * Return Value: * * * * History: * * 4/30/1997 Mark Enstrom [marke] * \**************************************************************************/ HDC hdcAllocateScanLineDC( LONG width, PULONG *pulScanLine ) { ASSERTGDI(pulScanLine != NULL,"Scan line pointer must not be NULL"); HDC hdcRet = NULL; // // try to acquire global scan line DC // if (width <= SCAN_LINE_DC_WIDTH) { hdcRet = (HDC)InterlockedExchange((PLONG)&ghdc32Tmp, 0); } if (hdcRet != NULL) { // // allocation succeded, assign pointer // *pulScanLine = gpulDIB32; } else { // // if acquire failed, allocate // BITMAPINFO bmi32; bmi32.bmiHeader.biSize = sizeof(BITMAPINFOHEADER); bmi32.bmiHeader.biWidth = width; bmi32.bmiHeader.biHeight = 1; bmi32.bmiHeader.biPlanes = 1; bmi32.bmiHeader.biBitCount = 32; bmi32.bmiHeader.biCompression = BI_RGB; bmi32.bmiHeader.biSizeImage = 0; bmi32.bmiHeader.biXPelsPerMeter = 0; bmi32.bmiHeader.biYPelsPerMeter = 0; bmi32.bmiHeader.biClrUsed = 0; bmi32.bmiHeader.biClrImportant = 0; HDC hdc32 = CreateCompatibleDC(NULL); if (hdc32 != NULL) { PULONG pulDIBSrc; HBITMAP hbmSrc = CreateDIBSection(hdc32,&bmi32,DIB_RGB_COLORS,(PVOID *)&pulDIBSrc,NULL,0); if (hbmSrc) { HBITMAP hbmOld = (HBITMAP)SelectObject(hdc32,hbmSrc); if (hbmOld != NULL) { hdcRet = hdc32; *pulScanLine = pulDIBSrc; } else { DeleteDC(hdc32); DeleteObject(hbmSrc); } } else { DeleteDC(hdc32); } } } return(hdcRet); } /**************************************************************************\ * vFreeScanLineDC * * free tmp scan line dc and dibsection * * Arguments: * * hdcFree - scan line DC * * Return Value: * * none * * History: * * 4/30/1997 Mark Enstrom [marke] * \**************************************************************************/ VOID vFreeScanLineDC( HDC hdcFree ) { ASSERTGDI(hdcFree != NULL,"vFreeScanLineDC: DC can't be NULL"); if (hdcFree == ghdc32) { // // release global hdc // ghdc32Tmp = ghdc32; } else { // // free allocated DC and bitmap // HBITMAP hbmOld = (HBITMAP)GetCurrentObject(hdcFree,OBJ_BITMAP); DeleteDC(hdcFree); if (hbmOld) { DeleteObject(hbmOld); } } } /**************************************************************************\ * vPixelOver * * optimized routine used when the blend function is SRC_OVER and the * SourceConstantAlpha is 255. * * Dst = Src + (1-SrcAlpha) * Dst * * Arguments: * * ppixDst - address of dst pixel * ppixSrc - address of src pixel * cx - number of pixels in scan line * BlendFunction - blend to be done on each pixel * pwrMask - set each byte to 0 for pixel that doesn't need * to be written to dst * * Return Value: * * none * * History: * * 1/23/1997 Mark Enstrom [marke] * \**************************************************************************/ #if !defined(_X86_) VOID vPixelOver( ALPHAPIX *ppixDst, ALPHAPIX *ppixSrc, LONG cx, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { ALPHAPIX pixSrc; ALPHAPIX pixDst; while (cx--) { pixSrc = *ppixSrc; if (pixSrc.pix.a != 0) { pixDst = *ppixDst; if (pixSrc.pix.a == 255) { pixDst = pixSrc; } else { // // Dst = Src + (1-SrcAlpha) * Dst // ULONG Multa = 255 - pixSrc.pix.a; ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8; ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff); ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080; ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080; ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8; ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8; ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00; ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8; pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB; } *ppixDst = pixDst; } else { *pwrMask = 0; } pwrMask++; ppixSrc++; ppixDst++; } } #endif /**************************************************************************\ * vPixelBlendOrDissolveOver * * Blend routine when the blend function is SRC_OVER, but when * SourceConstantAlpah != 255 and The source bitmap does have alpha values * * if SrcAlpha == 255 then * (Blend) * Dst = Dst + ConstAlpha * (Src - Dst) * * else * (Dissolve) * Src = Src * ConstAlpha * (Over) * Dst = Src + (1 - SrcAlpha) Dst * * Arguments: * * ppixDst - address of dst pixel * ppixSrc - address of src pixel * cx - number of pixels in scan line * BlendFunction - blend to be done on each pixel * pwrMask - set each byte to 0 for pixel that doesn't need * to be written to dst * * Return Value: * * None * * History: * * 3/12/1997 Mark Enstrom [marke] * \**************************************************************************/ VOID vPixelBlendOrDissolveOver( ALPHAPIX *ppixDst, ALPHAPIX *ppixSrc, LONG cx, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { ALPHAPIX pixSrc; ALPHAPIX pixDst; BYTE ConstAlpha = BlendFunction.SourceConstantAlpha; while (cx--) { pixSrc = *ppixSrc; if (pixSrc.pix.a != 0) { pixDst = *ppixDst; if (pixSrc.pix.a == 255) { // // Blend: D = sA * S + (1-sA) * D // // red and blue // ULONG uB00rr00bb = pixDst.ul & 0x00ff00ff; ULONG uF00rr00bb = pixSrc.ul & 0x00ff00ff; ULONG uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) + (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080; ULONG uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8; ULONG uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8; // // alpha and green // ULONG uB00aa00gg = (pixDst.ul >> 8) & 0xff00ff; ULONG uF00aa00gg = (pixSrc.ul >> 8) & 0xff00ff; ULONG uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) + (ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080; ULONG uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8; ULONG uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00; pixDst.ul = uD00rr00bb + uDaa00gg00; } else { // // disolve // ULONG ul_B_00AA00GG = (pixSrc.ul & 0xff00ff00) >> 8; ULONG ul_B_00RR00BB = (pixSrc.ul & 0x00ff00ff); ULONG ul_T_AAAAGGGG = ul_B_00AA00GG * ConstAlpha + 0x00800080; ULONG ul_T_RRRRBBBB = ul_B_00RR00BB * ConstAlpha + 0x00800080; ULONG ul_T_00AA00GG = (ul_T_AAAAGGGG & 0xFF00FF00) >> 8; ULONG ul_T_00RR00BB = (ul_T_RRRRBBBB & 0xFF00FF00) >> 8; ULONG ul_C_AA00GG00 = ((ul_T_AAAAGGGG + ul_T_00AA00GG) & 0xFF00FF00); ULONG ul_C_00RR00BB = ((ul_T_RRRRBBBB + ul_T_00RR00BB) & 0xFF00FF00) >> 8; pixSrc.ul = (ul_C_AA00GG00 | ul_C_00RR00BB); // // over // ULONG Multa = 255 - pixSrc.pix.a; ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8; ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff); ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080; ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080; ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8; ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8; ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00; ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8; pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB; } *ppixDst = pixDst; } else { *pwrMask = 0; } pwrMask++; ppixSrc++; ppixDst++; } } #if !defined(_X86_) /******************************Public*Routine******************************\ * vPixelBlend * * Blend function used then BlendFunction is SRC_OVER and * SourceConstantAlpha != 255, and Src image does NOT have * it's own alpha channel. (assume 255) * * Dst = Dst + ConstAlpha * (Src - Dst) * * Arguments: * * ppixDst - address of dst pixel * ppixSrc - address of src pixel * cx - number of pixels in scan line * BlendFunction - blend to be done on each pixel * pwrMask - set each byte to 0 for pixel that doesn't need * to be written to dst * * Return Value: * * None * * History: * * 12/2/1996 Mark Enstrom [marke] * \**************************************************************************/ VOID vPixelBlend( ALPHAPIX *ppixDst, ALPHAPIX *ppixSrc, LONG cx, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { PULONG pulSrc = (PULONG)ppixSrc; PULONG pulDst = (PULONG)ppixDst; PULONG pulSrcEnd = pulSrc + cx; BYTE ConstAlpha = BlendFunction.SourceConstantAlpha; // // Blend: D = sA * S + (1-sA) * D // while (pulSrc != pulSrcEnd) { ULONG ulDst = *pulDst; ULONG ulSrc = *pulSrc; ULONG uB00rr00bb = ulDst & 0x00ff00ff; ULONG uF00rr00bb = ulSrc & 0x00ff00ff; ULONG uMrrrrbbbb; ULONG uM00rr00bb; ULONG uD00rr00bb; ULONG uB00aa00gg; ULONG uF00aa00gg; ULONG uMaaaagggg; ULONG uM00aa00gg; ULONG uDaa00gg00; // // red and blue // uB00rr00bb = ulDst & 0x00ff00ff; uF00rr00bb = ulSrc & 0x00ff00ff; uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) + (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080; uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8; uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8; // // alpha and green // uB00aa00gg = (ulDst >> 8) & 0xff00ff; uF00aa00gg = (ulSrc >> 8) & 0xff00ff; uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) + (ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080; uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8; uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00; *pulDst = uD00rr00bb + uDaa00gg00; pulSrc++; pulDst++; } } #endif /******************************Public*Routine******************************\ * vPixelBlend24 * * Blend two 24 bpp images with a constant alpha value * * Arguments: * * pixDst, * pixSrc, * cx, * BlendFunction * pwrMask * * Return Value: * * * * History: * * 12/2/1996 Mark Enstrom [marke] * \**************************************************************************/ VOID vPixelBlend24( ALPHAPIX *ppixDst, ALPHAPIX *ppixSrc, LONG cx, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { BYTE ConstAlpha = BlendFunction.SourceConstantAlpha; PBYTE pjSrc = (PBYTE)ppixSrc; PBYTE pjDst = (PBYTE)ppixDst; PBYTE pjSrcEnd = pjSrc + 3*cx; while (pjSrc != pjSrcEnd) { ULONG ulDst = (*pjDst) << 16; ULONG ulSrc = (*pjSrc) << 16; ULONG uB00rr00bb; ULONG uF00rr00bb; ULONG uMrrrrbbbb; ULONG uM00rr00bb; ULONG uD00rr00bb; ULONG uB000000gg; ULONG uF000000gg; ULONG uM0000gggg; ULONG uM000000gg; ULONG uD000000gg; // // red and blue // uB00rr00bb = uB00rr00bb = ulDst | (*(pjDst+1)); uF00rr00bb = uF00rr00bb = ulSrc | (*(pjSrc+1)); uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) + (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080; uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8; uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8; // // green // uB000000gg = *(pjDst+2); uF000000gg = *(pjSrc+2); uM0000gggg = ((uB000000gg <<8)-uB000000gg) + (ConstAlpha * (uF000000gg-uB000000gg)) + 0x00000080; uM000000gg = (uM0000gggg & 0x0000ff00)>>8; uD000000gg = ((uM0000gggg + uM000000gg) & 0x0000ff00) >> 8; *pjDst = (BYTE)(uD00rr00bb >> 16); *(pjDst+1) = (BYTE)(uD00rr00bb); *(pjDst+2) = (BYTE)(uD000000gg); pjSrc+=3; pjDst+=3; } } #if defined(_X86_) typedef unsigned __int64 QWORD; /************************************************************************** THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION DO NOT CALL THIS FUNCTION WITH WIDTH == 0 This function operates on 32 bit pixels (BGRA) in a row of a bitmap. This function performs the following: SrcTran = 255 - pixSrc.a pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255); pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255); pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255); pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255); pDst is assumed to be aligned to a DWORD boundary when passed to this function. Step 1: Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel as a DWORD, then do Step 2. Step 2: QuadAligned pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one pixel left, do as a DWORD. Step 3: Load two source pixels, S1 and S2. Get (255 - alpha value) for each source pixel, 255-S1a and 255-S2a. Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register. Load two destination pixels, D1 and D2. Expand each byte in D1 into four words of an MMX register. If at least four pixels can be done, do Step 4. If not, jump over FourPixelsPerPass and finish doing two pixels at TwoPixelsLeft, Step 5. Step 4: FourPixelsPerPass Expand each byte in D2 into four words of an MMX register. Multiply each byte of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate result of both pixels. Copy the results of each pixel into an MMX register. Shift each result of both pixels by 8. Add the shifted results to the copied results. Shift these results by 8. Pack the results into one MMX register. Add the packed results to the source pixels. Store result over destination pixels. Stay in FourPixelsPerPass loop until there are less than four pixels to do. Step 5: TwoPixelsLeft Do same as Step 4 above; but do not loop. Step 6: OnePixelLeft If there is one pixel left (odd number of original pixels) do last pixel as a DWORD. **************************************************************************/ VOID mmxPixelOver( ALPHAPIX *pDst, ALPHAPIX *pSrc, LONG Width, BLENDFUNCTION BlendFunction, PBYTE pwrMask) { static QWORD W128 = 0x0080008000800080; static QWORD AlphaMask = 0x000000FF000000FF; _asm { mov esi, pSrc mov edi, pDst movq mm7, W128 // | 0 | 128 | 0 | 128 | 0 | 128 | 0 | 128 | // This register never changes pxor mm6, mm6 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | // This register never changes mov ecx, Width // Step 1: test edi, 7 // Test first pixel for QWORD alignment jz QuadAligned // if unaligned, jmp Do1Pixel // do first pixel only QuadAligned: // Step 2: mov eax, ecx // Save the width in eax for later (see OnePixelLeft:) shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even test ecx, ecx // Make sure there is at least 1 quad to do jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned) // Step 3: movq mm0, [esi] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b | psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a | pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a| movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a| punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a | movq mm2, [edi] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b | punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a | movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b | punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a | punpckldq mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a | punpcklbw mm2, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b | dec ecx jz TwoPixelsLeft FourPixelsPerPass: // Step 4: // Indenting indicates operations on the next set of pixels // Within this loop, instructions will pair as shown for the Pentium processor // T1 = 255-S1a T2 = 255-S2a punpckhbw mm3, mm6 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b | pmullw mm2, mm0 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b | movq mm0, [esi+8] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b | pmullw mm3, mm1 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b | psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a | add esi, 8 // pSrc++; pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a| paddusw mm2, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 | paddusw mm3, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 | movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a| movq mm4, mm2 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 | punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a | movq mm5, mm3 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 | punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a | // TDXx' = TX*DXx+128 psrlw mm2, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 | // TDXx" = (TX*DXx+128)+(TDXx'>>8) psrlw mm3, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 | paddusw mm4, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" | paddusw mm5, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" | psrlw mm4, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 | movq mm2, [edi+8] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b | psrlw mm5, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 | movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b | packuswb mm4, mm5 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"| paddusb mm4, [esi-8] punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a | movq [edi], mm4 punpckldq mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a | punpcklbw mm2, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b | add edi, 8 // pDst++; dec ecx jnz FourPixelsPerPass TwoPixelsLeft: // Step 5: punpckhbw mm3, mm6 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b | pmullw mm2, mm0 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b | pmullw mm3, mm1 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b | paddusw mm2, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 | paddusw mm3, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 | movq mm4, mm2 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 | movq mm5, mm3 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 | psrlw mm2, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 | psrlw mm3, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 | paddusw mm4, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" | paddusw mm5, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" | psrlw mm4, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 | psrlw mm5, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 | packuswb mm4, mm5 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"| paddusb mm4, [esi] movq [edi], mm4 add edi, 8 add esi, 8 OnePixelLeft: // Step 6: // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2 // If 0, there were an even number of pixels and we're done // If 1, there is an odd number of pixels and we need to do one more test eax, 1 jz Done Do1Pixel: // make as a macro if used in asm file // T = 255-S1x movd mm0, DWORD PTR[esi] // | 0 | 0 | 0 | 0 | S1a | S1r | S1g | S1b | psrld mm0, 24 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | S1a | pxor mm0, AlphaMask // | 0 | 0 | 0 | 255 | 0 | 0 | 0 |255-S1a| punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a | punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a | movd mm1, [edi] // | 0 | 0 | 0 | 0 | D1a | D1r | D1g | D1b | punpcklbw mm1, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b | pmullw mm0, mm1 // | T*D1a | T*D1r | T*D1g | T*D1b | paddusw mm0, mm7 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 | movq mm1, mm0 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 | psrlw mm0, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 | paddusw mm0, mm1 // | TD1a" | TD1r" | TD1g" | TD1b" | psrlw mm0, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 | movd mm1, [esi] packuswb mm0, mm0 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"| paddusb mm0, mm1 movd [edi], mm0 add edi, 4 // pDst++; add esi, 4 // pSrc++; test ecx, ecx jz Done // just processed the last pixel of the row dec ecx jmp QuadAligned // just processed the first pixel of the row Done: emms // remove for optimizations, have calling function do emms } } /**************************************************************************\ * mmxPixelBlendOrDissolveOver * * Blend routine when the blend function is SRC_OVER, but when * SourceConstantAlpah != 255 and The source bitmap does have alpha values * * if SrcAlpha == 255 then * * Dst = Dst + ConstAlpha * (Src - Dst) * * else * * Src = Src * ConstAlpha * Dst = Src + (1 - SrcAlpha) Dst * * Arguments: * * ppixDst - address of dst pixel * ppixSrc - address of src pixel * cx - number of pixels in scan line * BlendFunction - blend to be done on each pixel * pwrMask - set each byte to 0 for pixel that doesn't need * to be written to dst * * Return Value: * * None * * History: * * 3/12/1997 Mark Enstrom [marke] * \**************************************************************************/ /************************************************************************** THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION DO NOT CALL THIS FUNCTION WITH WIDTH == 0 This function operates on 32 bit pixels (BGRA) in a row of a bitmap. This function performs the following: first, pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255); pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255); pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255); pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255); then, SrcTran = 255 - pixSrc.a pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255); pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255); pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255); pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255); pDst is assumed to be aligned to a DWORD boundary when passed to this function. Step 1: Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel as a DWORD, then do Step 2. Step 2: QuadAligned pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one pixel left, do as a DWORD. Step 3: Load two source pixels, S1 and S2, as one QWORD. Expand S1 and S2 as four words into two MMX registers. Multiply each word in S1 and S2 by ConstAlpha. Add 128 to each result of both pixels. Copy the results of each pixel into an MMX register. Shift each result of both pixels by 8. Add the shifted results to the copied results. Shift these results by 8. Pack the results into one MMX register...this will be used later. Shift the packed results by 24 to get only the alpha value for each pixel. Step 4: Get (255 - new alpha value) for each pixel, 255-S1a and 255-S2a. Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register. Load two destination pixels, D1 and D2. Expand D1 and D2 as four words into two MMX registers. Multiply each byte of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate result of both pixels. Copy the results of each pixel into an MMX register. Shift each result of both pixels by 8. Add the shifted results to the copied results. Shift these results by 8. Pack the results into one MMX register. Add the packed results to the new source pixels saved from above. Store result over destination pixels. Stay in TwoPixelsAtOnceLoop loop until there is less than two pixels to do. Step 5: OnePixelLeft If there is one pixel left (odd number of original pixels) do last pixel as a DWORD. **************************************************************************/ VOID mmxPixelBlendOrDissolveOver( ALPHAPIX *pDst, ALPHAPIX *pSrc, LONG Width, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { BYTE ConstAlpha = BlendFunction.SourceConstantAlpha; static QWORD W128 = 0x0080008000800080; static QWORD AlphaMask = 0x000000FF000000FF; static QWORD Zeros = 0; _asm { mov esi, pSrc mov edi, pDst movq mm7, W128 // This register never changes pxor mm4, mm4 // This register never changes xor eax, eax mov al, ConstAlpha movd mm5, eax // | | | | CA | punpcklwd mm5, mm5 // | | | CA | CA | punpcklwd mm5, mm5 // | CA | CA | CA | CA | // This register never changes mov ecx, Width // Step 1: test edi, 7 // Test first pixel for QWORD alignment jz QuadAligned // if unaligned, jmp Do1Pixel // do first pixel only QuadAligned: // Step 2: mov eax, ecx // Save the width in eax for later (see OnePixelLeft:) shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even test ecx, ecx // Make sure there is at least 1 quad to do jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned) TwoPixelsAtOnceLoop: // Step 3: // Within this loop, instructions will pair as shown for the Pentium processor /* Dissolve pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255); pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255); pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255); pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255); */ movq mm0, [esi] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b | movq mm1, mm0 // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b | punpcklbw mm0, mm4 // | 0 | S1a | 0 | S1r | 0 | S1g | 0 | S1b | punpckhbw mm1, mm4 // | 0 | S2a | 0 | S2r | 0 | S2g | 0 | S2b | pmullw mm0, mm5 // | CA*S1a | CA*S1r | CA*S1g | CA*S1b | add esi, 8 // pSrc++; pmullw mm1, mm5 // | CA*S2a | CA*S2r | CA*S2g | CA*S2b | paddusw mm1, mm7 // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 | paddusw mm0, mm7 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 | movq mm2, mm0 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 | psrlw mm0, 8 // | S1a'>>8 | S1r'>>8 | S1g'>>8 | S1b'>>8 | // S1x' = CA*S1x+128 S2x' = CA*S2x+128 movq mm3, mm1 // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 | psrlw mm1, 8 // | S2a'>>8 | S2r'>>8 | S2g'>>8 | S2b'>>8 | // S1x" = (CA*S1x+128)>>8 S2x" = (CA*S2x+128)>>8 paddusw mm0, mm2 // | S1a" | S1r" | S1g" | S1b" | paddusw mm1, mm3 // | S2a" | S2r" | S2g" | S2b" | psrlw mm0, 8 // | S1a">>8 | S1r">>8 | S1g">>8 | S1b">>8 | // SXx'" = ((CA*SXx+128)>>8)>>8) psrlw mm1, 8 // | S2a">>8 | S2r">>8 | S2g">>8 | S2b">>8 | packuswb mm0, mm1 // |S2a'"|S2r'"|S2g'"|S2b'"|S1a'"|S1r'"|S1g'"|S1b'"| movq mm6, mm0 psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a | /* Over SrcTran = 255 - pixSrc.a pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255); pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255); pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255); pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255); */ // Step 4: pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a| movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a| punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a | movq mm2, [edi] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b | punpcklwd mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a | movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b | punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a | punpcklwd mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a | punpckhbw mm3, mm4 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b | // T1 = 255-S1a T2 = 255-S2a punpcklbw mm2, mm4 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b | pmullw mm1, mm3 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b | add edi, 8 // pDst++; pmullw mm0, mm2 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b | paddusw mm0, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 | paddusw mm1, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 | movq mm3, mm1 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 | // TDXx' = TX*DXx+128 psrlw mm1, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 | movq mm2, mm0 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 | psrlw mm0, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 | // TDXx" = (TX*DXx+128)+(TDXx'>>8) paddusw mm1, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" | paddusw mm0, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" | psrlw mm1, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 | psrlw mm0, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 | packuswb mm0, mm1 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"| // SXx = SXx'" TDXx = TDXx'" paddusb mm0, mm6// |S2a+TD2a|S2r+TD2r|S2g+TD2g|S2b+TD2b|S1a+TD1a|S1r+TD1r|S1g+TD1g|S1b+TD1b| movq [edi-8], mm0 dec ecx jnz TwoPixelsAtOnceLoop OnePixelLeft: // Step 5: // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2 // If 0, there were an even number of pixels and we're done // If 1, there is an odd number of pixels and we need to do one more test eax, 1 jz Done Do1Pixel: // make as a macro if used in asm file /* Dissolve pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255); pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255); pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255); pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255); */ movd mm0, [esi] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b | punpcklbw mm0, mm4 // | 0 | S1a | 0 | S1r | 0 | S1g | 0 | S1b | pmullw mm0, mm5 // | CA*S1a | CA*S1r | CA*S1g | CA*S1b | paddusw mm0, mm7 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 | movq mm2, mm0 // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 | // S1x' = CA*S1x+128 S2x' = CA*S2x+128 psrlw mm0, 8 // | S1a'>>8 | S1r'>>8 | S1g'>>8 | S1b'>>8 | // S1x" = (CA*S1x+128)>>8 S2x" = (CA*S2x+128)>>8 paddusw mm0, mm2 // | S1a" | S1r" | S1g" | S1b" | psrlw mm0, 8 // | S1a">>8 | S1r">>8 | S1g">>8 | S1b">>8 | packuswb mm0, mm0 // |S2a'"|S2r'"|S2g'"|S2b'"|S1a'"|S1r'"|S1g'"|S1b'"| movq mm6, mm0 psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a | /* Over SrcTran = 255 - pixSrc.a pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255); pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255); pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255); pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255); */ pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a| punpcklwd mm0, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 |255-S1a|255-S1a| punpckldq mm0, mm0 // | 255-S1a| 255-S1a| 255-S1a| 255-S1a| movd mm2, [edi] // | 0 | 0 | 0 | 0 | D1a | D1r | D1g | D1b | punpcklbw mm2, mm4 // | D1a | D1r | D1g | D1b | // T = 255-S1x pmullw mm0, mm2 // | T*D1a | T*D1r | T*D1g | T*D1b | paddusw mm0, mm7 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 | movq mm1, mm0 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 | psrlw mm0, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 | paddusw mm0, mm1 // | TD1a" | TD1r" | TD1g" | TD1b" | psrlw mm0, 8 packuswb mm0, mm0 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"| paddusb mm0, mm6 movd [edi], mm0 add edi, 4 // pDst++; add esi, 4 // pSrc++; test ecx, ecx jz Done // just processed the last pixel of the row dec ecx jmp QuadAligned // just processed the first pixel of the row Done: emms // remove for optimizations, have calling function do emms } } /************************************************************************** THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION This function operates on 16 bit pixels (5 for Red, 5 for Green, and 5 for Blue) in a row of a bitmap. It blends source and destination bitmaps, without alpha channels, using a constant alpha input. The function performs the following on each byte: tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31) tmp2 = tmp1 AND 3E0h (mask off low 5 bits) tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits) tmp2 = tmp2 + tmp1 tmp2 = tmp2 AND 3E0h (mask off low 5 bits) tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits) Dst = tmp2 pDst is assumed to be aligned to a DWORD boundary when passed to this function. Red and blue are processed together in the same register. Green is processed separately. For two pixels at once, the reds and blues for both pixels are processed in the same register; and the greens are processed together in a separate register. The loop structure is as follows: Step 1: Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel as a DWORD (OnePixelLeft:), then do Step 2. Step 2: (QuadAligned:) pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one pixel left, do as a DWORD. Step 3: (TwoPixelsAtOnceLoop:) Perform the above function, using MMX instructions, on two pixels per pass of the loop. Step 4: (OnePixelLeft:) If there is one pixel left (odd number of original pixels) do last pixel as a DWORD. **************************************************************************/ VOID mmxPixelBlend16_555( PALPHAPIX pDst, PALPHAPIX pSrc, LONG Width, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { static QWORD RMask = 0x007C0000007C0000; static QWORD GMask = 0x0000000003E003E0; static QWORD BMask = 0x0000001F0000001F; static QWORD RBConst = 0x0010001000100010; static QWORD GConst = 0x0000000000100010; static QWORD RGBMask = 0x03E003E003E003E0; static QWORD RedMask = 0x001F0000001F0000; static QWORD CA; // ConstAlpha in 4 words of a qword BYTE ConstAlpha = BlendFunction.SourceConstantAlpha; _asm { mov ecx, Width // Make sure there is at least one pixel to do test ecx, ecx jz Done mov esi, pSrc mov edi, pDst xor eax, eax mov al, ConstAlpha movd mm5, eax // | | | | CA | punpcklwd mm5, mm5 // | | | CA | CA | punpcklwd mm5, mm5 // | CA | CA | CA | CA | movq CA, mm5 // Step 1: test edi, 7 // Test first pixel for QWORD alignment jz QuadAligned // if unaligned, jmp Do1Pixel // do first pixel only QuadAligned: // Step 2: mov eax, ecx // Save the width in eax for later (see OnePixelLeft:) shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even test ecx, ecx // Make sure there is at least 1 quad to do jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned) TwoPixelsAtOnceLoop: // Step 3: movd mm0, [edi] // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb | pxor mm7, mm7 movd mm1, [esi] // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb | movq mm2, mm0 // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb | movq mm3, mm1 // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb | punpcklbw mm0, mm7 // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb | punpcklbw mm1, mm7 // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb | movq mm4, mm0 // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb | pand mm0, RMask // | D20rrrrr00 | 0 | D10rrrrr00 | 0 | movq mm5, mm1 // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb | pand mm4, BMask // | 0 | D2000bbbbb | 0 | D1000bbbbb | psrlw mm0, 2 // | D2rrrrr | 0 | D1rrrrr | 0 | pand mm1, RMask // | S20rrrrr00 | 0 | S10rrrrr00 | 0 | por mm0, mm4 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb | pand mm5, BMask // | 0 | S2bbbbb | 0 | S1bbbbb | movq mm4, mm0 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb | pand mm2, GMask // | 0 | 0 |D2ggggg00000|D1ggggg00000| psllw mm4, 5 // |D2rrrrr00000|D2bbbbb00000|D1rrrrr00000|D1bbbbb00000| pand mm3, GMask // | 0 | 0 |S2ggggg00000|S1ggggg00000| psrlw mm1, 2 // | S2rrrrr | 0 | S1rrrrr | 0 | por mm5, mm1 // | S2rrrrr | S2bbbbb | S1rrrrr | S1bbbbb | movq mm6, mm2 // | 0 | 0 |D2ggggg00000|D1ggggg00000| psubw mm5, mm0 // | S2r-D2r | S2b-D2b | S1r-D1r | S1b-D1b | psrlw mm2, 5 // | 0 | 0 | D2ggggg | D1ggggg | pmullw mm5, CA // | CA2r | CA2b | CA1r | CA1b | psubw mm4, mm0 // | D2r*31 | D2b*31 | D1r*31 | D1b*31 | paddw mm4, RBConst// | CA2r+c | CA2b+c | CA1r+c | CA1b+c | psrlw mm3, 5 // | 0 | 0 | S2ggggg | S1ggggg | psubw mm3, mm2 // | 0 | 0 | S2g-D2g | S1g-D1g | add esi, 4 // pSrc++; pmullw mm3, CA // | 0 | 0 | CA2g | CA1g | paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) psubw mm6, mm2 // | 0 | 0 | D2g*31 | D2g*31 | add edi, 4 // pDst++; paddw mm6, GConst // | 0 | 0 | CA2g+c | CA1g+c | movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) pand mm4, RGBMask// RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits) paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31) movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31) psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm6, RGBMask// Gtmp2 = Gtmp1 AND 3E0h (mask off low 5 bits) paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1 pand mm1, RGBMask// RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits) psrlw mm6, 5 // Gtmp2 = Gtmp2 shr 5 (move high 5 bits to low 5 bits) paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1 psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm5, RGBMask// Gtmp2 = Gtmp2 AND 3E0h (mask off low 5 bits) movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm4, RedMask// Mask to get red pand mm1, BMask // Mask to get blue psllw mm4, 2 // Line up the red por mm4, mm1 // Combine reds and blues in proper bit location packuswb mm4, mm7 // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb | por mm4, mm5 // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb | movd [edi-4], mm4 dec ecx jnz TwoPixelsAtOnceLoop OnePixelLeft: // Step 4: // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2 // If 0, there was an even number of pixels and we're done // If 1, there is an odd number of pixels and we need to do one more test eax, 1 jz Done Do1Pixel: // make as a macro if used in asm file movzx edx,WORD PTR[edi] ; edx = D 0000 0000 0rrr rrgg gggb bbbb movzx ebx,WORD PTR[esi] ; ebx = S 0000 0000 0rrr rrgg gggb bbbb movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb | pxor mm7, mm7 movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb | movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb | movq mm3, mm1 // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb | punpcklbw mm0, mm7 // | 0 | 0 | D1xrrrrrgg | D1gggbbbbb | punpcklbw mm1, mm7 // | 0 | 0 | S1xrrrrrgg | S1gggbbbbb | movq mm4, mm0 // | 0 | 0 | D1xrrrrrgg | D1gggbbbbb | pand mm0, RMask // | 0 | 0 | D10rrrrr00 | 0 | movq mm5, mm1 // | 0 | 0 | S1xrrrrrgg | S1gggbbbbb | pand mm4, BMask // | 0 | 0 | 0 | D1000bbbbb | psrlw mm0, 2 // | 0 | 0 | D1rrrrr | 0 | pand mm1, RMask // | 0 | 0 | S10rrrrr00 | 0 | por mm0, mm4 // | 0 | 0 | D1rrrrr | D1bbbbb | pand mm5, BMask // | 0 | 0 | 0 | S1bbbbb | movq mm4, mm0 // | 0 | 0 | D1rrrrr | D1bbbbb | pand mm2, GMask // | 0 | 0 | 0 |D1ggggg00000| psllw mm4, 5 // | 0 | 0 |D1rrrrr00000|D1bbbbb00000| pand mm3, GMask // | 0 | 0 | 0 |S1ggggg00000| psrlw mm1, 2 // | 0 | 0 | S1rrrrr | 0 | por mm5, mm1 // | 0 | 0 | S1rrrrr | S1bbbbb | movq mm6, mm2 // | 0 | 0 | 0 |D1ggggg00000| // mm1 is free psubw mm5, mm0 // | 0 | 0 | S1r-D1r | S1b-D1b | psrlw mm2, 5 // | 0 | 0 | 0 | D1ggggg | pmullw mm5, CA // | 0 | 0 | CA1r | CA1b | psubw mm4, mm0 // | 0 | 0 | D1r*31 | D1b*31 | paddw mm4, RBConst// | 0 | 0 | CA1r+c | CA1b+c | psrlw mm3, 5 // | 0 | 0 | 0 | S1ggggg | psubw mm3, mm2 // | 0 | 0 | 0 | S1g-D1g | add esi, 2 // pSrc++; pmullw mm3, CA // | 0 | 0 | 0 | CA1g | paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) psubw mm6, mm2 // | 0 | 0 | 0 |D1ggggg00000-D1ggggg| add edi, 2 // pDst++; paddw mm6, GConst // | 0 | 0 | 0 | CA1g+c | movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) pand mm4, RGBMask// RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits) paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31) movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31) psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm6, RGBMask// Gtmp2 = Gtmp1 AND 3E0h (mask off low 5 bits) paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1 pand mm1, RGBMask// RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits) psrlw mm6, 5 // Gtmp2 = Gtmp2 shr 5 (move high 5 bits to low 5 bits) paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1 psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm5, RGBMask// Gtmp2 = Gtmp2 AND 3E0h (mask off low 5 bits) movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm4, RedMask// Mask to get red pand mm1, BMask // Mask to get blue psllw mm4, 2 // Line up the red por mm4, mm1 // Combine reds and blues in proper bit location packsswb mm4, mm7 // | 0 | 0 | D10rrrrr00 | D1000bbbbb | por mm4, mm5 // | 0 | 0 | D10rrrrrgg | D1gggbbbbb | movd edx, mm4 mov [edi-2], dx test ecx, ecx jz Done // just processed the last pixel of the row dec ecx jmp QuadAligned // just processed the first pixel of the row Done: emms // remove for optimizations, have calling function do emms } } /************************************************************************** THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION This function operates on 16 bit pixels (5 for Red, 6 for Green, and 5 for Blue) in a row of a bitmap. It blends source and destination bitmaps, without alpha channels, using a constant alpha input. The function performs the following: For red and blue: tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31) tmp2 = tmp1 AND 3E0h (mask off low 5 bits) tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits) tmp2 = tmp2 + tmp1 tmp2 = tmp2 AND 3E0h (mask off low 5 bits) tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits) Dst = tmp2 For green: tmp1 = Alpha(Src - Dst) + 32 + (Dst * 63) tmp2 = tmp1 AND FC0h (mask off low 6 bits) tmp2 = tmp2 shr 6 (move high 6 bits to low 6 bits) tmp2 = tmp2 + tmp1 tmp2 = tmp2 AND FC0h (mask off low 6 bits) tmp2 = tmp2 shr 6 (move high 6 bits to low 6 bits) Dst = tmp2 pDst is assumed to be aligned to a DWORD boundary when passed to this function. Red and blue are processed together in the same register. Green is processed separately. For two pixels at once, the reds and blues for both pixels are processed in the same register; and the greens are processed together in a separate register. The loop structure is as follows: Step 1: Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel as a DWORD (OnePixelLeft:), then do Step 2. Step 2: (QuadAligned:) pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one pixel left, do as a DWORD. Step 3: (TwoPixelsAtOnceLoop:) Perform the above function, using MMX instructions, on two pixels per pass of the loop. Step 4: (OnePixelLeft:) If there is one pixel left (odd number of original pixels) do last pixel as a DWORD. **************************************************************************/ VOID mmxPixelBlend16_565( PALPHAPIX pDst, PALPHAPIX pSrc, LONG Width, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { static QWORD RMask = 0x00FF000000FF0000; static QWORD GMask = 0x0000000007E007E0; static QWORD BMask = 0x0000001F0000001F; static QWORD RBConst = 0x0010001000100010; static QWORD GConst = 0x0000000000200020; static QWORD RBMask = 0x03E003E003E003E0; static QWORD GreenMask = 0x000000000FC00FC0; static QWORD CA; // ConstAlpha in 4 words of a qword BYTE ConstAlpha = BlendFunction.SourceConstantAlpha; _asm { mov ecx, Width // Make sure there is at least one pixel to do test ecx, ecx jz Done mov esi, pSrc mov edi, pDst xor eax, eax mov al, ConstAlpha movd mm5, eax // | | | | CA | punpcklwd mm5, mm5 // | | | CA | CA | punpcklwd mm5, mm5 // | CA | CA | CA | CA | movq CA, mm5 // Step 1: test edi, 7 // Test first pixel for QWORD alignment jz QuadAligned // if unaligned, jmp Do1Pixel // do first pixel only QuadAligned: // Step 2: mov eax, ecx // Save the width in eax for later (see OnePixelLeft:) shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even test ecx, ecx // Make sure there is at least 1 quad to do jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned) TwoPixelsAtOnceLoop: // Step 3: movd mm0, [edi] // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb | pxor mm7, mm7 movd mm1, [esi] // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb | movq mm2, mm0 // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb | movq mm3, mm1 // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb | punpcklbw mm0, mm7 // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb | punpcklbw mm1, mm7 // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb | movq mm4, mm0 // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb | pand mm0, RMask // | D2rrrrr000 | 0 | D1rrrrr000 | 0 | movq mm5, mm1 // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb | pand mm4, BMask // | 0 | D2000bbbbb | 0 | D1000bbbbb | psrlw mm0, 3 // | D2rrrrr | 0 | D1rrrrr | 0 | pand mm1, RMask // | S2rrrrr000 | 0 | S1rrrrr000 | 0 | por mm0, mm4 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb | pand mm5, BMask // | 0 | S2bbbbb | 0 | S1bbbbb | movq mm4, mm0 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb | pand mm2, GMask // | 0 | 0 |D2gggggg00000|D1gggggg00000| psllw mm4, 5 // |D2rrrrr00000|D2bbbbb00000|D1rrrrr00000|D1bbbbb00000| pand mm3, GMask // | 0 | 0 |S2gggggg00000|S1gggggg00000| psrlw mm1, 3 // | S2rrrrr | 0 | S1rrrrr | 0 | por mm5, mm1 // | S2rrrrr | S2bbbbb | S1rrrrr | S1bbbbb | movq mm6, mm2 // | 0 | 0 |D2gggggg00000|D1gggggg00000| psubw mm5, mm0 // | S2r-D2r | S2b-D2b | S1r-D1r | S1b-D1b | psrlw mm2, 5 // | 0 | 0 | D2gggggg | D1gggggg | pmullw mm5, CA // | CA2r | CA2b | CA1r | CA1b | psubw mm4, mm0 // | D2r*31 | D2b*31 | D1r*31 | D1b*31 | paddw mm4, RBConst // | CA2r+c | CA2b+c | CA1r+c | CA1b+c | psrlw mm3, 5 // | 0 | 0 | S2gggggg | S1gggggg | psubw mm3, mm2 // | 0 | 0 | S2g-D2g | S1g-D1g | add esi, 4 // pSrc++; pmullw mm3, CA // | 0 | 0 | CA2g | CA1g | psllw mm6, 1 // | 0 | 0 |D2gggggg000000|D1gggggg000000| paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) psubw mm6, mm2 // | 0 | 0 | D2g*63 | D1g*63 | paddw mm6, GConst // | 0 | 0 | CA2g+c | CA1g+c | movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) add edi, 4 // pDst++; psllw mm3, 1 // | 0 | 0 | CA2g*2 | CA1g*2 | pand mm4, RBMask // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits) paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63) movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63) psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm6, GreenMask // Gtmp2 = Gtmp1 AND FC0h (mask off low 6 bits) paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1 pand mm1, RBMask // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits) psrlw mm6, 6 // Gtmp2 = Gtmp2 shr 6 (move high 6 bits to low 6 bits) paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1 psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm5, GreenMask // Gtmp2 = Gtmp2 AND FC0h (mask off low 6 bits) movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm4, RMask // Mask to get red psrlw mm5, 1 // Align the green pand mm1, BMask // Mask to get blue psllw mm4, 3 // Align the red por mm4, mm1 // Combine reds and blues in proper bit location packuswb mm4, mm7 // | 0 | 0 | 0 | 0 | D2rrrrr000 | D2000bbbbb | D1rrrrr000 | D1000bbbbb | por mm4, mm5 // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb | movd [edi-4], mm4 dec ecx jnz TwoPixelsAtOnceLoop OnePixelLeft: // Step 4: // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2 // If 0, there were an even number of pixels and we're done // If 1, there is an odd number of pixels and we need to do one more test eax, 1 jz Done Do1Pixel: // make as a macro if used in asm file movzx edx,WORD PTR[edi] ; edx = D 0000 0000 rrrr rggg gggb bbbb movzx ebx,WORD PTR[esi] ; ebx = S 0000 0000 rrrr rggg gggb bbbb movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb | pxor mm7, mm7 movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb | movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | D1rrrrrggg | D1gggbbbbb | movq mm3, mm1 // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb | punpcklbw mm0, mm7 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb | punpcklbw mm1, mm7 // | 0 | 0 | S1rrrrrggg | S1gggbbbbb | movq mm4, mm0 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb | pand mm0, RMask // | 0 | 0 | D1rrrrr000 | 0 | movq mm5, mm1 // | 0 | 0 | S1rrrrrggg | S1gggbbbbb | pand mm4, BMask // | 0 | 0 | 0 | D1000bbbbb | psrlw mm0, 3 // | 0 | 0 | D1rrrrr | 0 | pand mm1, RMask // | 0 | 0 | S1rrrrr000 | 0 | por mm0, mm4 // | 0 | 0 | D1rrrrr | D1bbbbb | pand mm5, BMask // | 0 | 0 | 0 | S1bbbbb | movq mm4, mm0 // | 0 | 0 | D1rrrrr | D1bbbbb | pand mm2, GMask // | 0 | 0 | 0 |D1gggggg00000| psllw mm4, 5 // | 0 | 0 |D1rrrrr00000|D1bbbbb00000| pand mm3, GMask // | 0 | 0 | 0 |S1gggggg00000| psrlw mm1, 3 // | 0 | 0 | S1rrrrr | 0 | por mm5, mm1 // | 0 | 0 | S1rrrrr | S1bbbbb | movq mm6, mm2 // | 0 | 0 | 0 |D1gggggg00000| psubw mm5, mm0 // | 0 | 0 | S1r-D1r | S1b-D1b | psrlw mm2, 5 // | 0 | 0 | 0 | D1gggggg | pmullw mm5, CA // | 0 | 0 | CA1r | CA1b | psubw mm4, mm0 // | 0 | 0 | D1r*31 | D1b*31 | paddw mm4, RBConst // | 0 | 0 | CA1r+c | CA1b+c | psrlw mm3, 5 // | 0 | 0 | 0 | S1gggggg | psubw mm3, mm2 // | 0 | 0 | 0 | S1g-D1g | add esi, 2 // pSrc++; pmullw mm3, CA // | 0 | 0 | 0 | CA1g | paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) psllw mm6, 1 // | 0 | 0 | 0 |D1gggggg000000| psubw mm6, mm2 // | 0 | 0 | 0 | D1g*63 | add edi, 2 // pDst++; paddw mm6, GConst // | 0 | 0 | 0 | CA1g+c | movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31) psllw mm3, 1 // | 0 | 0 | 0 | CA1g*2 | pand mm4, RBMask // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits) paddw mm6, mm3 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63) movq mm5, mm6 // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63) psrlw mm4, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm6, GreenMask // Gtmp2 = Gtmp1 AND FC0h (mask off low 6 bits) paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1 pand mm1, RBMask // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits) psrlw mm6, 6 // Gtmp2 = Gtmp2 shr 6 (move high 6 bits to low 6 bits) paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1 psrlw mm1, 5 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm5, GreenMask // Gtmp2 = Gtmp2 AND FC0h (mask off low 6 bits) movq mm4, mm1 // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits) pand mm4, RMask // Mask to get red psrlw mm5, 1 // Align the green pand mm1, BMask // Mask to get blue psllw mm4, 3 // Align the red por mm4, mm1 // Combine reds and blues in proper bit location packuswb mm4, mm7 // | 0 | 0 | D1rrrrr000 | D1000bbbbb | por mm4, mm5 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb | movd edx, mm4 mov [edi-2], dx test ecx, ecx jz Done // just processed the last pixel of the row dec ecx jmp QuadAligned // just processed the first pixel of the row Done: emms // remove for optimizations, have calling function do emms } } /************************************************************************** THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION This function operates on 24 bit pixels (8 bits each for Red, Green, and Blue) in a row of a bitmap. It blends source and destination bitmaps, without alpha channels, using a constant alpha input. The function performs the following on each byte: tmp1 = Alpha(Src - Dst) + 128 + (Dst * 127) tmp2 = tmp1 AND FF00h (mask off low byte) tmp2 = tmp2 shr 8 (move high byte to low byte) tmp2 = tmp2 + tmp1 tmp2 = tmp2 AND FF00h (mask off low byte) tmp2 = tmp2 shr 8 (move high byte to low byte) Dst = tmp2 pDst is assumed to be aligned to a DWORD boundary when passed to this function. The loop structure is as follows: Step 1: Multiply width in pixels by 3 to get width in bytes. Byte count is kept in ecx and eax. ecx is used as the loop counter. Step 2: Check pDst for QWORD alignment. If aligned, do Step 3. If unaligned, test to see if there are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:) and then do Step 3. If no, there are only 3 bytes to do; so do them one at a time (OneToThreeBytesLeft:). Step 3: (QuadAligned:) pDst is QWORD aligned. We want to do 8 bytes (1 quad) at once, so divide byte count by 8 to get loop count. If ecx is 0 at this point, there are no more quads to do; so do 0 to 7 bytes (NoQuadsLeft:), in Step 5. Step 4: (Do1QUAD:) Perform the above function, using MMX instructions, on 8 bytes per pass of the loop. Step 5: (NoQuadsLeft:) Mask eax with 7 to get the byte count modulo 8, 0 to 7 bytes left. Copy eax into ecx. Test to see if there are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:); if no, there are only 3 bytes to do, so do them one at a time (OneToThreeBytesLeft:). Step 6: (Do1DWORD:) Perform the above function, using MMX instructions, on 4 bytes. Do Step 3 (QuadAligned:) to see if there are more bytes to do. Step 7: (OneToThreeBytesLeft:) Do one byte at a time. This will happen if there are less than 4 bytes left to do. **************************************************************************/ VOID mmxPixelBlend24( PALPHAPIX pDst, PALPHAPIX pSrc, LONG Width, BLENDFUNCTION BlendFunction, PBYTE pwrMask ) { static QWORD WordConst = 0x0080008000800080; static QWORD WordMask = 0xFF00FF00FF00FF00; static QWORD ByteConst = 0x0000000000000080; static QWORD ByteMask = 0x000000000000FF00; static QWORD CA; // ConstAlpha in 4 words of a qword BYTE ConstAlpha = BlendFunction.SourceConstantAlpha; _asm { mov ecx, Width // Make sure there is at least one pixel to do test ecx, ecx jz Done mov esi, pSrc mov edi, pDst xor eax, eax mov al, ConstAlpha movd mm5, eax // | | | | CA | punpcklwd mm5, mm5 // | | | CA | CA | punpcklwd mm5, mm5 // | CA | CA | CA | CA | movq CA, mm5 // Step 1: lea ecx, [2*ecx+ecx]// NumPixels * 3 bytes/pixel = NumBytes // Step 2: test edi, 7 // Test first pixel for QWORD alignment jz QuadAligned // If unaligned, cmp ecx, 4 // test to see if there are 4 bytes to do jae Do1DWORD // if yes, do 4 bytes jmp OneToThreeBytesLeft// if no, do 1 to 3 bytes QuadAligned: // Step 3: mov eax, ecx // Save the width in eax for later (see NoQuadsLeft:) shr ecx, 3 // Want to do 8 bytes at once, so divide // byte count by 8 to get loop count test ecx, ecx // Make sure there is at least 1 QUAD (8 bytes) to do jz NoQuadsLeft // If we take this jump, there are 0 to 7 bytes left Do1QUAD: // Step 4: // Instructions will pair as shown for the Pentium processor movq mm0, [edi] // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 | pxor mm7, mm7 movq mm1, [esi] // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 | movq mm2, mm0 // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 | movq mm3, mm1 // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 | punpcklbw mm0, mm7 // | D4 | D3 | D2 | D1 | movq mm4, mm0 // | D4 | D3 | D2 | D1 | punpcklbw mm1, mm7 // | S4 | S3 | S2 | S1 | punpckhbw mm2, mm7 // | D8 | D7 | D6 | D5 | psubw mm1, mm0 // | S4-D4 | S3-D3 | S2-D2 | S1-D1 | pmullw mm1, CA // | CA4 | CA3 | CA2 | CA1 | punpckhbw mm3, mm7 // | S8 | S7 | S6 | S5 | psubw mm3, mm2 // | S8-D8 | S7-D7 | S6-D6 | S5-D5 | movq mm6, mm2 // | D8 | D7 | D6 | D5 | pmullw mm3, CA // | CA8 | CA7 | CA6 | CA5 | psllw mm4, 8 // | D4*128 | D3*128 | D2*128 | D1*128 | psllw mm6, 8 // | D8*128 | D7*128 | D6*128 | D5*128 | psubw mm4, mm0 // | D4*127 | D3*127 | D2*127 | D1*127 | paddw mm4, WordConst // | D4*127+C| D3*127+C| D2*127+C| D1*127+C| psubw mm6, mm2 // | D8*127 | D7*127 | D6*127 | D5*127 | paddw mm6, WordConst // | D8*127+C| D7*127+C| D6*127+C| D5*127+C| paddw mm4, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127) paddw mm6, mm3 // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 127) movq mm3, mm4 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127) pand mm4, WordMask // tmp3 = tmp1 AND FF00h (mask off low bytes) movq mm5, mm6 // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 127) pand mm6, WordMask // tmp4 = tmp2 AND FF00h (mask off low bytes) psrlw mm4, 8 // tmp3 = tmp3 shr 8 (move high byte to low byte) psrlw mm6, 8 // tmp4 = tmp4 shr 8 (move high byte to low byte) paddw mm4, mm3 // tmp3 = tmp3 + tmp1 pand mm4, WordMask // tmp3 = tmp3 AND FF00h (mask off low bytes) paddw mm6, mm5 // tmp4 = tmp4 + tmp2 pand mm6, WordMask // tmp4 = tmp4 AND FF00h (mask off low bytes) psrlw mm4, 8 // tmp3 = tmp3 shr 8 (move high byte to low byte) psrlw mm6, 8 // tmp4 = tmp4 shr 8 (move high byte to low byte) add edi, 8 // pDst++; packuswb mm4, mm6 // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 | add esi, 8 // pSrc++; movq [edi-8], mm4 dec ecx jnz Do1QUAD NoQuadsLeft: // Step 5: // This tests for 0 to 7 bytes left in row - eax contains initial byte count and eax, 7 // 0 to 7 bytes left to do jz Done cmp eax, 4 // Test to see if there are 4 bytes to do mov ecx, eax jae Do1DWORD // if yes, do 4 bytes jmp OneToThreeBytesLeft // if no, do 1 to 3 bytes // Step 6: Do1DWORD: // make as a macro if used in asm file movd mm0, [edi] // | 0 | 0 | 0 | 0 | D4 | D3 | D2 | D1 | pxor mm7, mm7 movd mm1, [esi] // | 0 | 0 | 0 | 0 | S4 | S3 | S2 | S1 | punpcklbw mm0, mm7 // | D4 | D3 | D2 | D1 | movq mm4, mm0 // | D4 | D3 | D2 | D1 | punpcklbw mm1, mm7 // | S4 | S3 | S2 | S1 | psllw mm4, 8 // | D4*128 | D3*128 | D2*128 | D1*128 | psubw mm1, mm0 // | S4-D4 | S3-D3 | S2-D2 | S1-D1 | pmullw mm1, CA // | CA4 | CA3 | CA2 | CA1 | psubw mm4, mm0 // | D4*127 | D3*127 | D2*127 | D1*127 | paddw mm4, WordConst // | D4*127+C| D3*127+C| D2*127+C| D1*127+C| paddw mm4, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127) movq mm3, mm4 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127) pand mm4, WordMask // tmp2 = tmp1 AND FF00h (mask off low bytes) psrlw mm4, 8 // tmp2 = tmp2 shr 8 (move high byte to low byte) paddw mm4, mm3 // tmp2 = tmp2 + tmp1 pand mm4, WordMask // tmp2 = tmp2 AND FF00h (mask off low bytes) psrlw mm4, 8 // tmp2 = tmp2 shr 8 (move high byte to low byte) add edi, 4 // pDst++; packuswb mm4, mm4 // | D4 | D3 | D2 | D1 | D4 | D3 | D2 | D1 | add esi, 4 // pSrc++; movd [edi-4], mm4 sub ecx, 4 // Just did 4 bytes at the beginning or end of a scan line jmp QuadAligned // Jump to QuadAligned to determine if there are more bytes to do OneToThreeBytesLeft: // Step 7: movzx edx,BYTE PTR[edi] ; edx = Dest Byte movzx ebx,BYTE PTR[esi] ; ebx = Src Byte movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Db | pxor mm7, mm7 movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Sb | movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Db | psllw mm2, 8 // | 0 | 0 | 0 | 0 | 0 | 0 | Db| 0 | psubw mm1, mm0 // | 0 | 0 | 0 | Sb-Db | pmullw mm1, CA // | 0 | 0 | 0 | CAb | psubw mm2, mm0 // | 0 | 0 | 0 | Db*127| paddw mm2, ByteConst // | 0 | 0 | 0 |Db*127+128| paddw mm1, mm2 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127) movq mm2, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127) pand mm2, ByteMask // tmp2 = tmp1 AND FF00h psrlw mm2, 8 // tmp2 = tmp2 shr 8 paddw mm2, mm1 // tmp2 = tmp2 + tmp1 pand mm2, ByteMask // tmp2 = tmp2 AND FF00h psrlw mm2, 8 // tmp2 = tmp2 shr 8 movd edx, mm2 mov BYTE PTR[edi], dl inc edi inc esi dec ecx jnz OneToThreeBytesLeft Done: emms // remove for optimizations, have calling function do emms } } #endif /******************************Public*Routine******************************\ * AlphaScanLineBlend * * Blends source and destionation surfaces one scan line at a time. * * Allocate a scan line buffer for xlate of src to 32BGRA if needed. * Allocate a scan line buffer for xlate of dst to 32BGRA if needed. * Blend scan line using blend function from pAlphaDispatch * Write scan line back to dst (if needed) * * Arguments: * * pDst - pointer to dst surface * pDstRect - Dst output rect * DeltaDst - dst scan line delat * pSrc - pointer to src surface * DeltaSrc - src scan line delta * pptlSrc - src offset * pxloSrcTo32 - xlateobj from src to 32BGR * pxlo32ToDst - xlateobj from 32BGR to dst * palDst - destination palette * palSrc - source palette * pAlphaDispatch - blend data and function pointers * * Return Value: * * ALPHA_COMPLETE: success, written to destination * ALPHA_SEND_TEMP: success, must write tmp bmp to dest * ALPHA_FAIL: error * * History: * * 10/14/1996 Mark Enstrom [marke] * \**************************************************************************/ ULONG AlphaScanLineBlend( PBYTE pDst, PRECTL pDstRect, ULONG DeltaDst, PBYTE pSrc, ULONG DeltaSrc, PPOINTL pptlSrc, PALPHA_DISPATCH_FORMAT pAlphaDispatch, PDIBINFO pDibInfoSrc, PDIBINFO pDibInfoDst ) { // // get two scanlines of RGBA data, blend pixels, store // LONG cx = pDstRect->right - pDstRect->left; LONG cy = pDstRect->bottom - pDstRect->top; LONG ScanBufferWidth = cx * 4; LONG WriteMaskSize = cx; LONG AllocationSize = 0; ULONG ulSrcBytesPerPixel = pAlphaDispatch->ulSrcBitsPerPixel/8; ULONG ulDstBytesPerPixel = pAlphaDispatch->ulDstBitsPerPixel/8; PBYTE pjSrcTempScanBuffer = NULL; PBYTE pjDstTempScanBuffer = NULL; PBYTE pjAlloc = NULL; PBYTE pjDstTmp; PBYTE pjSrcTmp; PBYTE pWriteMask; LONG lRet = ALPHA_SEND_TEMP; HDC hdc32 = NULL; PULONG pulDIBSrc = NULL; // // if there is a temp dst needed, use dc allocator // if (pAlphaDispatch->pfnLoadDstAndConvert != NULL) { hdc32 = hdcAllocateScanLineDC(cx,&pulDIBSrc); if (hdc32 == NULL) { return(ALPHA_FAIL); } // // set temp scan line // pjDstTempScanBuffer = (PBYTE)pulDIBSrc; } // // calculate destination starting address // if (ulDstBytesPerPixel) { pjDstTmp = pDst + ulDstBytesPerPixel * pDstRect->left + DeltaDst * pDstRect->top; } else if (pAlphaDispatch->ulDstBitsPerPixel == 1) { pjDstTmp = pDst + pDstRect->left/8 + DeltaDst * pDstRect->top; } else { pjDstTmp = pDst + pDstRect->left/2 + DeltaDst * pDstRect->top; } // // calculate source starting address // if (ulSrcBytesPerPixel) { pjSrcTmp = pSrc + ulSrcBytesPerPixel * pptlSrc->x + DeltaSrc * pptlSrc->y; } else if (pAlphaDispatch->ulSrcBitsPerPixel == 1) { pjSrcTmp = pSrc + pptlSrc->x/8 + DeltaSrc * pptlSrc->y; } else { pjSrcTmp = pSrc + pptlSrc->x/2 + DeltaSrc * pptlSrc->y; } // // calculate size of needed scan line buffer // if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL) { AllocationSize += ScanBufferWidth; } AllocationSize += WriteMaskSize; // // allocate scan line buffer memory // pWriteMask = (PBYTE)LOCALALLOC(AllocationSize); if (pWriteMask != NULL) { // // calc offsets // PBYTE pjTemp = pWriteMask + WriteMaskSize; if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL) { pjSrcTempScanBuffer = pjTemp; pjTemp += ScanBufferWidth; } // // Blend scan lines // LONG yScan = 0; while (cy--) { PBYTE pjSource = pjSrcTmp; PBYTE pjDest = pjDstTmp; // // get src scan line if needed // if (pjSrcTempScanBuffer) { (*pAlphaDispatch->pfnLoadSrcAndConvert)( (PULONG)pjSrcTempScanBuffer, pjSrcTmp, 0, cx, (PVOID)pDibInfoSrc); pjSource = pjSrcTempScanBuffer; } // // get dst scan line if needed // if (pjDstTempScanBuffer) { (*pAlphaDispatch->pfnLoadDstAndConvert)( (PULONG)pjDstTempScanBuffer, pjDstTmp, 0, cx, (PVOID)pDibInfoDst); pjDest = pjDstTempScanBuffer; } // // blend // memset(pWriteMask,1,WriteMaskSize); (*pAlphaDispatch->pfnGeneralBlend)( (PALPHAPIX)pjDest, (PALPHAPIX)pjSource, cx, pAlphaDispatch->BlendFunction, pWriteMask ); // // write buffer back if needed // if (pjDstTempScanBuffer) { (*pAlphaDispatch->pfnConvertAndStore)( pjDstTmp, (PULONG)pjDstTempScanBuffer, cx, 0, yScan, (PVOID)pDibInfoDst, pWriteMask, hdc32 ); } pjDstTmp += DeltaDst; pjSrcTmp += DeltaSrc; yScan++; } // // free any temp buffer memory // LOCALFREE(pWriteMask); } else { lRet = ALPHA_FAIL; } if (hdc32) { vFreeScanLineDC(hdc32); } if ( (lRet != ALPHA_FAIL) && (pAlphaDispatch->pfnConvertAndStore == vConvertAndSaveBGRAToDest) ) { lRet = ALPHA_COMPLETE; } return(lRet); } #endif