/**************************************************************************\ * * Copyright (c) 2000 Microsoft Corporation * * Module name: * * Include file to generate either 5-5-5 or 5-6-5 versions of the * dither code. * * Notes: * * When DITHER_BLEND_555 is #defined to 1, then this file will generate * 5-5-5 versions of the included routines. * * When DITHER_BLEND_555 is #defined to 0, then we will generate 5-6-5 * versions. * * Revision History: * * 03/15/2000 andrewgo * Created it. * \**************************************************************************/ #undef DITHER_ARRAY #undef RED_SHIFT #undef GREEN_SHIFT #undef BLUE_SHIFT #undef DITHERBLEND_FUNC #undef DITHER_FUNC #if DITHER_BLEND_555 #define DITHER_ARRAY Dither555 #define RED_SHIFT 9 #define GREEN_SHIFT 6 #define BLUE_SHIFT 3 #define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_555_MMX #define DITHER_FUNC ScanOperation::Dither_sRGB_555_MMX #else #define DITHER_ARRAY Dither565 #define RED_SHIFT 8 #define GREEN_SHIFT 5 #define BLUE_SHIFT 3 #define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_565_MMX #define DITHER_FUNC ScanOperation::Dither_sRGB_565_MMX #endif // Do a dithered blend to 16bpp using MMX VOID FASTCALL DITHERBLEND_FUNC( VOID *dst, const VOID *src, INT count, const OtherParams *otherParams ) { #if defined(_X86_) DEFINE_POINTERS(ARGB, WORD); DEFINE_BLEND_POINTER(ARGB); ASSERT(count != 0); ASSERT(otherParams); static ULONGLONG redBlueMask = 0x00f800f800f800f8; static ULONGLONG flipAlphaBits = 0x00ff00ff00ff00ff; #if DITHER_BLEND_555 static ULONGLONG greenMask = 0x0000f8000000f800; static ULONGLONG redBlueMultiplier = 0x0400000104000001; #else static ULONGLONG greenMask = 0x0000fc000000fc00; static ULONGLONG redBlueMultiplier = 0x0800000108000001; #endif INT x = otherParams->X; INT y = otherParams->Y; UINT32 *dither = (otherParams->DoingDither) ? &DITHER_ARRAY[8 * (y & 3)] : &DitherNone[0]; UINT32 ditherIncrement = (x & 3) * 4; const ARGB *blendPixel = bl; _asm { ; ecx = count ; esi = source ; edi = destination ; mm4 = red and blue mask (0xf800f8) ; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5) ; mm6 = C1 | C0 dither ; mm7 = C3 | C2 dither mov eax, ditherIncrement mov esi, blendPixel mov edi, d mov ecx, count movq mm4, redBlueMask movq mm5, greenMask ; We always want our qword reads from the screen to be aligned. ; So if the initial pixel is not qword-aligned, we handle up to ; three pixels up front to make it qword-aligned. ; ; (Note that as a consequence of us aligning to the destination, ; we're often doing unaligned reads on the source. But it's ; a much bigger performance win to align operations to the screen ; than to system memory, due to the terrible screen read ; performance.) alignment_loop: add eax, dither test edi, 6 movq mm6, [eax] movq mm7, [eax+8] jz done_start_alignment call do_single_pixel ; Adjust our pointers and load our new dither values: mov eax, ditherIncrement add eax, 4 and eax, 0x0000000F mov ditherIncrement, eax add esi, 4 add edi, 2 dec ecx jz all_done jmp alignment_loop done_start_alignment: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; do_main_loop: sub ecx, 4 ; pre-decrement by 4 jl do_pair ; We do chunks of 4 pixels at a time so that we can unroll our ; dither loop (our dither repeats every 4 pixels). do_main_loop_2: mov al, [esi+3] and al, [esi+7] and al, [esi+11] and al, [esi+15] inc al ; if all alphas were 0xff, this jnz do_pair ; will wrap to zero ; The four pixels starting at [esi] are opaque. We only need to ; dither them and convert to 16bpp. The following codepath will ; process all four in parallel (two at a time) in order to optimize ; usage of the execution units and minimize dependencies between ; consecutive instructions. ; We start by reading the four pixels into mm0 and mm1, adding ; the dither component, and then breaking into group 0 (pixels 0 ; and 2) and group 1 (pixels 1 and 3). I will use **0** and **1** ; in the comments below to show which pixel group the instruction is ; processing movq mm0, [esi] ; mm0 = DW1 | DW0 movq mm1, [esi + 8] ; mm1 = DW3 | DW2 paddusb mm0, mm6 ; add dither movq mm2, mm0 paddusb mm1, mm7 ; add dither add edi, 8 punpckhdq mm2, mm1 ; **1** mm2 = DW3 | DW1 punpckldq mm0, mm1 ; **0** mm0 = DW2 | DW0 movq mm3, mm2 ; **1** pand mm2, mm4 ; **1** red and blue movq mm1, mm0 ; **0** pand mm0, mm4 ; **0** red and blue pand mm3, mm5 ; **1** green psrlw mm0, 3 ; **0** shift red and blue to lowest ; 5 bits in register ; Note the use of the pmaddwd to simultaneously shift both the red and ; blue bits into their appropriate positions. The constant ; redBlueMultiplier contains four shorts, each of which is equal to ; 2^i where i is the number of bits that we need to shift that color ; component by in order to attain the correct position in the 16bpp ; color. This is possible only because the red and blue ; components lie on different shorts in the 64bits register (green has ; been masked earlier), and so we can dedicate an entire 16bit short ; to red and to blue. pmaddwd mm2, redBlueMultiplier ; **1** add esi, 16 pand mm1, mm5 ; **0** green psrld mm3, GREEN_SHIFT-3 ; **1** pmaddwd mm0, redBlueMultiplier ; **0** sub ecx, 4 ; pre-decrement for next iteration por mm2, mm3 ; **1** combine green with red/blue ; mm2 = 0 | W3 | 0 | W1 psrld mm1, GREEN_SHIFT ; **0** psllq mm2, 13 ; **1** mm2 = W3 | 0 | W1 | 0 por mm0, mm1 ; **0** combine green with red/blue ; mm1 = 0 | W2 | 0 | W0 por mm0, mm2 ; mm2 = W3 | W2 | W1 | W0 movq [edi - 8], mm0 jge do_main_loop_2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; do_pair: add ecx, 2 ; pre-decrement for this iteration jl do_last_pixel ; We're doing only a single pair of pixels, so swap our dither ; values in preparation for the next iteration: pxor mm6, mm7 pxor mm7, mm6 pxor mm6, mm7 ; swap mm6 and mm7 mov al, [esi+3] inc al cmp al, 1 ja do_pair_blend mov al, [esi+7] inc al cmp al, 1 ja do_pair_blend mov al, [esi+3] ; Do we really want this here? or al, [esi+7] jz do_pair_done movq mm0, [esi] paddusb mm0, mm7 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5) movq mm3, mm2 psrld mm3, BLUE_SHIFT ; blue psrld mm2, RED_SHIFT ; red (9 for 5-5-5) por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movq mm1, mm0 psrlq mm1, 32 punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0 movd eax, mm0 cmp byte ptr [esi+3], 0 je do_pair_done_first_write mov [edi], ax do_pair_done_first_write: cmp byte ptr [esi+7], 0 je do_pair_done_second_write shr eax, 16 mov [edi+2], ax do_pair_done_second_write: add edi, 4 add esi, 8 jmp do_main_loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; do_pair_blend: movd mm1, [edi] ; read destination, X | X | C1 | C0 punpcklwd mm1, mm1 ; C1 | C1 | C0 | C0 psrld mm1, 16 ; 0 | C1 | 0 | C0 ; (trick using single red and ; blue mask requires high bits ; to be zero) movq mm0, mm1 movq mm2, mm1 pslld mm1, BLUE_SHIFT ; blue pslld mm0, GREEN_SHIFT ; green (6 for 5-5-5) pslld mm2, RED_SHIFT ; red (9 for 5-5-5) por mm1, mm2 ; combine red and blue pand mm1, mm4 ; leave valid red and blue bits pand mm0, mm5 ; leave valid green bits por mm1, mm0 ; mm1 = C1 | C0 ; Okay now we've got the destination read and split. Handle the first ; blend: movd mm2, [esi] punpcklbw mm2, mm2 psrlw mm2, 8 ; mm2 = S movq mm3, mm2 punpckhwd mm3, mm3 punpckhdq mm3, mm3 ; mm3 = alpha movq mm0, mm1 punpcklbw mm0, mm0 psrlw mm0, 8 ; mm0 = D #if NO_PREMULTIPLIED_ALPHA psubw mm2, mm0 pmullw mm2, mm3 ; mm2 = alpha * (S - D) movq mm3, mm2 psrlw mm3, 8 paddw mm2, mm3 ; approximate x/255 by 257/65536 psrlw mm2, 8 ; mm2 = alpha * (S - D) paddb mm0, mm2 ; mm0 = C0 = D + alpha * (S - D) #else pxor mm3, flipAlphaBits pmullw mm0, mm3 ; mm2 = (255 - alpha) * D movq mm3, mm0 psrlw mm0, 8 ; approximate x/255 by 257/65536 paddw mm0, mm3 ; mm2 = (255 - alpha) * D / 255 psrlw mm0, 8 ; don't care about rounding, not enough bits paddb mm0, mm2 ; mm0 = C0 = S + (1 - alpha) * D #endif ; Handle the second blend (change mm0 to mm1): movd mm2, [esi+4] punpcklbw mm2, mm2 psrlw mm2, 8 ; mm2 = S movq mm3, mm2 punpckhwd mm3, mm3 punpckhdq mm3, mm3 ; mm3 = alpha punpckhbw mm1, mm1 psrlw mm1, 8 ; mm1 = D #if NO_PREMULTIPLIED_ALPHA psubw mm2, mm1 pmullw mm2, mm3 ; mm2 = alpha * (S - D) movq mm3, mm2 psrlw mm3, 8 paddw mm2, mm3 ; approximate x/255 by 257/65536 psrlw mm2, 8 ; mm2 = alpha * (S - D) paddb mm1, mm2 ; mm1 = C1 = D + alpha * (S - D) #else pxor mm3, flipAlphaBits pmullw mm1, mm3 ; mm2 = (255 - alpha) * D movq mm3, mm1 psrlw mm1, 8 ; approximate x/255 by 257/65536 paddw mm1, mm3 ; mm2 = (255 - alpha) * D / 255 psrlw mm1, 8 ; don't care about rounding, not enough bits paddb mm1, mm2 ; mm1 = C1 = S + (1 - alpha) * D #endif packuswb mm0, mm1 ; mm0 = C1 | C0 ; Dither and pack everything back up: paddusb mm0, mm7 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT ; green movq mm3, mm2 psrld mm3, BLUE_SHIFT ; blue psrld mm2, RED_SHIFT ; red por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movq mm1, mm0 psrlq mm1, 32 ; mm1 = 0 | 0 | X | C1 punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0 movd [edi], mm0 do_pair_done: add edi, 4 add esi, 8 jmp do_main_loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; do_single_pixel: movd mm0, [esi] mov al, [esi+3] inc al jnz do_single_blend ; if not completely opaque paddusb mm0, mm6 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT movq mm3, mm2 psrld mm3, BLUE_SHIFT psrld mm2, RED_SHIFT por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movd eax, mm0 mov [edi], ax do_single_done: ret do_single_blend: dec al jz do_single_done ; completely transparent pixel ; alpha is between 0 and 255 movzx eax, word ptr [edi] ; do the destination read movd mm1, eax ; mm1 = 0 | 0 | 0 | C0 movq mm0, mm1 movq mm2, mm1 pslld mm1, BLUE_SHIFT ; blue pslld mm0, GREEN_SHIFT ; green (6 for 5-5-5) pslld mm2, RED_SHIFT ; red (9 for 5-5-5) por mm1, mm2 ; combine red and blue pand mm1, mm4 ; leave valid red and blue bits pand mm0, mm5 ; leave valid green bits por mm1, mm0 ; mm1 = C1 | C0 ; Okay now we've got the destination read and split. Handle the first blend: movd mm2, [esi] punpcklbw mm2, mm2 psrlw mm2, 8 ; mm2 = S movq mm3, mm2 punpckhwd mm3, mm3 punpckhdq mm3, mm3 ; mm3 = alpha movq mm0, mm1 punpcklbw mm0, mm0 psrlw mm0, 8 ; mm0 = D #if NO_PREMULTIPLIED_ALPHA psubw mm2, mm0 pmullw mm2, mm3 ; mm2 = alpha * (S - D) movq mm3, mm2 psrlw mm3, 8 paddw mm2, mm3 ; approximate x/255 by 257/65536 psrlw mm2, 8 ; mm2 = alpha * (S - D) paddb mm0, mm2 ; mm0 = C0 = D + alpha * (S - D) #else pxor mm3, flipAlphaBits pmullw mm0, mm3 ; mm2 = (255 - alpha) * D movq mm3, mm0 psrlw mm0, 8 ; approximate x/255 by 257/65536 paddw mm0, mm3 ; mm2 = (255 - alpha) * D / 255 psrlw mm0, 8 ; don't care about rounding, not enough bits paddb mm0, mm2 ; mm0 = C0 = S + (1 - alpha) * D #endif packuswb mm0, mm0 ; mm0 = C1 | C0 ; Dither and pack everything back up: paddusb mm0, mm6 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT movq mm3, mm2 psrld mm3, BLUE_SHIFT psrld mm2, RED_SHIFT por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movd eax, mm0 mov [edi], ax ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; do_last_pixel: test ecx, 1 jz all_done call do_single_pixel all_done: emms } #endif } // Dither to 16bpp using MMX VOID FASTCALL DITHER_FUNC( VOID *dst, const VOID *src, INT count, const OtherParams *otherParams ) { #if defined(_X86_) DEFINE_POINTERS(ARGB, WORD); ASSERT(count != 0); ASSERT(otherParams); static ULONGLONG redBlueMask = 0x00f800f800f800f8; static ULONGLONG flipAlphaBits = 0x00ff00ff00ff00ff; #if DITHER_BLEND_555 static ULONGLONG greenMask = 0x0000f8000000f800; #else static ULONGLONG greenMask = 0x0000fc000000fc00; #endif INT x = otherParams->X; INT y = otherParams->Y; UINT32 *dither = (otherParams->DoingDither) ? &DITHER_ARRAY[8 * (y & 3) + (x & 3)] : &DitherNone[0]; _asm { ; ecx = count ; esi = source ; edi = destination ; mm4 = red and blue mask (0xf800f8) ; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5) ; mm6 = C1 | C0 dither ; mm7 = C3 | C2 dither mov eax, dither mov esi, s mov edi, d mov ecx, count movq mm4, redBlueMask movq mm5, greenMask movq mm6, [eax] movq mm7, [eax+8] sub ecx, 4 ; pre-decrement by 4 jl do_last_3_pixels_or_less ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; We do chunks of 4 pixels at a time so that we can unroll our ; dither loop (our dither repeats every 4 pixels). do_main_loop: movq mm0, [esi] paddusb mm0, mm6 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5) movq mm3, mm2 psrld mm3, BLUE_SHIFT ; blue psrld mm2, RED_SHIFT ; red (9 for 5-5-5) por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movq mm1, mm0 psrlq mm1, 32 ; mm1 = X | X | X | C1 punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0 movd [edi], mm0 movq mm0, [esi+8] paddusb mm0, mm7 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT movq mm3, mm2 psrld mm3, BLUE_SHIFT psrld mm2, RED_SHIFT por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movq mm1, mm0 psrlq mm1, 32 punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0 movd [edi+4], mm0 add edi, 8 add esi, 16 sub ecx, 4 ; pre-decrement for next iteration jge do_main_loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; do_last_3_pixels_or_less: add ecx, 4 ; get back 'real' count jz all_done dec ecx ; if exactly 1 pixel left jz do_last_pixel ; do 2 pixels ; we'll decrement ecx again later movq mm0, [esi] paddusb mm0, mm6 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5) movq mm3, mm2 psrld mm3, BLUE_SHIFT ; blue psrld mm2, RED_SHIFT ; red (9 for 5-5-5) por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movq mm1, mm0 psrlq mm1, 32 ; mm1 = X | X | X | C1 punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0 movd eax, mm0 mov [edi], eax dec ecx jz all_done add esi, 8 add edi, 4 do_last_pixel: movd mm0, [esi] paddusb mm0, mm7 ; add dither movq mm2, mm0 pand mm0, mm5 ; green pand mm2, mm4 ; red and blue psrld mm0, GREEN_SHIFT movq mm3, mm2 psrld mm3, BLUE_SHIFT psrld mm2, RED_SHIFT por mm0, mm3 por mm0, mm2 ; mm0 = X | C1 | X | C0 movq mm1, mm0 psrlq mm1, 32 punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0 movd eax, mm0 mov [edi], ax all_done: emms } #endif }