Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

645 lines
24 KiB

/**************************************************************************\
*
* Copyright (c) 2000 Microsoft Corporation
*
* Module name:
*
* Include file to generate either 5-5-5 or 5-6-5 versions of the
* dither code.
*
* Notes:
*
* When DITHER_BLEND_555 is #defined to 1, then this file will generate
* 5-5-5 versions of the included routines.
*
* When DITHER_BLEND_555 is #defined to 0, then we will generate 5-6-5
* versions.
*
* Revision History:
*
* 03/15/2000 andrewgo
* Created it.
*
\**************************************************************************/
// Drop any definitions left behind by an earlier inclusion of this file, so
// the macros below can be redefined for the pixel format selected by
// DITHER_BLEND_555.
#undef DITHER_ARRAY
#undef RED_SHIFT
#undef GREEN_SHIFT
#undef BLUE_SHIFT
#undef DITHERBLEND_FUNC
#undef DITHER_FUNC
// Per-format parameters used by the routines below:
//
// DITHER_ARRAY - dither table indexed by (y & 3) and (x & 3)
// RED_SHIFT - right shift that moves the masked 8-bit red channel of a
// 32bpp pixel into its 16bpp position (and the left shift
// used to expand it back when the destination is read)
// GREEN_SHIFT - same, for the green channel
// BLUE_SHIFT - same, for the blue channel
// DITHERBLEND_FUNC - name given to the dithered-blend routine
// DITHER_FUNC - name given to the dither-only routine
#if DITHER_BLEND_555
// 5-5-5 destination: red in bits 10-14, green in bits 5-9, blue in bits 0-4.
#define DITHER_ARRAY Dither555
#define RED_SHIFT 9
#define GREEN_SHIFT 6
#define BLUE_SHIFT 3
#define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_555_MMX
#define DITHER_FUNC ScanOperation::Dither_sRGB_555_MMX
#else
// 5-6-5 destination: red in bits 11-15, green in bits 5-10, blue in bits 0-4.
#define DITHER_ARRAY Dither565
#define RED_SHIFT 8
#define GREEN_SHIFT 5
#define BLUE_SHIFT 3
#define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_565_MMX
#define DITHER_FUNC ScanOperation::Dither_sRGB_565_MMX
#endif
// Do a dithered blend to 16bpp using MMX
//
// Blends 'count' 32bpp ARGB pixels over a 16bpp destination scan and
// converts the result to 16bpp, adding a dither value to each pixel before
// truncation. The 16bpp layout (5-5-5 or 5-6-5) is fixed by DITHER_BLEND_555
// at the point where this file is included.
//
// dst - destination scan (16bpp). Read back for every pixel that goes
// through a blend path.
// src - not referenced by name in this body; the pixels to blend are
// read through 'bl'.
// NOTE(review): 'd' and 'bl' are presumably declared by
// DEFINE_POINTERS / DEFINE_BLEND_POINTER - confirm against
// those macros.
// count - number of pixels to process; asserted to be non-zero.
// otherParams - supplies X and Y (select the starting dither cell) and
// DoingDither (selects DITHER_ARRAY or DitherNone).
//
// Behavior by source alpha:
// 0xff - the pixel is dithered, converted and written.
// 0x00 - the destination pixel is left untouched by the single-pixel
// routine and by the non-blending pair path.
// otherwise - the destination pixel is expanded to 8 bits per channel,
// blended with the source, dithered and written back.
// When either pixel of a pair has an alpha other than 0x00 or 0xff, both
// destination pixels of that pair are run through the blend and rewritten.
//
// The blend is S + (255 - alpha) * D / 255 (premultiplied source) unless
// NO_PREMULTIPLIED_ALPHA is set, in which case it is D + alpha * (S - D) / 255.
//
// The body is compiled only when _X86_ is defined; otherwise the function
// does nothing.
VOID FASTCALL
DITHERBLEND_FUNC(
VOID *dst,
const VOID *src,
INT count,
const OtherParams *otherParams
)
{
#if defined(_X86_)
DEFINE_POINTERS(ARGB, WORD);
DEFINE_BLEND_POINTER(ARGB);
ASSERT(count != 0);
ASSERT(otherParams);
// Keeps the top 5 bits of the blue byte and of the red byte of two 32bpp
// pixels (green and alpha bytes are cleared).
static ULONGLONG redBlueMask = 0x00f800f800f800f8;
// XORed with a word-replicated alpha to turn it into (255 - alpha).
static ULONGLONG flipAlphaBits = 0x00ff00ff00ff00ff;
#if DITHER_BLEND_555
// greenMask keeps the top 5 bits of the green byte of two 32bpp pixels.
// redBlueMultiplier is the pmaddwd constant: blue word * 1 + red word * 2^10.
static ULONGLONG greenMask = 0x0000f8000000f800;
static ULONGLONG redBlueMultiplier = 0x0400000104000001;
#else
// greenMask keeps the top 6 bits of the green byte of two 32bpp pixels.
// redBlueMultiplier is the pmaddwd constant: blue word * 1 + red word * 2^11.
static ULONGLONG greenMask = 0x0000fc000000fc00;
static ULONGLONG redBlueMultiplier = 0x0800000108000001;
#endif
INT x = otherParams->X;
INT y = otherParams->Y;
// Start of the dither row for this scan; rows are 8 UINT32 entries apart
// and selected by (y & 3). DitherNone is used when dithering is off.
UINT32 *dither = (otherParams->DoingDither)
? &DITHER_ARRAY[8 * (y & 3)]
: &DitherNone[0];
// Byte offset into the dither row for the current x position:
// (x & 3) entries of 4 bytes each. Advanced by the alignment loop below.
UINT32 ditherIncrement = (x & 3) * 4;
const ARGB *blendPixel = bl;
_asm
{
; ecx = count
; esi = source
; edi = destination
; mm4 = red and blue mask (0xf800f8)
; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5)
; mm6 = C1 | C0 dither
; mm7 = C3 | C2 dither
mov eax, ditherIncrement
mov esi, blendPixel
mov edi, d
mov ecx, count
movq mm4, redBlueMask
movq mm5, greenMask
; We always want our qword reads from the screen to be aligned.
; So if the initial pixel is not qword-aligned, we handle up to
; three pixels up front to make it qword-aligned.
;
; (Note that as a consequence of us aligning to the destination,
; we're often doing unaligned reads on the source. But it's
; a much bigger performance win to align operations to the screen
; than to system memory, due to the terrible screen read
; performance.)
alignment_loop:
; On entry eax = ditherIncrement. Load the dither values for the
; current x position; the flags from the alignment test survive the
; two movq loads and are consumed by the jz.
add eax, dither
test edi, 6
movq mm6, [eax]
movq mm7, [eax+8]
jz done_start_alignment
call do_single_pixel
; Adjust our pointers and load our new dither values:
; (eax was clobbered by do_single_pixel, so reload the offset; it
; advances one 4-byte entry and wraps after four entries)
mov eax, ditherIncrement
add eax, 4
and eax, 0x0000000F
mov ditherIncrement, eax
add esi, 4
add edi, 2
dec ecx
jz all_done
jmp alignment_loop
done_start_alignment:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
do_main_loop:
sub ecx, 4 ; pre-decrement by 4
jl do_pair ; fewer than 4 pixels left
; We do chunks of 4 pixels at a time so that we can unroll our
; dither loop (our dither repeats every 4 pixels).
do_main_loop_2:
; AND the four alpha bytes together: the result is 0xff only if every
; one of the four pixels is opaque.
mov al, [esi+3]
and al, [esi+7]
and al, [esi+11]
and al, [esi+15]
inc al ; if all alphas were 0xff, this
jnz do_pair ; will wrap to zero
; The four pixels starting at [esi] are opaque. We only need to
; dither them and convert to 16bpp. The following codepath will
; process all four in parallel (two at a time) in order to optimize
; usage of the execution units and minimize dependencies between
; consecutive instructions.
; We start by reading the four pixels into mm0 and mm1, adding
; the dither component, and then breaking into group 0 (pixels 0
; and 2) and group 1 (pixels 1 and 3). I will use **0** and **1**
; in the comments below to show which pixel group the instruction is
; processing
movq mm0, [esi] ; mm0 = DW1 | DW0
movq mm1, [esi + 8] ; mm1 = DW3 | DW2
paddusb mm0, mm6 ; add dither
movq mm2, mm0
paddusb mm1, mm7 ; add dither
add edi, 8 ; advance early; the store below uses [edi - 8]
punpckhdq mm2, mm1 ; **1** mm2 = DW3 | DW1
punpckldq mm0, mm1 ; **0** mm0 = DW2 | DW0
movq mm3, mm2 ; **1**
pand mm2, mm4 ; **1** red and blue
movq mm1, mm0 ; **0**
pand mm0, mm4 ; **0** red and blue
pand mm3, mm5 ; **1** green
psrlw mm0, 3 ; **0** shift red and blue to lowest
; 5 bits in register
; Note the use of the pmaddwd to simultaneously shift both the red and
; blue bits into their appropriate positions. The constant
; redBlueMultiplier contains four shorts, each of which is equal to
; 2^i where i is the number of bits that we need to shift that color
; component by in order to attain the correct position in the 16bpp
; color. This is possible only because the red and blue
; components lie on different shorts in the 64bits register (green has
; been masked earlier), and so we can dedicate an entire 16bit short
; to red and to blue.
;
; Group 1 skips the psrlw by 3, so its packed pixels come out shifted
; left by 3 bits; its green is therefore shifted by GREEN_SHIFT-3 and
; the final psllq by 13 completes the move into the upper word of
; each dword.
pmaddwd mm2, redBlueMultiplier ; **1**
add esi, 16
pand mm1, mm5 ; **0** green
psrld mm3, GREEN_SHIFT-3 ; **1**
pmaddwd mm0, redBlueMultiplier ; **0**
sub ecx, 4 ; pre-decrement for next iteration
por mm2, mm3 ; **1** combine green with red/blue
; mm2 = (W3 << 3) | (W1 << 3), one per dword
psrld mm1, GREEN_SHIFT ; **0**
psllq mm2, 13 ; **1** mm2 = W3 | 0 | W1 | 0
por mm0, mm1 ; **0** combine green with red/blue
; mm0 = 0 | W2 | 0 | W0
por mm0, mm2 ; mm0 = W3 | W2 | W1 | W0
movq [edi - 8], mm0
jge do_main_loop_2 ; flags are from the sub ecx, 4 above
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
do_pair:
; On entry ecx = (pixels remaining) - 4.
add ecx, 2 ; ecx = (pixels remaining) - 2
jl do_last_pixel ; fewer than 2 pixels left
; We're doing only a single pair of pixels, so swap our dither
; values in preparation for the next iteration:
; (after the swap mm7 holds the dither for this pair and mm6 the
; dither for the pixels that follow it)
pxor mm6, mm7
pxor mm7, mm6
pxor mm6, mm7 ; swap mm6 and mm7
; alpha + 1 is 0 for an opaque pixel and 1 for a transparent one;
; anything above 1 means the pixel needs a real blend.
mov al, [esi+3]
inc al
cmp al, 1
ja do_pair_blend
mov al, [esi+7]
inc al
cmp al, 1
ja do_pair_blend
; Both alphas are now known to be either 0x00 or 0xff. If both are
; zero there is nothing to write.
mov al, [esi+3] ; Do we really want this here?
or al, [esi+7]
jz do_pair_done
movq mm0, [esi]
paddusb mm0, mm7 ; add dither
movq mm2, mm0
pand mm0, mm5 ; green
pand mm2, mm4 ; red and blue
psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
movq mm3, mm2
psrld mm3, BLUE_SHIFT ; blue
psrld mm2, RED_SHIFT ; red (9 for 5-5-5)
por mm0, mm3
por mm0, mm2 ; mm0 = X | C1 | X | C0
movq mm1, mm0
psrlq mm1, 32
punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
movd eax, mm0 ; eax = C1 | C0
; Write only the pixels that are opaque; a transparent pixel leaves
; its destination untouched.
cmp byte ptr [esi+3], 0
je do_pair_done_first_write
mov [edi], ax
do_pair_done_first_write:
cmp byte ptr [esi+7], 0
je do_pair_done_second_write
shr eax, 16
mov [edi+2], ax
do_pair_done_second_write:
add edi, 4
add esi, 8
jmp do_main_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
do_pair_blend:
; At least one pixel of the pair is partially transparent. Both
; destination pixels are read, blended and written back.
movd mm1, [edi] ; read destination, X | X | C1 | C0
punpcklwd mm1, mm1 ; C1 | C1 | C0 | C0
psrld mm1, 16 ; 0 | C1 | 0 | C0
; (trick using single red and
; blue mask requires high bits
; to be zero)
; Expand the two 16bpp destination pixels back to 32bpp layout by
; shifting each channel up to its 8-bit position and masking.
movq mm0, mm1
movq mm2, mm1
pslld mm1, BLUE_SHIFT ; blue
pslld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
pslld mm2, RED_SHIFT ; red (9 for 5-5-5)
por mm1, mm2 ; combine red and blue
pand mm1, mm4 ; leave valid red and blue bits
pand mm0, mm5 ; leave valid green bits
por mm1, mm0 ; mm1 = C1 | C0
; Okay now we've got the destination read and split. Handle the first
; blend:
movd mm2, [esi]
punpcklbw mm2, mm2
psrlw mm2, 8 ; mm2 = S (one channel per word)
movq mm3, mm2
punpckhwd mm3, mm3
punpckhdq mm3, mm3 ; mm3 = alpha (replicated in all four words)
movq mm0, mm1
punpcklbw mm0, mm0
psrlw mm0, 8 ; mm0 = D (one channel per word)
#if NO_PREMULTIPLIED_ALPHA
psubw mm2, mm0
pmullw mm2, mm3 ; mm2 = alpha * (S - D)
movq mm3, mm2
psrlw mm3, 8
paddw mm2, mm3 ; approximate x/255 by x*257/65536
psrlw mm2, 8 ; mm2 = alpha * (S - D) / 255
paddb mm0, mm2 ; mm0 = C0 = D + alpha * (S - D)
#else
pxor mm3, flipAlphaBits ; mm3 = 255 - alpha
pmullw mm0, mm3 ; mm0 = (255 - alpha) * D
movq mm3, mm0
psrlw mm0, 8 ; approximate x/255 by x*257/65536
paddw mm0, mm3 ; mm0 = x + (x >> 8)
psrlw mm0, 8 ; don't care about rounding, not enough bits
paddb mm0, mm2 ; mm0 = C0 = S + (1 - alpha) * D
#endif
; Handle the second blend (change mm0 to mm1):
movd mm2, [esi+4]
punpcklbw mm2, mm2
psrlw mm2, 8 ; mm2 = S (one channel per word)
movq mm3, mm2
punpckhwd mm3, mm3
punpckhdq mm3, mm3 ; mm3 = alpha (replicated in all four words)
punpckhbw mm1, mm1
psrlw mm1, 8 ; mm1 = D (one channel per word)
#if NO_PREMULTIPLIED_ALPHA
psubw mm2, mm1
pmullw mm2, mm3 ; mm2 = alpha * (S - D)
movq mm3, mm2
psrlw mm3, 8
paddw mm2, mm3 ; approximate x/255 by x*257/65536
psrlw mm2, 8 ; mm2 = alpha * (S - D) / 255
paddb mm1, mm2 ; mm1 = C1 = D + alpha * (S - D)
#else
pxor mm3, flipAlphaBits ; mm3 = 255 - alpha
pmullw mm1, mm3 ; mm1 = (255 - alpha) * D
movq mm3, mm1
psrlw mm1, 8 ; approximate x/255 by x*257/65536
paddw mm1, mm3 ; mm1 = x + (x >> 8)
psrlw mm1, 8 ; don't care about rounding, not enough bits
paddb mm1, mm2 ; mm1 = C1 = S + (1 - alpha) * D
#endif
packuswb mm0, mm1 ; mm0 = C1 | C0 (back to one byte per channel)
; Dither and pack everything back up:
paddusb mm0, mm7 ; add dither
movq mm2, mm0
pand mm0, mm5 ; green
pand mm2, mm4 ; red and blue
psrld mm0, GREEN_SHIFT ; green
movq mm3, mm2
psrld mm3, BLUE_SHIFT ; blue
psrld mm2, RED_SHIFT ; red
por mm0, mm3
por mm0, mm2 ; mm0 = X | C1 | X | C0
movq mm1, mm0
psrlq mm1, 32 ; mm1 = 0 | 0 | X | C1
punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
movd [edi], mm0 ; writes both destination pixels
do_pair_done:
add edi, 4
add esi, 8
jmp do_main_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Local subroutine, reached only through 'call'. Processes the one pixel
; at [esi] into [edi] using the dither value in the low dword of mm6.
; Does not advance esi or edi and does not touch ecx. Clobbers eax and
; mm0-mm3.
do_single_pixel:
movd mm0, [esi]
mov al, [esi+3]
inc al
jnz do_single_blend ; if not completely opaque
paddusb mm0, mm6 ; add dither
movq mm2, mm0
pand mm0, mm5 ; green
pand mm2, mm4 ; red and blue
psrld mm0, GREEN_SHIFT
movq mm3, mm2
psrld mm3, BLUE_SHIFT
psrld mm2, RED_SHIFT
por mm0, mm3
por mm0, mm2 ; low dword of mm0 = X | C0
movd eax, mm0
mov [edi], ax
do_single_done:
ret
do_single_blend:
dec al ; al = alpha again
jz do_single_done ; completely transparent pixel
; alpha is between 0 and 255
movzx eax, word ptr [edi] ; do the destination read
movd mm1, eax ; mm1 = 0 | 0 | 0 | C0
movq mm0, mm1
movq mm2, mm1
pslld mm1, BLUE_SHIFT ; blue
pslld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
pslld mm2, RED_SHIFT ; red (9 for 5-5-5)
por mm1, mm2 ; combine red and blue
pand mm1, mm4 ; leave valid red and blue bits
pand mm0, mm5 ; leave valid green bits
por mm1, mm0 ; mm1 = 0 | C0 (destination in 32bpp layout)
; Okay now we've got the destination read and split. Handle the blend:
movd mm2, [esi]
punpcklbw mm2, mm2
psrlw mm2, 8 ; mm2 = S (one channel per word)
movq mm3, mm2
punpckhwd mm3, mm3
punpckhdq mm3, mm3 ; mm3 = alpha (replicated in all four words)
movq mm0, mm1
punpcklbw mm0, mm0
psrlw mm0, 8 ; mm0 = D (one channel per word)
#if NO_PREMULTIPLIED_ALPHA
psubw mm2, mm0
pmullw mm2, mm3 ; mm2 = alpha * (S - D)
movq mm3, mm2
psrlw mm3, 8
paddw mm2, mm3 ; approximate x/255 by x*257/65536
psrlw mm2, 8 ; mm2 = alpha * (S - D) / 255
paddb mm0, mm2 ; mm0 = C0 = D + alpha * (S - D)
#else
pxor mm3, flipAlphaBits ; mm3 = 255 - alpha
pmullw mm0, mm3 ; mm0 = (255 - alpha) * D
movq mm3, mm0
psrlw mm0, 8 ; approximate x/255 by x*257/65536
paddw mm0, mm3 ; mm0 = x + (x >> 8)
psrlw mm0, 8 ; don't care about rounding, not enough bits
paddb mm0, mm2 ; mm0 = C0 = S + (1 - alpha) * D
#endif
packuswb mm0, mm0 ; mm0 = C0 | C0 (back to one byte per channel)
; Dither and pack everything back up:
paddusb mm0, mm6 ; add dither
movq mm2, mm0
pand mm0, mm5 ; green
pand mm2, mm4 ; red and blue
psrld mm0, GREEN_SHIFT
movq mm3, mm2
psrld mm3, BLUE_SHIFT
psrld mm2, RED_SHIFT
por mm0, mm3
por mm0, mm2 ; low dword of mm0 = X | C0
movd eax, mm0
mov [edi], ax
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
do_last_pixel:
; On entry ecx = (pixels remaining) - 2, i.e. -1 if one pixel is left
; and -2 if none are, so bit 0 says whether a pixel remains.
test ecx, 1
jz all_done
call do_single_pixel
all_done:
emms
}
#endif
}
// Dither to 16bpp using MMX
//
// Converts 'count' 32bpp pixels to 16bpp (5-5-5 or 5-6-5, fixed by
// DITHER_BLEND_555 where this file is included), adding a dither value to
// each pixel before the low bits are truncated. The alpha byte of the source
// is not examined; every pixel is written.
//
// dst - destination scan (16bpp), written through 'd'
// src - source scan (32bpp), read through 's'
// NOTE(review): 's' and 'd' are presumably declared by
// DEFINE_POINTERS from src and dst - confirm against that macro.
// count - number of pixels to convert; asserted to be non-zero
// otherParams - supplies X and Y (select the starting dither cell) and
// DoingDither (selects DITHER_ARRAY or DitherNone)
//
// The body is compiled only when _X86_ is defined; otherwise the function
// does nothing.
VOID FASTCALL
DITHER_FUNC(
    VOID *dst,
    const VOID *src,
    INT count,
    const OtherParams *otherParams
    )
{
#if defined(_X86_)
    DEFINE_POINTERS(ARGB, WORD);
    ASSERT(count != 0);
    ASSERT(otherParams);

    // Keeps the top 5 bits of the blue byte and of the red byte of two
    // 32bpp pixels (green and alpha bytes are cleared).
    static ULONGLONG redBlueMask = 0x00f800f800f800f8;
#if DITHER_BLEND_555
    // Keeps the top 5 bits of the green byte of two 32bpp pixels.
    static ULONGLONG greenMask = 0x0000f8000000f800;
#else
    // Keeps the top 6 bits of the green byte of two 32bpp pixels.
    static ULONGLONG greenMask = 0x0000fc000000fc00;
#endif

    INT x = otherParams->X;
    INT y = otherParams->Y;

    // First dither entry for this scan: rows are 8 UINT32 entries apart and
    // selected by (y & 3); (x & 3) picks the starting column. Four
    // consecutive entries are read from here.
    UINT32 *dither = (otherParams->DoingDither)
                   ? &DITHER_ARRAY[8 * (y & 3) + (x & 3)]
                   : &DitherNone[0];

    _asm
    {
        ; ecx = count
        ; esi = source
        ; edi = destination
        ; mm4 = red and blue mask (0xf800f8)
        ; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5)
        ; mm6 = C1 | C0 dither
        ; mm7 = C3 | C2 dither

        mov eax, dither
        mov esi, s
        mov edi, d
        mov ecx, count
        movq mm4, redBlueMask
        movq mm5, greenMask
        movq mm6, [eax]
        movq mm7, [eax+8]
        sub ecx, 4 ; pre-decrement by 4
        jl do_last_3_pixels_or_less

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; We do chunks of 4 pixels at a time so that we can unroll our
        ; dither loop (our dither repeats every 4 pixels).

    do_main_loop:
        ; Pixels 0 and 1 of the group, dithered with mm6.
        movq mm0, [esi]
        paddusb mm0, mm6 ; add dither
        movq mm2, mm0
        pand mm0, mm5 ; green
        pand mm2, mm4 ; red and blue
        psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
        movq mm3, mm2
        psrld mm3, BLUE_SHIFT ; blue
        psrld mm2, RED_SHIFT ; red (9 for 5-5-5)
        por mm0, mm3
        por mm0, mm2 ; mm0 = X | C1 | X | C0
        movq mm1, mm0
        psrlq mm1, 32 ; mm1 = 0 | 0 | X | C1
        punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
        movd [edi], mm0

        ; Pixels 2 and 3 of the group, dithered with mm7.
        movq mm0, [esi+8]
        paddusb mm0, mm7 ; add dither
        movq mm2, mm0
        pand mm0, mm5 ; green
        pand mm2, mm4 ; red and blue
        psrld mm0, GREEN_SHIFT
        movq mm3, mm2
        psrld mm3, BLUE_SHIFT
        psrld mm2, RED_SHIFT
        por mm0, mm3
        por mm0, mm2 ; mm0 = X | C3 | X | C2
        movq mm1, mm0
        psrlq mm1, 32
        punpcklwd mm0, mm1 ; mm0 = X | X | C3 | C2
        movd [edi+4], mm0

        add edi, 8
        add esi, 16
        sub ecx, 4 ; pre-decrement for next iteration
        jge do_main_loop

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    do_last_3_pixels_or_less:
        add ecx, 4 ; get back 'real' count
        jz all_done
        dec ecx
        jnz do_last_pair

        ; Exactly 1 pixel left. It is the first pixel of its dither group,
        ; so it needs the dither value held in the low dword of mm6.
        ; do_last_pixel adds mm7 (correct for the third pixel of a group,
        ; which is how the 3-pixel case reaches it), so hand it mm6 instead.
        movq mm7, mm6
        jmp do_last_pixel

    do_last_pair:
        ; do 2 pixels
        ; we'll decrement ecx again later
        movq mm0, [esi]
        paddusb mm0, mm6 ; add dither
        movq mm2, mm0
        pand mm0, mm5 ; green
        pand mm2, mm4 ; red and blue
        psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
        movq mm3, mm2
        psrld mm3, BLUE_SHIFT ; blue
        psrld mm2, RED_SHIFT ; red (9 for 5-5-5)
        por mm0, mm3
        por mm0, mm2 ; mm0 = X | C1 | X | C0
        movq mm1, mm0
        psrlq mm1, 32 ; mm1 = 0 | 0 | X | C1
        punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
        movd eax, mm0
        mov [edi], eax
        dec ecx
        jz all_done ; there were exactly 2 pixels
        add esi, 8
        add edi, 4

    do_last_pixel:
        ; One final pixel at [esi]; its dither is in the low dword of mm7.
        movd mm0, [esi]
        paddusb mm0, mm7 ; add dither
        movq mm2, mm0
        pand mm0, mm5 ; green
        pand mm2, mm4 ; red and blue
        psrld mm0, GREEN_SHIFT
        movq mm3, mm2
        psrld mm3, BLUE_SHIFT
        psrld mm2, RED_SHIFT
        por mm0, mm3
        por mm0, mm2 ; low dword of mm0 = X | C0
        movq mm1, mm0
        psrlq mm1, 32
        punpcklwd mm0, mm1 ; low word of mm0 is still C0
        movd eax, mm0
        mov [edi], ax ; write just the one 16bpp pixel

    all_done:
        emms
    }
#endif
}