Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

704 lines
26 KiB

;-------------------------------------------------------------------------
; INTEL Corporation Proprietary Information
;
; This listing is supplied under the terms of a license
; agreement with INTEL Corporation and may not be copied
; nor disclosed except in accordance with the terms of
; that agreement.
;
; Copyright (c) 1996 Intel Corporation.
; All Rights Reserved.
;
;-------------------------------------------------------------------------
;-------------------------------------------------------------------------
;//
;// $Header: S:\h26x\src\dec\cx512161.asv
;//
;// $Log: S:\h26x\src\dec\cxm12161.asv $
;//
;// Rev 1.9 24 May 1996 11:12:10 AGUPTA2
;//
;// Modified version of final drop from IDC. Fixed alignment, global var,
;// referencing beyond stack pointer problems. Cosmetic changes to adhere
;// to a common coding convention in all MMX color convertor files.
;//
;// Rev 1.8 17 Apr 1996 09:51:08 ISRAELH
;// Added AspectRatio adjustment, emms.
;//
;// Rev 1.7 11 Apr 1996 09:51:08 RMCKENZX
;// Changed return to pop the stack.
;//
;// Rev 1.6 09 Apr 1996 10:00:44 RMCKENZX
;//
;// Changed calling sequence to __stdcall.
;//
;// Rev 1.5 05 Apr 1996 10:40:20 RMCKENZX
;// Hacked in Aspect Ratio correction. This is accomplished
;// by simply overwriting the next even line after the aspect
;// count has been matched or exceeded.
;//
;// Rev 1.4 29 Mar 1996 07:52:56 RMCKENZX
;// re-fixed bug in 655 setup.
;//
;// Rev 1.3 28 Mar 1996 14:35:38 RMCKENZX
;// Cleaned up code, added comments, revised calling sequence,
;// moved global variables onto stack.
;//
;// Rev 1.2 21 Mar 1996 08:10:06 RMCKENZX
;// Fixed 655 case -- initialized GLeftShift at 5.
;//
;// Rev 1.1 20 Mar 1996 11:18:52 RMCKENZX
;// March 96 version.
;
; Rev 1.3 19 Feb 1996 11:49:42 israelh
; bug fix.
; new algorithm for RGB16 bit pack.
;
; Rev 1.3 18 Feb 1996 20:58:44 israelh
; better algorithm and bug fix
;
; Rev 1.2 29 Jan 1996 19:53:50 mikeh
;
; added Ifdef timing
;
; Rev 1.1 29 Jan 1996 16:29:16 mikeh
; removed $LOG stuff
;
; Rev 1.0 29 Jan 1996 11:49:44 israelh
; Initial revision.
;//
;// MMX 1.3 14 Jan 1996 IsraelH
;// Implementing runtime RGB bit allocation according to BValLo[0]:
;// It contains the ColorConvertor value from d1color.cpp module.
;// Compiler flag RTIME16 for using runtime allocation.
;//
;// MMX 1.2 10 Jan 1996 IsraelH
;// Implementing RGB16x565 (5-R 6-G 5-B) as default
;// Compiler flag MODE555 for RGB16555 (5-R 5-G 5-B)
;//
;// MMX 1.1 09 Jan 1996 IsraelH
;// Implementing RGB16x555 (5-R 5-G 5-B)
;// Commenting out RGB16664 (6-R 6-G 4-B)
;// Adding performance measurements in runtime
;//
;// MMX 1.0 25 Dec 1995 IsraelH
;// Port to MMX(TM) without using tables
;
;-------------------------------------------------------------------------
;
; +---------- Color convertor.
; |+--------- For both H261 and H263.
; ||+-------- MMx Version.
; |||++------ Convert from YUV12.
; |||||++---- Convert to RGB16.
; |||||||+--- Zoom by one, i.e. non-zoom.
; ||||||||
; cxm12161 -- This function performs YUV12-to-RGB16 color conversion for H26x.
; It handles any format in which there are three fields, the low
; order field being B and fully contained in the low order byte, the
; second field being G and being somewhere in bits 4 through 11,
; and the high order field being R and fully contained in the high
; order byte.
;
; The YUV12 input is planar, 8 bits per pel. The Y plane may have
; a pitch of up to 768. It may have a width less than or equal
; to the pitch. It must be DWORD aligned, and preferably QWORD
; aligned. Pitch and Width must be a multiple of four. For best
; performance, Pitch should not be 4 more than a multiple of 32.
; Height may be any amount, but must be a multiple of two. The U
; and V planes may have a different pitch than the Y plane, subject
; to the same limitations.
;
OPTION CASEMAP:NONE
OPTION PROLOGUE:None
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
.586
.xlist
include iammx.inc
include memmodel.inc
.list
; Declare the code and data segments once with their attributes
; (paragraph aligned, 32-bit, public); they are re-opened by name below.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS
; 64-bit packed operands used by MMX_YUV12ToRGB16. Each symbol is two
; DWORDs so it can be used directly as a quadword memory operand.
; The trailing hex pairs on some lines are the original author's alternate
; values (NOTE(review): presumably the same weights at a larger scale).
MMXDATA1 SEGMENT
ALIGN 8
Minusg DWORD 00800080h, 00800080h ; 4 words of 128: subtracted from the unpacked U and V words
Yadd DWORD 10101010h, 10101010h ; 8 bytes of 16: subtracted (with unsigned saturation) from Y
VtR DWORD 00660066h, 00660066h ; 4 words of 102: multiplies V-128 for the R term ;01990199h,01990199h
VtG DWORD 00340034h, 00340034h ; 4 words of 52: not referenced by the procedure in this file ;00d000d0h,00d000d0h
UtG DWORD 00190019h, 00190019h ; 4 words of 25: not referenced by the procedure in this file ;00640064h,00640064h
UtB DWORD 00810081h, 00810081h ; 4 words of 129: multiplies U-128 for the B term ;02050205h,02050205h
Ymul DWORD 004a004ah, 004a004ah ; 4 words of 74: multiplies Y-16 ;012a012ah,012a012ah
UVtG DWORD 00340019h, 00340019h ; word pairs (25, 52): pmaddwd operand applied to interleaved (U, V) for the G term ;00d00064h,00d00064h
VtRUtB DWORD 01990205h, 01990205h ; not referenced by the procedure in this file
fourbitu DWORD 0f0f0f0f0h, 0f0f0f0f0h ; bytes of 0F0h: add/subtract-with-saturation clamp to 0..15
fivebitu DWORD 0e0e0e0e0h, 0e0e0e0e0h ; bytes of 0E0h: add/subtract-with-saturation clamp to 0..31
sixbitu DWORD 0c0c0c0c0h, 0c0c0c0c0h ; bytes of 0C0h: add/subtract-with-saturation clamp to 0..63
MMXDATA1 ENDS
MMXCODE1 SEGMENT
;-------------------------------------------------------------------------
; MMX_YUV12ToRGB16
;
; Converts a planar YUV12 frame to 16-bit RGB. Two luma lines (sharing one
; chroma line) are converted per pass, eight pixels per inner-loop
; iteration. The exact bit layout of the output pixel is selected at run
; time from the low byte of CCType (see the RGB16xxx codes below).
;
; Arguments (13 DWORDs on the stack, in declaration order):
;   YPlane, VPlane, UPlane           - addresses of the three input planes
;   FrameWidth, FrameHeight          - frame size in pixels / lines
;   YPitch, ChromaPitch              - bytes between lines of Y and of U/V
;   AspectAdjustmentCount            - line-drop period; a value of 1 makes
;                                      the procedure return without output
;   ColorConvertedFrame, DCIOffset,
;   CCOffsetToLine0                  - summed to form the first output address
;   CCOPitch                         - bytes between output lines
;   CCType                           - output format selector
;
; Registers: esi, edi, ebp and ebx are pushed in the prologue and popped at
; finish. eax, ecx, edx and mm0-mm7 are overwritten.
; NOTE(review): the calling convention comes from the DIST/LANG macros in
; memmodel.inc, which are not visible here; the revision log says __stdcall.
;-------------------------------------------------------------------------
MMX_YUV12ToRGB16 PROC DIST LANG PUBLIC,
AYPlane: DWORD,
AVPlane: DWORD,
AUPlane: DWORD,
AFrameWidth: DWORD,
AFrameHeight: DWORD,
AYPitch: DWORD,
AVPitch: DWORD,
AAspectAdjustmentCnt: DWORD,
AColorConvertedFrame: DWORD,
ADCIOffset: DWORD,
ACCOffsetToLine0: DWORD,
ACCOPitch: DWORD,
ACCType: DWORD
; Bytes reserved for locals before esp is rounded down to a 32-byte boundary.
LocalFrameSize = 256
; Four registers (esi, edi, ebp, ebx) are pushed before ebp is loaded from esp.
RegisterStorageSize = 16
; ebp + 16 is the return address, so argument_base + 4 is the first argument.
argument_base EQU ebp + RegisterStorageSize
local_base EQU esp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Arguments:
; (valid only while ebp still holds the prologue value)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
YPlane EQU argument_base + 4
VPlane EQU argument_base + 8
UPlane EQU argument_base + 12
FrameWidth EQU argument_base + 16
FrameHeight EQU argument_base + 20
YPitch EQU argument_base + 24
ChromaPitch EQU argument_base + 28
AspectAdjustmentCount EQU argument_base + 32
ColorConvertedFrame EQU argument_base + 36
DCIOffset EQU argument_base + 40
CCOffsetToLine0 EQU argument_base + 44
CCOPitch EQU argument_base + 48
CCType EQU argument_base + 52
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Locals (on local stack frame)
; (local_base is aligned at cache-line boundary in the prologue)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
localFrameWidth EQU local_base + 0 ; FrameWidth copy; later rewritten as -FrameWidth/2
localYPitch EQU local_base + 4 ; YPitch copy
localChromaPitch EQU local_base + 8 ; ChromaPitch copy
localAspectAdjustmentCount EQU local_base + 12 ; AspectAdjustmentCount copy
localCCOPitch EQU local_base + 16 ; CCOPitch copy
CCOCursor EQU local_base + 20 ; first output address
CCOSkipDistance EQU local_base + 24 ; CCOPitch - 2*FrameWidth
YLimit EQU local_base + 28 ; YPlane + FrameHeight*YPitch
DistanceFromVToU EQU local_base + 32 ; UPlane - VPlane
currAspectCount EQU local_base + 36 ; running aspect counter, reduced by 2 per line pair
YCursorEven EQU local_base + 40 ; end-of-row address of the current even luma line
YCursorOdd EQU local_base + 44 ; end-of-row address of the current odd luma line
tmpCCOPitch EQU local_base + 48 ; output offset of the odd line for this pair (CCOPitch or 0)
StashESP EQU local_base + 52 ; stack pointer value to restore at finish
; space for two DWORD locals
temp_mmx EQU local_base + 64 ; note it is 64 bytes
; Per-format shift counts and clamp masks, one quadword each.
RLeftShift EQU local_base +128
GLeftShift EQU local_base +136
RRightShift EQU local_base +144
GRightShift EQU local_base +152
BRightShift EQU local_base +160
RUpperLimit EQU local_base +168
GUpperLimit EQU local_base +176
BUpperLimit EQU local_base +184
; Switches used by RGB color convertors to determine the exact conversion type.
RGB16555 = 9
RGB16664 = 14
RGB16565 = 18
RGB16655 = 22
;-------------------------------------------------------------------------
; Prologue: preserve esi/edi/ebp/ebx, carve out a 32-byte aligned local
; frame, and record the pre-frame stack pointer so finish can unwind it.
;-------------------------------------------------------------------------
        push    esi
        push    edi
        push    ebp
        push    ebx
        mov     ebp, esp                        ; ebp now addresses saved regs and arguments
        sub     esp, LocalFrameSize
        and     esp, -32                        ; round the local frame down to a 32-byte boundary
        mov     [StashESP], ebp
;-------------------------------------------------------------------------
; Copy the arguments that are still needed once ebp has been reused.
;-------------------------------------------------------------------------
        mov     eax, [FrameWidth]
        mov     ebx, [YPitch]
        mov     [localFrameWidth], eax
        mov     [localYPitch], ebx
        mov     eax, [ChromaPitch]
        mov     ebx, [AspectAdjustmentCount]
        mov     [localChromaPitch], eax
        mov     [localAspectAdjustmentCount], ebx
        mov     ebx, [CCOPitch]
        mov     [localCCOPitch], ebx
;-------------------------------------------------------------------------
; Select the per-format setup from the low byte of CCType.
; RGB16555, and any value that matches none of the codes tested here,
; falls through to the RGB555 setup that immediately follows.
;-------------------------------------------------------------------------
        mov     al, [CCType]
        cmp     al, RGB16664
        je      RGB664
        cmp     al, RGB16565
        je      RGB565
        cmp     al, RGB16655
        je      RGB655
RGB555:
; 5 bits each for R, G and B: every component is shifted right by 9 and
; clamped with fivebitu. R is moved up 2 bits inside the high byte
; (10-8), G is moved up 5 bits inside the 16-bit pixel.
        xor     eax, eax
        mov     [RLeftShift+4], eax             ; clear the high dword of each shift count
        mov     [GLeftShift+4], eax
        mov     [RRightShift+4], eax
        mov     [GRightShift+4], eax
        mov     [BRightShift+4], eax
        mov     ebx, 2
        mov     [RLeftShift], ebx
        mov     ebx, 5
        mov     [GLeftShift], ebx
        mov     ebx, 9
        mov     [RRightShift], ebx
        mov     [GRightShift], ebx
        mov     [BRightShift], ebx
        movq    mm0, fivebitu
        movq    [RUpperLimit], mm0
        movq    [GUpperLimit], mm0
        movq    [BUpperLimit], mm0
        jmp     RGBEND
RGB664:
; 6 bits R, 6 bits G, 4 bits B: R and G are shifted right by 8 and clamped
; with sixbitu, B is shifted right by 10 and clamped with fourbitu.
; R is moved up 2 bits inside the high byte (8-6), G up 4 bits.
        xor     eax, eax
        mov     [RLeftShift+4], eax             ; clear the high dword of each shift count
        mov     [GLeftShift+4], eax
        mov     [RRightShift+4], eax
        mov     [GRightShift+4], eax
        mov     [BRightShift+4], eax
        mov     ebx, 2
        mov     [RLeftShift], ebx
        mov     ebx, 4
        mov     [GLeftShift], ebx
        mov     ebx, 8
        mov     [RRightShift], ebx
        mov     [GRightShift], ebx
        mov     ebx, 10
        mov     [BRightShift], ebx
        movq    mm0, sixbitu
        movq    [RUpperLimit], mm0
        movq    [GUpperLimit], mm0
        movq    mm0, fourbitu
        movq    [BUpperLimit], mm0
        jmp     RGBEND
RGB565:
; 5 bits R, 6 bits G, 5 bits B: R and B are shifted right by 9 and clamped
; with fivebitu, G is shifted right by 8 and clamped with sixbitu.
; R is moved up 3 bits inside the high byte (8-5), G up 5 bits.
        xor     eax, eax
        mov     [RLeftShift+4], eax             ; clear the high dword of each shift count
        mov     [GLeftShift+4], eax
        mov     [RRightShift+4], eax
        mov     [GRightShift+4], eax
        mov     [BRightShift+4], eax
        mov     ebx, 3
        mov     [RLeftShift], ebx
        mov     ebx, 5
        mov     [GLeftShift], ebx
        mov     ebx, 9
        mov     [RRightShift], ebx
        mov     [BRightShift], ebx
        mov     ebx, 8
        mov     [GRightShift], ebx
        movq    mm0, fivebitu
        movq    [RUpperLimit], mm0
        movq    [BUpperLimit], mm0
        movq    mm0, sixbitu
        movq    [GUpperLimit], mm0
        jmp     RGBEND
RGB655:
; 6 bits R, 5 bits G, 5 bits B: R is shifted right by 8 and clamped with
; sixbitu, G and B are shifted right by 9 and clamped with fivebitu.
; R is moved up 2 bits inside the high byte (8-6), G up 5 bits.
        xor     eax, eax
        mov     [RLeftShift+4], eax             ; clear the high dword of each shift count
        mov     [GLeftShift+4], eax
        mov     [RRightShift+4], eax
        mov     [GRightShift+4], eax
        mov     [BRightShift+4], eax
        mov     ebx, 2
        mov     [RLeftShift], ebx
        mov     ebx, 5
        mov     [GLeftShift], ebx
        mov     ebx, 8
        mov     [RRightShift], ebx
        mov     ebx, 9
        mov     [GRightShift], ebx
        mov     [BRightShift], ebx
        movq    mm0, sixbitu
        movq    [RUpperLimit], mm0
        movq    mm0, fivebitu
        movq    [GUpperLimit], mm0
        movq    [BUpperLimit], mm0
        jmp     RGBEND
RGBEND:
;-------------------------------------------------------------------------
; Derive the remaining frame locals from the arguments, then load the
; registers the conversion loop expects:
;   esi = VPlane + FrameWidth/2      edx = UPlane + FrameWidth/2
;   edi = first output address       ebx = -FrameWidth/2
;-------------------------------------------------------------------------
        mov     ecx, [UPlane]
        sub     ecx, [VPlane]
        mov     [DistanceFromVToU], ecx         ; UPlane - VPlane

        mov     eax, [ColorConvertedFrame]
        add     eax, [DCIOffset]
        add     eax, [CCOffsetToLine0]
        mov     [CCOCursor], eax                ; address of the first output pixel

        mov     edx, [FrameHeight]
        mov     ecx, [YPitch]
        imul    edx, ecx                        ; FrameHeight*YPitch
        add     edx, [YPlane]
        mov     [YLimit], edx                   ; YPlane + size of the Y array

        mov     ebx, [FrameWidth]
        mov     eax, [CCOPitch]
        sub     eax, ebx
        sub     eax, ebx
        mov     [CCOSkipDistance], eax          ; CCOPitch - 2*FrameWidth

        mov     edx, [AspectAdjustmentCount]
        cmp     edx, 1
        je      finish                          ; a count of 1 produces no output at all
        mov     [currAspectCount], edx
        mov     [localAspectAdjustmentCount], edx

        mov     edi, [CCOCursor]
        mov     esi, [VPlane]
        mov     edx, [DistanceFromVToU]
        mov     ebp, [YPlane]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ebp no longer addresses the arguments: use only locals from here on
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        mov     ebx, [localFrameWidth]
        mov     eax, [localYPitch]
        add     ebp, ebx
        mov     [YCursorEven], ebp              ; YPlane + FrameWidth
        add     ebp, eax
        mov     [YCursorOdd], ebp               ; YPlane + FrameWidth + YPitch
        sar     ebx, 1                          ; FrameWidth/2
        add     esi, ebx                        ; VPlane + FrameWidth/2
        add     edx, esi                        ; UPlane + FrameWidth/2
        neg     ebx
        mov     [localFrameWidth], ebx          ; -FrameWidth/2
;------------------------------------------------------------------------------
; PrepareChromaLine: start a new pair of luma lines.
;
; Register usage for the loop that follows:
;   ebx - negative chroma index, reloaded here as -FrameWidth/2
;   esi - end-of-row address in the V plane
;   edx - end-of-row address in the U plane
;   edi - output cursor for the even line
;   ebp - luma row cursor (reloaded inside the loop)
;   eax - scratch here; the loop reloads it with tmpCCOPitch
;
; The aspect counter is reduced by 2 for every line pair. While the result
; stays above zero the odd line is written one output pitch below the even
; line. Otherwise the counter is topped up with the adjustment count and
; tmpCCOPitch is set to 0, so the odd line lands on top of the even line.
;------------------------------------------------------------------------------
PrepareChromaLine:
        mov     eax, [localCCOPitch]
        mov     ebx, [localFrameWidth]
        mov     [tmpCCOPitch], eax              ; default: odd line goes one pitch further
        mov     ebp, [currAspectCount]
        sub     ebp, 2
        ja      continue
        xor     eax, eax
        add     ebp, [localAspectAdjustmentCount]
        mov     [tmpCCOPitch], eax              ; odd line overwrites the even line
continue:
        mov     [currAspectCount], ebp
;------------------------------------------------------------------------------
; do_next_8x2_block: convert 8 pixels of the even line and the 8 pixels
; below them on the odd line, using 4 U and 4 V samples.
;
; On entry: ebx = negative chroma index (counts up to 0 in steps of 4),
;           edx/esi = end-of-row addresses in the U/V planes,
;           edi = output address for these 8 even-line pixels.
; The chroma products computed for the even line are kept in temp_mmx and
; reused for the odd line:
;   temp_mmx+0  v-128          temp_mmx+8  G chroma term
;   temp_mmx+16 low B chroma   temp_mmx+40 high B chroma
;   temp_mmx+24 low R chroma   temp_mmx+32 high R chroma
;
; The B sums use paddsw rather than paddw: 74*(Y-16) + 129*(U-128) can
; exceed 32767 for very bright, very blue input (e.g. Y=255, U=255), and a
; wrapping add would turn that into a negative word that packs to B=0.
; With the saturating add it packs to the maximum instead. For all sums
; that fit in a signed word the result is unchanged.
;------------------------------------------------------------------------------
do_next_8x2_block:
mov ebp, [YCursorEven] ; end-of-row address of the even luma line
; here is even line
movdt mm1, [edx+ebx] ; 4 u values
pxor mm0, mm0 ; mm0=0
movdt mm2, [esi+ebx] ; 4 v values
punpcklbw mm1, mm0 ; widen the 4 u bytes to words
psubw mm1, Minusg ; 4 words of u-128
punpcklbw mm2, mm0 ; widen the 4 v bytes to words
psubw mm2, Minusg ; 4 words of v-128
movq mm3, mm1 ; copy of u-128 for the high (u,v) pairs
movq mm5, mm1 ; copy of u-128 for the B chroma term
punpcklwd mm1, mm2 ; low pairs: u0 v0 u1 v1
pmaddwd mm1, UVtG ; per pair: 25*u + 52*v (dwords)
punpckhwd mm3, mm2 ; high pairs: u2 v2 u3 v3
pmaddwd mm3, UVtG ; per pair: 25*u + 52*v (dwords)
;
movq [temp_mmx], mm2 ; save v-128
;
movq mm6, [ebp+2*ebx] ; mm6 has 8 y pixels
;
psubusb mm6, Yadd ; mm6 has 8 y-16 pixels (saturated at 0)
packssdw mm1, mm3 ; 4 G chroma terms as signed words
movq mm7, mm6 ; save the 8 y-16 pixels
punpcklbw mm6, mm0 ; mm6 has 4 low y-16 as words
pmullw mm6, Ymul ; low 4 luma terms: 74*(y-16)
punpckhbw mm7, mm0 ; mm7 has 4 high y-16 as words
pmullw mm7, Ymul ; high 4 luma terms: 74*(y-16)
movq mm4, mm1
movq [temp_mmx+8], mm1 ; save 4 chroma G values
punpcklwd mm1, mm1 ; chroma G replicate low 2 (one per pixel pair)
movq mm0, mm6 ; low y
punpckhwd mm4, mm4 ; chroma G replicate high 2
movq mm3, mm7 ; high y
psubw mm6, mm1 ; 4 low G
psraw mm6, [GRightShift] ; scale low G by the per-format shift count
psubw mm7, mm4 ; 4 high G values in signed 16 bit
movq mm2, mm5
punpcklwd mm5, mm5 ; replicate the 2 low u pixels
pmullw mm5, UtB ; low B chroma: 129*(u-128)
punpckhwd mm2, mm2 ; replicate the 2 high u pixels
psraw mm7, [GRightShift] ; scale high G by the per-format shift count
pmullw mm2, UtB ; high B chroma: 129*(u-128)
packuswb mm6, mm7 ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
;
movq [temp_mmx+16], mm5 ; low chroma B
paddsw mm5, mm0 ; 4 low B values in signed 16 bit (saturating, see header)
movq [temp_mmx+40], mm2 ; high chroma B
paddsw mm2, mm3 ; 4 high B values in signed 16 bit (saturating, see header)
psraw mm5, [BRightShift] ; scale low B by the per-format shift count
;
psraw mm2, [BRightShift] ; scale high B by the per-format shift count
;
packuswb mm5, mm2 ; mm5: B7 B6 B5 B4 B3 B2 B1 B0
;
movq mm2, [temp_mmx] ; 4 v values
movq mm1, mm5 ; save B
movq mm7, mm2
punpcklwd mm2, mm2 ; replicate the 2 low v pixels
pmullw mm2, VtR ; low R chroma: 102*(v-128)
punpckhwd mm7, mm7 ; replicate the 2 high v pixels
pmullw mm7, VtR ; high R chroma: 102*(v-128)
;
paddusb mm1, [BUpperLimit] ; mm1: first half of the B clamp (saturating add)
;
movq [temp_mmx+24], mm2 ; low chroma R
;
paddw mm2, mm0 ; 4 low R values in signed 16 bit
;
psraw mm2, [RRightShift] ; scale low R by the per-format shift count
pxor mm4, mm4 ; mm4=0 for 8->16 conversion
movq [temp_mmx+32], mm7 ; high chroma R
paddw mm7, mm3 ; 4 high R values in signed 16 bit
psraw mm7, [RRightShift] ; scale high R by the per-format shift count
;
psubusb mm1, [BUpperLimit] ; second half of the B clamp: B now within its field width
packuswb mm2, mm7 ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
paddusb mm6, [GUpperLimit] ; G clamp, saturating add
;
psubusb mm6, [GUpperLimit] ; G clamp, saturating subtract
;
paddusb mm2, [RUpperLimit] ; R clamp, saturating add
;
psubusb mm2, [RUpperLimit] ; R clamp, saturating subtract
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; here we are packing from RGB24 to RGB16
; input:
; mm6: G7 G6 G5 G4 G3 G2 G1 G0
; mm1: B7 B6 B5 B4 B3 B2 B1 B0
; mm2: R7 R6 R5 R4 R3 R2 R1 R0
; assuming 8 original pixels in 0-H representation on mm6, mm1, mm2
; when H=2**xBITS-1 (x is for R G B)
; output:
; mm1- result: 4 low RGB16
; mm7- result: 4 high RGB16
; using: mm0- zero register
; mm3- temporary results
; algorithm:
; for (i=0; i<8; i++) {
; RGB[i]=256*(R[i]<<RLeftShift)+(G[i]<<GLeftShift)+B[i];
; }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
psllq mm2, [RLeftShift] ; position R in the most significant part of the byte
movq mm7, mm1 ; mm7: Save B
; note: no need for shift to place B on the least significant part of the byte
; R in left position, B in the right position so they can be combined
punpcklbw mm1, mm2 ; mm1: 4 low 16 bit RB
pxor mm0, mm0 ; mm0: 0
punpckhbw mm7, mm2 ; mm7: 4 high 16 bit RB
movq mm3, mm6 ; mm3: G
punpcklbw mm6, mm0 ; mm6: low 4 G 16 bit
;
psllw mm6, [GLeftShift] ; shift low G into its field
;
punpckhbw mm3, mm0 ; mm3: high 4 G 16 bit
por mm1, mm6 ; mm1: low RBG16
psllw mm3, [GLeftShift] ; shift high G into its field
;
por mm7, mm3 ; mm7: high RBG16
;
mov ebp, [YCursorOdd] ; moved to here to save cycles before odd line
;
movq [edi], mm1 ; !! aligned
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;- start odd line
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movq mm1, [ebp+2*ebx] ; mm1 has 8 y pixels
pxor mm2, mm2
psubusb mm1, Yadd ; mm1 has 8 pixels y-16
;
movq mm5, mm1
punpcklbw mm1, mm2 ; get 4 low y-16 unsign pixels word
pmullw mm1, Ymul ; low 4 luminance contribution
punpckhbw mm5, mm2 ; 4 high y-16
pmullw mm5, Ymul ; high 4 luminance contribution
;
movq [edi+8], mm7 ; !! aligned (high 4 pixels of the even line)
movq mm0, mm1
paddw mm0, [temp_mmx+24] ; low 4 R
movq mm6, mm5
psraw mm0, [RRightShift] ; scale low R by the per-format shift count
;
paddw mm5, [temp_mmx+32] ; high 4 R
movq mm2, mm1
psraw mm5, [RRightShift] ; scale high R by the per-format shift count
;
paddsw mm2, [temp_mmx+16] ; low 4 B (saturating, see header)
packuswb mm0, mm5 ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
psraw mm2, [BRightShift] ; scale low B by the per-format shift count
movq mm5, mm6
paddsw mm6, [temp_mmx+40] ; high 4 B (saturating, see header)
;
psraw mm6, [BRightShift] ; scale high B by the per-format shift count
;
movq mm3, [temp_mmx+8] ; the 4 chroma G values saved on the even line
;
packuswb mm2, mm6 ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
movq mm4, mm3
punpcklwd mm3, mm3 ; replicate low 2
;
punpckhwd mm4, mm4 ; replicate high 2
psubw mm1, mm3 ; 4 low G
psraw mm1, [GRightShift] ; scale low G by the per-format shift count
psubw mm5, mm4 ; 4 high G values in signed 16 bit
psraw mm5, [GRightShift] ; scale high G by the per-format shift count
;
paddusb mm2, [BUpperLimit] ; mm2: B clamp, saturating add
packuswb mm1, mm5 ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
psubusb mm2, [BUpperLimit] ; B clamp, saturating subtract
;
paddusb mm1, [GUpperLimit] ; G
;
psubusb mm1, [GUpperLimit]
;
paddusb mm0, [RUpperLimit] ; R
;
mov eax, [tmpCCOPitch] ; output offset of the odd line (0 when the line is being dropped)
;
psubusb mm0, [RUpperLimit]
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; here we are packing from RGB24 to RGB16
; mm1: G7 G6 G5 G4 G3 G2 G1 G0
; mm2: B7 B6 B5 B4 B3 B2 B1 B0
; mm0: R7 R6 R5 R4 R3 R2 R1 R0
; output:
; mm2- result: 4 low RGB16
; mm7- result: 4 high RGB16
; using: mm4- zero register
; mm3- temporary results
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
psllq mm0, [RLeftShift] ; position R in the most significant part of the byte
movq mm7, mm2 ; mm7: Save B
; note: no need for shift to place B on the least significant part of the byte
; R in left position, B in the right position so they can be combined
punpcklbw mm2, mm0 ; mm2: 4 low 16 bit RB
pxor mm4, mm4 ; mm4: 0
movq mm3, mm1 ; mm3: G
punpckhbw mm7, mm0 ; mm7: 4 high 16 bit RB
punpcklbw mm1, mm4 ; mm1: low 4 G 16 bit
;
punpckhbw mm3, mm4 ; mm3: high 4 G 16 bit
;
psllw mm1, [GLeftShift] ; shift low G into its field
por mm2, mm1 ; mm2: low RBG16
psllw mm3, [GLeftShift] ; shift high G into its field
;
por mm7, mm3 ; mm7: high RBG16
;
movq [edi+eax], mm2 ; low 4 pixels of the odd line
;
movq [edi+eax+8], mm7 ; aligned
;
add edi, 16 ; ih take 16 bytes (8 pixels-16 bit)
add ebx, 4 ; 4 chroma samples consumed (8 luma pixels per line)
jl do_next_8x2_block ; loop while the negative chroma index has not reached 0
;-------------------------------------------------------------------------
; End of a line pair: move every cursor down to the next pair.
;   edi - output cursor; skips the rest of the even line, then the odd
;         line (tmpCCOPitch is 0 when the odd line was dropped)
;   esi - V row cursor, advanced by the chroma pitch
;   edx - U row cursor, advanced by the chroma pitch
;   ebp - holds the odd-line luma cursor on entry; one pitch further is
;         the next even line, two pitches further the next odd line
; The loop repeats while the new odd-line cursor is not beyond YLimit.
; eax and ecx are scratch.
;-------------------------------------------------------------------------
        add     edi, [CCOSkipDistance]          ; go to the beginning of the next even line
        add     edi, [tmpCCOPitch]              ; skip the odd line
        mov     ecx, [localChromaPitch]
        add     esi, ecx
        add     edx, ecx
        mov     eax, [localYPitch]
        add     ebp, eax
        mov     [YCursorEven], ebp              ; save even line address
        add     ebp, eax
        mov     [YCursorOdd], ebp               ; save odd line address
        mov     eax, [YLimit]                   ; Done with last line?
        cmp     ebp, eax
        jbe     PrepareChromaLine
; ADDedi CCOSkipDistance ; go to begin of next line
; ADDedi tmpCCOPitch ; skip odd line (if it is needed)
; Leax YPitch
; Lebp YCursorOdd
; add ebp, eax ; skip one line
; Sebp YCursorEven
;
; add ebp, eax ; skip one line
; Sebp tmpYCursorOdd
; ADDesi ChromaPitch
; ADDedx ChromaPitch
; Leax YLimit ; Done with last line?
; cmp ebp, eax
; jbe PrepareChromaLine
;-------------------------------------------------------------------------
; finish: common exit, reached after the last line pair and from the early
; exit taken when the aspect adjustment count is 1.
;
; MMX registers have been written on both paths (the per-format setup
; loads mm0), so the MMX state is cleared with emms before returning.
; Without it, x87 floating-point code run by the caller afterwards would
; find the register tags still marked in use.
; NOTE(review): assumes iammx.inc provides emms, as it provides the other
; MMX mnemonics used here. If every caller already issues its own emms,
; this one is redundant but harmless.
;
; esp is then restored from StashESP, which discards the aligned local
; frame, and the four registers saved by the prologue are popped in
; reverse order. The ret is expanded by the epilogue macro selected with
; OPTION EPILOGUE at the top of the file.
;-------------------------------------------------------------------------
finish:
        emms
        mov     esp, [StashESP]
        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
        ret
MMX_YUV12ToRGB16 ENDP
MMXCODE1 ENDS
END