|
|
;-------------------------------------------------------------------------
;    INTEL Corporation Proprietary Information
;
;    This listing is supplied under the terms of a license
;    agreement with INTEL Corporation and may not be copied
;    nor disclosed except in accordance with the terms of
;    that agreement.
;
;    Copyright (c) 1996 Intel Corporation.
;    All Rights Reserved.
;
;-------------------------------------------------------------------------
;//
;// $Header:   S:\h26x\src\dec\cx512241.asv
;//
;// $Log:   S:\h26x\src\dec\cxm12241.asv  $
;//
;//    Rev 1.7   28 May 1996 17:57:10   AGUPTA2
;// Cosmetic changes to adhere to common coding convention in all MMX
;// color convertors plus bug fixes.
;//
;//
;//    Rev 1.2   26 Mar 1996 11:15:30   RMCKENZX
;//
;// Changed calling sequence to MMX_..., changed parameters to
;// new type (eliminated YUV base, etc.).  Put data in MMXDATA1 segment
;// and code in MMXCODE1 segment.  Cleaned and commented code.
;//
;//    Rev 1.1   20 Mar 1996 11:19:20   RMCKENZX
;// March 96 version.
;
;    Rev 1.3   18 Feb 1996 20:57:18   israelh
; new mmx version
;
;    Rev 1.2   29 Jan 1996 19:53:52   mikeh
;
; added Ifdef timing
;
;    Rev 1.1   29 Jan 1996 16:29:16   mikeh
; removed $LOG stuff
;
;    Rev 1.0   29 Jan 1996 11:49:48   israelh
; Initial revision.
;//
;//
;// MMX 1.2 26 Jan 1996 IsraelH
;// Optimized code.
;// Adding runtime performance measurements
;//
;// MMX 1.1 23 Dec 1995 IsraelH
;// Using direct calculations with 10.6 precision.
;// Using 8x2 loop to use the same U,V contributions for both of the lines.
;//
;// MMX 1.0 16 Dec 1995 IsraelH
;// Port to MMX(TM) without using look up tables
;//
;-------------------------------------------------------------------------
;
; +---------- Color convertor.
; |+--------- For both H261 and H263.
; ||+-------- MMx Version.
; |||++------ Convert from YUV12.
; |||||++---- Convert to RGB24.
; |||||||+--- Zoom by one, i.e. non-zoom.
; ||||||||
; cxm12241 -- This function performs YUV12-to-RGB24 color conversion for H26x.
;             It handles the format in which the low order byte is B, the
;             second byte is G, and the high order byte is R.
;
;             The YUV12 input is planar, 8 bits per pel.  The Y plane may have
;             a pitch of up to 768.  It may have a width less than or equal
;             to the pitch.  It must be DWORD aligned, and preferably QWORD
;             aligned.  Pitch and Width must be a multiple of 8.  The U
;             and V planes may have a different pitch than the Y plane, subject
;             to the same limitations.
;
;-------------------------------------------------------------------------
; Assembler configuration
;   CASEMAP:NONE  - symbol names are case sensitive.
;   PROLOGUE:None - MASM generates no entry code for PROC; the routine in
;                   this file builds its own frame by hand.
;   EPILOGUE      - RET inside a PROC is expanded through
;                   ReturnAndRelieveEpilogueMacro (not defined in this file;
;                   presumably supplied by memmodel.inc -- confirm).
;-------------------------------------------------------------------------
OPTION CASEMAP:NONE
OPTION PROLOGUE:None
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro

.586                                    ; Pentium instruction set

; Keep the contents of the include files out of the listing.
.xlist
include iammx.inc
include memmodel.inc
.list

; Declare both segments once with their full attributes; later
; "SEGMENT" statements reopen them by name only.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS
MMXDATA1 SEGMENT
ALIGN 8

;-------------------------------------------------------------------------
; Constants for the direct RGB calculation.  Each entry is one quadword
; written as two DWORDs (low DWORD first).
;
; Coefficients are four packed words in 10.6 fixed point (value * 64).
; The symbols are private to this file; the export list below is disabled.
;-------------------------------------------------------------------------
;PUBLIC Minusg, VtR, VtG, UtG, UtB, Ymul, Yadd, UVtG, lowrgb, lowrgbn, highp,
;       highpn, highwn, mzero

; --- bias removal -------------------------------------------------------
Minusg          DWORD   00800080h, 00800080h    ; 4 x 128 : chroma bias (words)
Yadd            DWORD   10101010h, 10101010h    ; 8 x 16  : luma bias (bytes)

; --- 10.6 fixed-point multipliers ---------------------------------------
Ymul            DWORD   004a004ah, 004a004ah    ; 4 x 74  : luma scale
VtR             DWORD   00660066h, 00660066h    ; 4 x 102 : V contribution to R
UtB             DWORD   00810081h, 00810081h    ; 4 x 129 : U contribution to B
VtG             DWORD   00340034h, 00340034h    ; 4 x 52  : V contribution to G
UtG             DWORD   00190019h, 00190019h    ; 4 x 25  : U contribution to G
UVtG            DWORD   00340019h, 00340019h    ; (52,25) pairs for PMADDWD on
                                                ; interleaved V:U words

; --- byte masks used while packing B,G,R triples --------------------------
lowrgb          DWORD   00ffffffh, 00000000h    ; keep bytes 0-2
lowrgbn         DWORD   0ff000000h, 0ffffffffh  ; clear bytes 0-2
highp           DWORD   00000000h, 0ff000000h   ; keep byte 7
highpn          DWORD   0ffffffffh, 00ffffffh   ; clear byte 7
highwn          DWORD   0ffffffffh, 0000ffffh   ; clear bytes 6-7
mzero           DWORD   00000000h, 00000000h    ; all zero

MMXDATA1 ENDS
MMXCODE1 SEGMENT

;-----------------------------------------------------------------------------
; MMX_YUV12ToRGB24
;
; Converts a planar YUV12 frame to packed 24-bit pixels stored B,G,R in
; memory.  Two luma lines are produced per pass so that each line pair
; shares one line of chroma; the inner loop handles 8 pels of the even line
; and the 8 pels directly below them (24 output bytes per line).
;
; Arithmetic (10.6 fixed point, constants in MMXDATA1):
;       Yl = (Y - 16, clipped at 0) * Ymul
;       G  = (Yl - ((U-128)*25 + (V-128)*52)) >> 6
;       B  = (Yl + (U-128)*UtB) >> 6
;       R  = (Yl + (V-128)*VtR) >> 6
; each result being clamped to 0..255 by PACKUSWB.
;
; Arguments: thirteen DWORDs on the stack in the order of the PROC
; statement.  The procedure pushes esi, edi, ebp and ebx itself (automatic
; prologue generation is disabled), so the return address lies at [ebp+16]
; and the first argument at [ebp+20].  ADCIOffset and ACCOffsetToLine0 are
; added to AColorConvertedFrame to form the output pointer.  ACCType is not
; read.
;
; Behaviour worth knowing:
;   * If AAspectAdjustmentCnt is 1 the routine returns without writing.
;   * The row loop and the block loop both test at the bottom, so at least
;     one 8x2 block is always processed once the loop is entered.
;   * Every second line pair counter expiry (see PrepareChromaLine) makes
;     the odd line overwrite the even line instead of occupying its own
;     output row.
;   * NOTE(review): no EMMS is executed here, so the MMX state is left
;     live on return; presumably the caller issues EMMS -- confirm.
;   * movdt / movdf come from iammx.inc (presumably 32-bit moves to / from
;     an MMX register -- confirm against that include).
;
; Fix in this revision: the blue channel sum uses PADDSW instead of PADDW.
; Yl can reach 239*74 = 17686 and (U-128)*129 can reach 16383, so the sum
; can reach 34069, which does not fit a signed word.  With the wrapping
; PADDW such pixels went negative and were clamped to 0 instead of 255.
; PADDSW saturates at 32767, which still clamps to 255 after the shift.
; Red (max 30640) and green (max 27542) cannot overflow and are unchanged.
; PADDSW is assumed to be provided by iammx.inc like the other MMX
; mnemonics used here.
;-----------------------------------------------------------------------------
MMX_YUV12ToRGB24 PROC DIST LANG PUBLIC,
  AYPlane:              DWORD,
  AVPlane:              DWORD,
  AUPlane:              DWORD,
  AFrameWidth:          DWORD,
  AFrameHeight:         DWORD,
  AYPitch:              DWORD,
  AVPitch:              DWORD,
  AAspectAdjustmentCnt: DWORD,
  AColorConvertedFrame: DWORD,
  ADCIOffset:           DWORD,
  ACCOffsetToLine0:     DWORD,
  ACCOPitch:            DWORD,
  ACCType:              DWORD

LocalFrameSize          =       128
RegisterStorageSize     =       16              ; four pushed registers
argument_base           EQU     ebp + RegisterStorageSize
local_base              EQU     esp

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Arguments (valid only while ebp still holds the frame pointer)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
YPlane                     EQU  argument_base +  4
VPlane                     EQU  argument_base +  8
UPlane                     EQU  argument_base + 12
FrameWidth                 EQU  argument_base + 16
FrameHeight                EQU  argument_base + 20
YPitch                     EQU  argument_base + 24
ChromaPitch                EQU  argument_base + 28
AspectAdjustmentCount      EQU  argument_base + 32
ColorConvertedFrame        EQU  argument_base + 36
DCIOffset                  EQU  argument_base + 40
CCOffsetToLine0            EQU  argument_base + 44
CCOPitch                   EQU  argument_base + 48
CCType                     EQU  argument_base + 52

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Locals, addressed from esp after it has been rounded down to a
; 32-byte boundary in the prologue.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
localFrameWidth            EQU  local_base +  0 ; later holds -(FrameWidth/2)
localYPitch                EQU  local_base +  4
localChromaPitch           EQU  local_base +  8
localAspectAdjustmentCount EQU  local_base + 12
localCCOPitch              EQU  local_base + 16
CCOCursor                  EQU  local_base + 20
CCOSkipDistance            EQU  local_base + 24 ; CCOPitch - 3*FrameWidth
YLimit                     EQU  local_base + 28 ; YPlane + FrameHeight*YPitch
DistanceFromVToU           EQU  local_base + 32
currAspectCount            EQU  local_base + 36
YCursorEven                EQU  local_base + 40
YCursorOdd                 EQU  local_base + 44
tmpCCOPitch                EQU  local_base + 48 ; odd-line offset for this pair
StashESP                   EQU  local_base + 52
; offsets 56 and 60 are spare (two DWORDs)
temp_mmx                   EQU  local_base + 64 ; 64-byte scratch, QWORD slots:
                                                ;  +0  V words
                                                ;  +8  chroma part of G
                                                ;  +16 B from U, pels 3..0
                                                ;  +24 R from V, pels 3..0
                                                ;  +32 R from V, pels 7..4
                                                ;  +40 B from U, pels 7..4
                                                ;  +48 U words

        push      esi
        push      edi
        push      ebp
        push      ebx
        mov       ebp, esp                      ; frame pointer for argument access
        sub       esp, LocalFrameSize
        and       esp, -32                      ; round frame down to 32 bytes
        mov       [StashESP], ebp               ; esp value to restore on exit

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copy the arguments that are needed after ebp is reused
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        mov       ebx, [FrameWidth]
        mov       [localFrameWidth], ebx
        mov       ebx, [YPitch]
        mov       [localYPitch], ebx
        mov       ebx, [ChromaPitch]
        mov       [localChromaPitch], ebx
        mov       ebx, [AspectAdjustmentCount]
        mov       [localAspectAdjustmentCount], ebx
        mov       ebx, [CCOPitch]
        mov       [localCCOPitch], ebx

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Derive the remaining locals
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        mov       ebx, [VPlane]
        mov       ecx, [UPlane]
        mov       eax, [ColorConvertedFrame]
        sub       ecx, ebx                      ; UPlane - VPlane
        mov       edx, [DCIOffset]
        mov       [DistanceFromVToU], ecx
        mov       ecx, [CCOffsetToLine0]
        add       eax, edx                      ; frame + DCIOffset
        mov       edx, [FrameHeight]
        add       eax, ecx                      ; ... + CCOffsetToLine0
        mov       ecx, [localYPitch]
        mov       [CCOCursor], eax              ; first output byte
        mov       ebx, [localFrameWidth]
        mov       eax, [CCOPitch]
        imul      edx, ecx                      ; FrameHeight * YPitch
        sub       eax, ebx                      ; CCOPitch - FrameWidth
        mov       esi, [YPlane]
        sub       eax, ebx                      ; CCOPitch - 2*FrameWidth
        add       edx, esi                      ; YPlane + size of Y array
        sub       eax, ebx                      ; CCOPitch - 3*FrameWidth
        mov       [YLimit], edx
        mov       [CCOSkipDistance], eax        ; output bytes left over per row
        mov       edx, [localAspectAdjustmentCount]
        mov       esi, [VPlane]
        cmp       edx, 1
        je        finish                        ; count of 1: nothing is written
        mov       [currAspectCount], edx
        mov       eax, [localYPitch]
        mov       edi, [CCOCursor]
        mov       edx, [DistanceFromVToU]
        mov       ebp, [YPlane]                 ; last argument read; ebp is now a cursor
        mov       ebx, [localFrameWidth]
        add       ebp, ebx                      ; YPlane + FrameWidth
        mov       [YCursorEven], ebp
        add       ebp, eax                      ; next line down
        mov       [YCursorOdd], ebp
        sar       ebx, 1                        ; FrameWidth / 2
        add       esi, ebx                      ; VPlane + FrameWidth/2
        add       edx, esi                      ; UPlane + FrameWidth/2
        neg       ebx
        mov       [localFrameWidth], ebx        ; -(FrameWidth/2)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Row-pair loop.
;
; Register roles inside the loops:
;   eax  tmpCCOPitch (reloaded each block for the odd-line stores)
;   ebx  chroma index: starts at -(FrameWidth/2), advances by 4 per
;        block, loop ends when it is no longer negative
;   ecx  not used by the block loop; scratch in the row update
;   edx  U row pointer + FrameWidth/2   (sample = [edx+ebx])
;   esi  V row pointer + FrameWidth/2   (sample = [esi+ebx])
;   ebp  Y row pointer + FrameWidth     (pels   = [ebp+2*ebx])
;   edi  output pointer, advances 24 bytes per block
;
; CAUTION: ebp no longer addresses the arguments from here on.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
PrepareChromaLine:
        mov       ebp, [currAspectCount]
        mov       ebx, [localFrameWidth]        ; restart chroma index
        sub       ebp, 2                        ; flags consumed by JA below
        mov       eax, [localCCOPitch]
        mov       [tmpCCOPitch], eax            ; default: odd line on its own row
        ja        continue                      ; counter still above zero
        xor       eax, eax
        add       ebp, [localAspectAdjustmentCount] ; reload counter
        mov       [tmpCCOPitch], eax            ; odd line lands on the even line

continue:
        mov       [currAspectCount], ebp

do_next_8x2_block:
        ;---------------- even line: chroma terms ----------------
        mov       ebp, [YCursorEven]
        movdt     mm1, [edx+ebx]                ; mm1 = x x x x U76 U54 U32 U10
        pxor      mm0, mm0                      ; mm0 = 0
        movdt     mm2, [esi+ebx]                ; mm2 = x x x x V76 V54 V32 V10
        punpcklbw mm1, mm0                      ; mm1 = .U76 .U54 .U32 .U10
        psubw     mm1, Minusg                   ; U - 128
        punpcklbw mm2, mm0                      ; mm2 = .V76 .V54 .V32 .V10
        psubw     mm2, Minusg                   ; V - 128
        movq      mm3, mm1                      ; mm3 = U words
        movq      [temp_mmx+48], mm1            ; keep U words
        punpcklwd mm1, mm2                      ; mm1 = .V32 .U32 .V10 .U10
        pmaddwd   mm1, UVtG                     ; mm1 = G32, G10 chroma terms (dwords)
        punpckhwd mm3, mm2                      ; mm3 = .V76 .U76 .V54 .U54
        pmaddwd   mm3, UVtG                     ; mm3 = G76, G54 chroma terms (dwords)
        movq      [temp_mmx], mm2               ; keep V words

        ;---------------- even line: luma ------------------------
        movq      mm6, [ebp+2*ebx]              ; mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
        psubusb   mm6, Yadd                     ; Y - 16, clipped at 0
        packssdw  mm1, mm3                      ; mm1 = .G76 .G54 .G32 .G10 (chroma)
        movq      mm7, mm6
        punpcklbw mm6, mm0                      ; mm6 = ..Y3 ..Y2 ..Y1 ..Y0
        pmullw    mm6, Ymul                     ; scaled luma, pels 3..0
        punpckhbw mm7, mm0                      ; mm7 = ..Y7 ..Y6 ..Y5 ..Y4
        pmullw    mm7, Ymul                     ; scaled luma, pels 7..4

        ;---------------- even line: green -----------------------
        movq      mm4, mm1
        movq      [temp_mmx+8], mm1             ; keep chroma part of G
        punpcklwd mm1, mm1                      ; mm1 = .G32 .G32 .G10 .G10
        punpckhwd mm4, mm4                      ; mm4 = .G76 .G76 .G54 .G54
        movq      mm0, mm6                      ; mm0 = scaled luma 3..0 (kept for B, R)
        movq      mm3, mm7                      ; mm3 = scaled luma 7..4 (kept for B, R)
        psubw     mm6, mm1                      ; mm6 = G3..G0 scaled
        movq      mm1, [temp_mmx+48]            ; mm1 = U words
        psubw     mm7, mm4                      ; mm7 = G7..G4 scaled
        psraw     mm6, 6                        ; mm6 = G3..G0
        movq      mm2, mm1                      ; mm2 = U words
        punpcklwd mm1, mm1                      ; mm1 = .U32 .U32 .U10 .U10
        pmullw    mm1, UtB                      ; B term from U, pels 3..0
        punpckhwd mm2, mm2                      ; mm2 = .U76 .U76 .U54 .U54
        pmullw    mm2, UtB                      ; B term from U, pels 7..4
        psraw     mm7, 6                        ; mm7 = G7..G4
        packuswb  mm6, mm7                      ; mm6 = G7 G6 G5 G4 G3 G2 G1 G0

        ;---------------- even line: blue ------------------------
        movq      [temp_mmx+16], mm1            ; keep B term 3..0 for odd line
        movq      [temp_mmx+40], mm2            ; keep B term 7..4 for odd line
        paddsw    mm1, mm0                      ; saturating: sum may exceed 32767
        paddsw    mm2, mm3                      ; saturating: sum may exceed 32767
        psraw     mm1, 6                        ; mm1 = B3..B0
        psraw     mm2, 6                        ; mm2 = B7..B4
        packuswb  mm1, mm2                      ; mm1 = B7 B6 B5 B4 B3 B2 B1 B0

        ;---------------- even line: red -------------------------
        movq      mm2, [temp_mmx]               ; mm2 = V words
        movq      mm7, mm2
        punpcklwd mm2, mm2                      ; mm2 = .V32 .V32 .V10 .V10
        pmullw    mm2, VtR                      ; R term from V, pels 3..0
        punpckhwd mm7, mm7                      ; mm7 = .V76 .V76 .V54 .V54
        pmullw    mm7, VtR                      ; R term from V, pels 7..4
        movq      [temp_mmx+24], mm2            ; keep R term 3..0 for odd line
        paddw     mm2, mm0                      ; cannot overflow (max 30640)
        psraw     mm2, 6                        ; mm2 = R3..R0
        movq      [temp_mmx+32], mm7            ; keep R term 7..4 for odd line
        paddw     mm7, mm3
        psraw     mm7, 6                        ; mm7 = R7..R4
        movq      mm5, mm1                      ; mm5 = blue bytes
        packuswb  mm2, mm7                      ; mm2 = R7 R6 R5 R4 R3 R2 R1 R0

        ;---------------- even line: interleave and store --------
        ; red = mm2, green = mm6, blue = mm1 (copy in mm5)
        punpcklbw mm5, mm6                      ; mm5 = G3 B3 G2 B2 G1 B1 G0 B0
        movq      mm4, mm2
        punpcklbw mm4, mm4                      ; mm4 = R3 R3 R2 R2 R1 R1 R0 R0
        movq      mm3, mm5
        punpcklwd mm5, mm4                      ; mm5 = R1 R1 G1 B1 R0 R0 G0 B0
        movq      mm0, mm5
        pand      mm5, lowrgb                   ; mm5 =  0  0  0  0  0 R0 G0 B0
        pand      mm0, lowrgbn                  ; mm0 = R1 R1 G1 B1 R0  0  0  0
        psrlq     mm0, 8                        ; mm0 =  0 R1 R1 G1 B1 R0  0  0
        por       mm0, mm5                      ; mm0 =  x  x R1 G1 B1 R0 G0 B0
        pand      mm0, highwn                   ; mm0 =  0  0 R1 G1 B1 R0 G0 B0
        movq      mm5, mm3
        punpckhwd mm5, mm4                      ; mm5 = R3 R3 G3 B3 R2 R2 G2 B2
        movq      mm4, mm5
        psllq     mm4, 48                       ; mm4 = G2 B2  0  0  0  0  0  0
        por       mm0, mm4                      ; mm0 = G2 B2 R1 G1 B1 R0 G0 B0
        psrlq     mm5, 24                       ; mm5 =  0  0  0 R3 R3 G3 B3 R2
        punpckhbw mm1, mm6                      ; mm1 = G7 B7 G6 B6 G5 B5 G4 B4
        punpckhbw mm2, mm2                      ; mm2 = R7 R7 R6 R6 R5 R5 R4 R4
        movq      [edi], mm0                    ; output bytes 0..7
        movq      mm7, mm1
        punpcklwd mm1, mm2                      ; mm1 = R5 R5 G5 B5 R4 R4 G4 B4
        movq      mm6, mm1
        punpckldq mm5, mm1                      ; mm5 = R4 R4 G4 B4 R3 G3 B3 R2
        pand      mm5, highpn                   ; mm5 =  0 R4 G4 B4 R3 G3 B3 R2
        psllq     mm6, 24                       ; mm6 = B5 R4 R4 G4 B4  0  0  0
        pand      mm6, highp                    ; mm6 = B5  0  0  0  0  0  0  0
        psrlq     mm1, 40                       ; mm1 =  0  0  0  0  0 R5 R5 G5
        mov       ebp, [YCursorOdd]             ; fetched early for the odd line
        por       mm5, mm6                      ; mm5 = B5 R4 G4 B4 R3 G3 B3 R2
        punpckhwd mm7, mm2                      ; mm7 = R7 R7 G7 B7 R6 R6 G6 B6
        punpcklwd mm1, mm7                      ; mm1 =  x  x  x  x G6 B6 R5 G5
        movq      [edi+8], mm5                  ; output bytes 8..15
        movdf     [edi+16], mm1                 ; output bytes 16..19

        ;---------------- odd line: luma -------------------------
        movq      mm1, [ebp+2*ebx]              ; 8 Y bytes of the odd line
        psrlq     mm7, 24                       ; (even line) low dword = R7 G7 B7 R6
        movdf     [edi+20], mm7                 ; (even line) output bytes 20..23
        psubusb   mm1, Yadd                     ; Y - 16, clipped at 0
        movq      mm5, mm1
        punpcklbw mm1, mzero                    ; pels 3..0 as words
        punpckhbw mm5, mzero                    ; pels 7..4 as words
        pmullw    mm1, Ymul                     ; scaled luma 3..0
        pmullw    mm5, Ymul                     ; scaled luma 7..4

        ;---------------- odd line: red --------------------------
        movq      mm0, mm1
        paddw     mm0, [temp_mmx+24]            ; + R term 3..0
        movq      mm6, mm5                      ; mm6 = scaled luma 7..4
        paddw     mm5, [temp_mmx+32]            ; + R term 7..4
        psraw     mm0, 6
        psraw     mm5, 6
        movq      mm2, mm1                      ; mm2 = scaled luma 3..0
        packuswb  mm0, mm5                      ; mm0 = R7 R6 R5 R4 R3 R2 R1 R0

        ;---------------- odd line: blue -------------------------
        paddsw    mm2, [temp_mmx+16]            ; + B term 3..0, saturating
        movq      mm5, mm6
        paddsw    mm5, [temp_mmx+40]            ; + B term 7..4, saturating
        psraw     mm2, 6
        psraw     mm5, 6
        packuswb  mm2, mm5                      ; mm2 = B7 B6 B5 B4 B3 B2 B1 B0

        ;---------------- odd line: green ------------------------
        movq      mm3, [temp_mmx+8]             ; chroma part of G
        movq      mm4, mm3
        punpcklwd mm3, mm3                      ; replicate for pels 3..0
        punpckhwd mm4, mm4                      ; replicate for pels 7..4
        psubw     mm1, mm3
        psubw     mm6, mm4
        psraw     mm1, 6
        psraw     mm6, 6
        packuswb  mm1, mm6                      ; mm1 = G7 G6 G5 G4 G3 G2 G1 G0

        ;---------------- odd line: interleave and store ---------
        ; red = mm0, green = mm1, blue = mm2
        movq      mm3, mm2
        punpcklbw mm3, mm1                      ; mm3 = G3 B3 G2 B2 G1 B1 G0 B0
        movq      mm4, mm0
        punpcklbw mm4, mm4                      ; mm4 = R3 R3 R2 R2 R1 R1 R0 R0
        movq      mm5, mm3
        mov       eax, [tmpCCOPitch]            ; offset of odd line from even line
        punpcklwd mm3, mm4                      ; mm3 = R1 R1 G1 B1 R0 R0 G0 B0
        movq      mm6, mm3
        pand      mm6, lowrgb                   ; mm6 =  0  0  0  0  0 R0 G0 B0
        pand      mm3, lowrgbn                  ; mm3 = R1 R1 G1 B1 R0  0  0  0
        psrlq     mm3, 8                        ; mm3 =  0 R1 R1 G1 B1 R0  0  0
        por       mm3, mm6                      ; mm3 =  x  x R1 G1 B1 R0 G0 B0
        pand      mm3, highwn                   ; mm3 =  0  0 R1 G1 B1 R0 G0 B0
        movq      mm6, mm5
        punpckhwd mm6, mm4                      ; mm6 = R3 R3 G3 B3 R2 R2 G2 B2
        movq      mm4, mm6
        psllq     mm4, 48                       ; mm4 = G2 B2  0  0  0  0  0  0
        por       mm3, mm4                      ; mm3 = G2 B2 R1 G1 B1 R0 G0 B0
        movq      [edi+eax], mm3                ; output bytes 0..7
        psrlq     mm6, 24                       ; mm6 =  0  0  0 R3 R3 G3 B3 R2
        punpckhbw mm2, mm1                      ; mm2 = G7 B7 G6 B6 G5 B5 G4 B4
        punpckhbw mm0, mm0                      ; mm0 = R7 R7 R6 R6 R5 R5 R4 R4
        movq      mm7, mm2
        punpcklwd mm7, mm0                      ; mm7 = R5 R5 G5 B5 R4 R4 G4 B4
        punpckldq mm6, mm7                      ; mm6 = R4 R4 G4 B4 R3 G3 B3 R2
        movq      mm4, mm7
        psllq     mm4, 24                       ; mm4 = B5 R4 R4 G4 B4  0  0  0
        pand      mm6, highpn                   ; mm6 =  0 R4 G4 B4 R3 G3 B3 R2
        psrlq     mm7, 40                       ; mm7 =  0  0  0  0  0 R5 R5 G5
        pand      mm4, highp                    ; mm4 = B5  0  0  0  0  0  0  0
        punpckhwd mm2, mm0                      ; mm2 = R7 R7 G7 B7 R6 R6 G6 B6
        por       mm6, mm4                      ; mm6 = B5 R4 G4 B4 R3 G3 B3 R2
        punpcklwd mm7, mm2                      ; mm7 =  x  x  x  x G6 B6 R5 G5
        psrlq     mm2, 24                       ; mm2 =  0  0  0 R7 R7 G7 B7 R6
        punpckldq mm7, mm2                      ; mm7 = R7 G7 B7 R6 G6 B6 R5 G5
        movq      [edi+eax+8], mm6              ; output bytes 8..15
        movq      [edi+eax+16], mm7             ; output bytes 16..23
        add       edi, 24                       ; 8 pels * 3 bytes
        add       ebx, 4                        ; 4 chroma samples = 8 pels
        jl        do_next_8x2_block

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Advance to the next line pair.
;   edi  -> start of the next even output line
;   esi, edx -> next chroma line
;   YCursorEven / YCursorOdd -> next two luma lines
; ebp holds YCursorOdd on entry; eax and ecx are scratch here.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        mov       ecx, [CCOSkipDistance]
        mov       eax, [localYPitch]
        add       edi, ecx                      ; finish the even output row
        mov       ecx, [tmpCCOPitch]
        add       edi, ecx                      ; skip the odd row (0 if it was dropped)
        mov       ecx, [localChromaPitch]
        add       esi, ecx                      ; next V line
        add       ebp, eax                      ; odd line + pitch = next even line
        mov       [YCursorEven], ebp
        mov       ecx, [localChromaPitch]
        add       edx, ecx                      ; next U line
        add       ebp, eax                      ; next odd line
        mov       [YCursorOdd], ebp
        mov       eax, [YLimit]
        cmp       ebp, eax
        jbe       PrepareChromaLine             ; loop while odd cursor <= YLimit

finish:
        mov       esp, [StashESP]               ; drop the aligned local frame
        pop       ebx
        pop       ebp
        pop       edi
        pop       esi
        ret                                     ; expanded by the EPILOGUE macro

MMX_YUV12ToRGB24 ENDP

MMXCODE1 ENDS

END
|