You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
573 lines
24 KiB
573 lines
24 KiB
;-------------------------------------------------------------------------
|
|
; INTEL Corporation Proprietary Information
|
|
;
|
|
; This listing is supplied under the terms of a license
|
|
; agreement with INTEL Corporation and may not be copied
|
|
; nor disclosed except in accordance with the terms of
|
|
; that agreement.
|
|
;
|
|
; Copyright (c) 1996 Intel Corporation.
|
|
; All Rights Reserved.
|
|
;
|
|
;-------------------------------------------------------------------------
|
|
|
|
;//
|
|
;// $Header: S:\h26x\src\dec\cx512241.asv
|
|
;//
|
|
;// $Log: S:\h26x\src\dec\cxm12241.asv $
|
|
;//
|
|
;// Rev 1.7 28 May 1996 17:57:10 AGUPTA2
|
|
;// Cosmetic changes to adhere to common coding convention in all MMX
|
|
;// color convertors plus bug fixes.
|
|
;//
|
|
;//
|
|
;// Rev 1.2 26 Mar 1996 11:15:30 RMCKENZX
|
|
;//
|
|
;// Changed calling sequence to MMX_..., changed parameters to
|
|
;// new type (eliminated YUV base, etc.). put data in MMXDATA1 segment
|
|
;// and code in MMXCODE1 segment. cleaned and commented code.
|
|
;//
|
|
;// Rev 1.1 20 Mar 1996 11:19:20 RMCKENZX
|
|
;// March 96 version.
|
|
;
|
|
; Rev 1.3 18 Feb 1996 20:57:18 israelh
|
|
; new mmx version
|
|
;
|
|
; Rev 1.2 29 Jan 1996 19:53:52 mikeh
|
|
;
|
|
; added Ifdef timing
|
|
;
|
|
; Rev 1.1 29 Jan 1996 16:29:16 mikeh
|
|
; removed $LOG stuff
|
|
;
|
|
; Rev 1.0 29 Jan 1996 11:49:48 israelh
|
|
; Initial revision.
|
|
;//
|
|
;//
|
|
;// MMX 1.2 26 Jan 1996 IsraelH
|
|
;// Optimized code.
|
|
;// Adding runtime performance measurements
|
|
;//
|
|
;// MMX 1.1 23 Dec 1995 IsraelH
|
|
;// Using direct calculations with 10.6 precision.
|
|
;// Using 8x2 loop to use the same U,V contributions for both lines.
|
|
;//
|
|
;// MMX 1.0 16 Dec 1995 IsraelH
|
|
;// Port to MMX(TM) without using look up tables
|
|
;//
|
|
;-------------------------------------------------------------------------
|
|
;
|
|
; +---------- Color convertor.
|
|
; |+--------- For both H261 and H263.
|
|
; ||+-------- MMx Version.
|
|
; |||++------ Convert from YUV12.
|
|
; |||||++---- Convert to RGB24.
|
|
; |||||||+--- Zoom by one, i.e. non-zoom.
|
|
; ||||||||
|
|
; cxm12241 -- This function performs YUV12-to-RGB24 color conversion for H26x.
|
|
; It handles the format in which the low order byte is B, the
|
|
; second byte is G, and the high order byte is R.
|
|
;
|
|
; The YUV12 input is planar, 8 bits per pel. The Y plane may have
|
|
; a pitch of up to 768. It may have a width less than or equal
|
|
; to the pitch. It must be DWORD aligned, and preferably QWORD
|
|
; aligned. Pitch and Width must be a multiple of 8. The U
|
|
; and V planes may have a different pitch than the Y plane, subject
|
|
; to the same limitations.
|
|
;
|
|
; Assembler set-up.
; CASEMAP:NONE keeps identifiers case-sensitive. PROLOGUE:None turns off the
; automatically generated PROC prologue (the procedure below builds its own
; frame by hand). Every RET is expanded through ReturnAndRelieveEpilogueMacro,
; which is not defined in this file -- presumably it comes from memmodel.inc
; (TODO confirm).
OPTION CASEMAP:NONE
|
|
OPTION PROLOGUE:None
|
|
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
|
|
|
|
; Enable the Pentium instruction set.
.586
|
|
; Listing is switched off while the include files are read, then back on.
; NOTE(review): iammx.inc presumably supplies the MMX mnemonics used below
; (movq, movdt, movdf, punpck*, ...) -- confirm.
.xlist
|
|
include iammx.inc
|
|
include memmodel.inc
|
|
.list
|
|
|
|
; Declare the attributes (paragraph aligned, 32-bit, public) of the code and
; data segments once; they are re-opened by name further down.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
|
|
MMXCODE1 ENDS
|
|
|
|
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
|
|
MMXDATA1 ENDS
|
|
|
|
; Constant table for the colour convertor. Every entry is one QWORD (two
; DWORDs) so that it can be used directly as a 64-bit MMX memory operand;
; ALIGN 8 keeps each entry QWORD aligned. The procedure in this file only
; reads these values.
MMXDATA1 SEGMENT
|
|
ALIGN 8
|
|
;constants for direct RGB calculation: 4x10.6 values
; (the multiplier results are shifted right by 6 in the code, i.e. divided by 64)
|
|
;PUBLIC Minusg, VtR, VtG, UtG, UtB, Ymul, Yadd, UVtG, lowrgb, lowrgbn, highp,
|
|
; highpn, highwn, mzero
|
|
; 4 words of 0080h (128): subtracted from the widened U and V samples.
Minusg DWORD 00800080h, 00800080h
|
|
; 4 words of 0066h (102): multiplies V to give its contribution to R.
VtR DWORD 00660066h, 00660066h
|
|
; 4 words of 0034h (52) and 4 words of 0019h (25): the V and U weights for G.
; Neither is referenced by the procedure visible in this file, which uses the
; packed form UVtG instead.
VtG DWORD 00340034h, 00340034h
|
|
UtG DWORD 00190019h, 00190019h
|
|
; 4 words of 0081h (129): multiplies U to give its contribution to B.
UtB DWORD 00810081h, 00810081h
|
|
; 4 words of 004ah (74): multiplies the unbiased Y samples.
Ymul DWORD 004a004ah, 004a004ah
|
|
; 8 bytes of 10h (16): subtracted from Y with unsigned saturation.
Yadd DWORD 10101010h, 10101010h
|
|
; Word pairs (low word 0019h, high word 0034h) for PMADDWD against
; interleaved U,V words: each DWORD result is 25*U + 52*V.
UVtG DWORD 00340019h, 00340019h
|
|
; Byte masks used while packing pels into 3-byte groups:
; lowrgb keeps bytes 0-2, lowrgbn is its complement (keeps bytes 3-7)
lowrgb DWORD 00ffffffh, 00000000h
|
|
lowrgbn DWORD 0ff000000h, 0ffffffffh
|
|
; highp keeps byte 7 only, highpn is its complement (keeps bytes 0-6)
highp DWORD 00000000h, 0ff000000h
|
|
highpn DWORD 0ffffffffh, 00ffffffh
|
|
; highwn keeps bytes 0-5 (clears the top word)
highwn DWORD 0ffffffffh, 0000ffffh
|
|
; all-zero QWORD, used as the zero source when widening bytes to words
mzero DWORD 00000000h, 00000000h
|
|
MMXDATA1 ENDS
|
|
|
|
MMXCODE1 SEGMENT
|
|
|
|
;-------------------------------------------------------------------------
; MMX_YUV12ToRGB24
;
; Converts a planar YUV12 frame to packed 24-bit RGB (3 bytes per pel, B in
; the lowest address, then G, then R) without zooming.
;
; Arguments (one DWORD each, on the stack in the order declared). The code
; does not use the formal names below; it reads the arguments through the
; ebp-relative EQUates defined after the PROC line.
;   AYPlane               address of the Y plane
;   AVPlane               address of the V plane
;   AUPlane               address of the U plane
;   AFrameWidth           width in pels; the inner loop consumes 8 pels per pass
;   AFrameHeight          height in lines; lines are processed in pairs
;   AYPitch               byte distance between Y lines
;   AVPitch               byte distance between chroma lines; applied to both
;                         the U and the V cursor
;   AAspectAdjustmentCnt  period of the output line-drop logic (see
;                         PrepareChromaLine); a value of 1 returns at once
;                         without writing any output
;   AColorConvertedFrame, ADCIOffset, ACCOffsetToLine0
;                         summed to give the address of the first output line
;   ACCOPitch             byte distance between output lines
;   ACCType               not read by this procedure
;
; Registers: esi, edi, ebp and ebx are saved and restored; eax, ecx, edx and
; mm0-mm7 are overwritten. No return value is set up.
; NOTE(review): no EMMS is executed before returning; presumably the caller
; is responsible for clearing the MMX state -- confirm.
;-------------------------------------------------------------------------
MMX_YUV12ToRGB24 PROC DIST LANG PUBLIC,
|
|
AYPlane: DWORD,
|
|
AVPlane: DWORD,
|
|
AUPlane: DWORD,
|
|
AFrameWidth: DWORD,
|
|
AFrameHeight: DWORD,
|
|
AYPitch: DWORD,
|
|
AVPitch: DWORD,
|
|
AAspectAdjustmentCnt: DWORD,
|
|
AColorConvertedFrame: DWORD,
|
|
ADCIOffset: DWORD,
|
|
ACCOffsetToLine0: DWORD,
|
|
ACCOPitch: DWORD,
|
|
ACCType: DWORD
|
|
|
|
; LocalFrameSize: bytes reserved on the stack for the locals listed below.
; RegisterStorageSize: bytes taken by the four registers pushed in the
; prologue, so [ebp + RegisterStorageSize] is the return address and the
; first argument is 4 bytes above it.
LocalFrameSize = 128
|
|
RegisterStorageSize = 16
|
|
argument_base EQU ebp + RegisterStorageSize
|
|
local_base EQU esp
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; Arguments:
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
YPlane EQU argument_base + 4
|
|
VPlane EQU argument_base + 8
|
|
UPlane EQU argument_base + 12
|
|
FrameWidth EQU argument_base + 16
|
|
FrameHeight EQU argument_base + 20
|
|
YPitch EQU argument_base + 24
|
|
ChromaPitch EQU argument_base + 28
|
|
AspectAdjustmentCount EQU argument_base + 32
|
|
ColorConvertedFrame EQU argument_base + 36
|
|
DCIOffset EQU argument_base + 40
|
|
CCOffsetToLine0 EQU argument_base + 44
|
|
CCOPitch EQU argument_base + 48
|
|
CCType EQU argument_base + 52
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; Locals (on local stack frame)
|
|
; (local_base is aligned at cache-line boundary in the prologue)
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; Note: localFrameWidth first holds the frame width, but is overwritten with
; -FrameWidth/2 (the starting value of the inner-loop index) before the loops.
localFrameWidth EQU local_base + 0
|
|
localYPitch EQU local_base + 4
|
|
localChromaPitch EQU local_base + 8
|
|
localAspectAdjustmentCount EQU local_base + 12
|
|
localCCOPitch EQU local_base + 16
|
|
CCOCursor EQU local_base + 20
|
|
CCOSkipDistance EQU local_base + 24
|
|
YLimit EQU local_base + 28
|
|
DistanceFromVToU EQU local_base + 32
|
|
currAspectCount EQU local_base + 36
|
|
YCursorEven EQU local_base + 40
|
|
YCursorOdd EQU local_base + 44
|
|
tmpCCOPitch EQU local_base + 48
|
|
StashESP EQU local_base + 52
|
|
; space for two DWORD locals
|
|
; temp_mmx holds per-block chroma results that the even line computes and
; the odd line reuses. QWORD slots, as used by the code below:
;   +0  V words (unbiased)          +8  G contribution from chroma
;   +16 B contribution, pels 0-3    +24 R contribution, pels 0-3
;   +32 R contribution, pels 4-7    +40 B contribution, pels 4-7
;   +48 U words (unbiased)
temp_mmx EQU local_base + 64 ; note it is 64 bytes, align at QWORD
|
|
|
|
; Prologue: save the registers that are restored at 'finish', carve out the
; local frame, align it down to a 32-byte boundary and remember the
; pre-alignment stack pointer so it can be restored on exit.
push esi
|
|
push edi
|
|
push ebp
|
|
push ebx
|
|
mov ebp, esp
|
|
sub esp, LocalFrameSize
|
|
and esp, -32 ; align at cache line boundary
|
|
mov [StashESP], ebp
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; Save some parameters on local stack frame
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
mov ebx, [FrameWidth]
|
|
;
|
|
mov [localFrameWidth], ebx
|
|
mov ebx, [YPitch]
|
|
mov [localYPitch], ebx
|
|
mov ebx, [ChromaPitch]
|
|
mov [localChromaPitch], ebx
|
|
mov ebx, [AspectAdjustmentCount]
|
|
mov [localAspectAdjustmentCount], ebx
|
|
mov ebx, [CCOPitch]
|
|
mov [localCCOPitch], ebx
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; Set-up rest of the local stack frame
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
mov ebx, [VPlane]
|
|
mov ecx, [UPlane]
|
|
mov eax, [ColorConvertedFrame]
|
|
sub ecx, ebx
|
|
mov edx, [DCIOffset]
|
|
mov [DistanceFromVToU], ecx ; UPlane - VPlane
|
|
mov ecx, [CCOffsetToLine0]
|
|
add eax, edx ; ColorConvertedFrame+DCIOffset
|
|
mov edx, [FrameHeight]
|
|
add eax, ecx ; ColorConvertedFrame+DCIOffset+CCOffsetToLine0
|
|
mov ecx, [localYPitch]
|
|
mov [CCOCursor],eax ; ColorConvertedFrame+DCIOffset+CCOffsetToLine0
|
|
mov ebx, [localFrameWidth]
|
|
mov eax, [CCOPitch]
|
|
;
|
|
imul edx, ecx ; FrameHeight*YPitch
|
|
;
|
|
sub eax, ebx ; CCOPitch-FrameWidth
|
|
mov esi, [YPlane] ; Fetch cursor over luma plane.
|
|
sub eax, ebx ; CCOPitch-2*FrameWidth
|
|
add edx, esi ; YPlane+Size_of_Y_array
|
|
sub eax, ebx ; CCOPitch-3*FrameWidth
|
|
mov [YLimit], edx ; YPlane+Size_of_Y_array
|
|
; CCOSkipDistance = output pitch minus the 3*FrameWidth bytes written per line
mov [CCOSkipDistance], eax ; CCOPitch-3*FrameWidth
|
|
mov edx, [localAspectAdjustmentCount]
|
|
mov esi, [VPlane]
|
|
; An aspect adjustment count of exactly 1 exits without converting anything.
cmp edx,1
|
|
je finish
|
|
mov [currAspectCount], edx
|
|
mov eax, [localYPitch]
|
|
mov edi, [CCOCursor]
|
|
mov edx, [DistanceFromVToU]
|
|
; Last argument access: ebp is reused as a data pointer from here on.
mov ebp, [YPlane]
|
|
mov ebx, [localFrameWidth]
|
|
; Point the cursors at the END of each line so the inner loop can index
; them with a negative offset that counts up to zero.
add ebp,ebx
|
|
;
|
|
mov [YCursorEven], ebp
|
|
add ebp,eax
|
|
mov [YCursorOdd], ebp
|
|
;
|
|
sar ebx,1
|
|
;
|
|
add esi,ebx
|
|
;
|
|
add edx,esi
|
|
neg ebx
|
|
mov [localFrameWidth], ebx ; -FrameWidth/2
|
|
;
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;
|
|
; The following loops do two lines of Y (one line of UV).
|
|
; The inner loop (do_next_8x2_block) does 8 pels on the even line and
|
|
; the 8 pels immediately below them (sharing the same chroma) on the
|
|
; odd line.
|
|
;
|
|
; Core Register Usage:
|
|
; eax output pitch (for odd line writes)
|
|
; ebx chroma index within the line. Starts at -Width/2, steps by 4 up to 0
|
|
; ecx -- unused --
|
|
; edx U plane base address
|
|
; ebp Y plane base address
|
|
; esi V plane base address
|
|
; edi output RGB plane pointer
|
|
;
|
|
; The YUV plane base addresses are previously biased by +Width (Y) or
|
|
; +Width/2 (U, V); Y is addressed as [ebp+2*ebx], chroma as [base+ebx].
|
|
;
|
|
; CAUTION: Parameters should not be referenced beyond this point.
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
; Start of a line pair. The running aspect count is reduced by 2. While the
; result stays above zero the odd line is written one output pitch below the
; even line. Otherwise the pitch used for the odd line is forced to 0, so the
; odd line lands on top of the even line just written and the output cursor
; only advances by one line (one output line is dropped); the running count
; is then topped up by the aspect adjustment count.
PrepareChromaLine:
|
|
mov ebp, [currAspectCount]
|
|
mov ebx, [localFrameWidth]
|
|
sub ebp, 2
|
|
mov eax, [localCCOPitch]
|
|
mov [tmpCCOPitch], eax
|
|
; the MOVs above leave the flags from 'sub ebp, 2' intact
ja continue
|
|
xor eax, eax
|
|
add ebp, [localAspectAdjustmentCount]
|
|
mov [tmpCCOPitch], eax
|
|
|
|
continue:
|
|
mov [currAspectCount], ebp
|
|
|
|
; Inner loop: one pass converts 8 pels of the even line and the 8 pels below
; them, reading 4 U and 4 V samples and writing 24 bytes to each output line.
; Lane diagrams in the comments list the most significant element first.
do_next_8x2_block:
|
|
mov ebp, [YCursorEven]
|
|
;
|
|
movdt mm1, [edx+ebx] ; mm1 = xxxxxxxx U76 U54 U32 U10
|
|
pxor mm0, mm0 ; mm0 = 0
|
|
movdt mm2, [esi+ebx] ; mm2 = xxxxxxxx V76 V54 V32 V10
|
|
punpcklbw mm1, mm0 ; mm1 = .U76 .U54 .U32 .U10
|
|
psubw mm1, Minusg ; unbias U (sub 128)
|
|
punpcklbw mm2, mm0 ; mm2 = .V76 .V54 .V32 .V10
|
|
psubw mm2, Minusg ; unbias V (sub 128)
|
|
movq mm3, mm1 ; mm3 = .U76 .U54 .U32 .U10
|
|
; *** delay cycle for store ***
|
|
movq [temp_mmx+48], mm1 ; stash .U76 .U54 .U32 .U10
|
|
punpcklwd mm1, mm2 ; mm1 = .V32 .U32 .V10 .U10
|
|
pmaddwd mm1, UVtG ; mm1 = .....G32 .....G10 (from chroma)
|
|
punpckhwd mm3, mm2 ; mm3 = .V76 .U76 .V54 .U54
|
|
pmaddwd mm3, UVtG ; mm3 = .....G76 .....G54 (from chroma)
|
|
;
|
|
movq [temp_mmx], mm2 ; stash .V76 .V54 .V32 .V10
|
|
;
|
|
movq mm6, [ebp+2*ebx] ; mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
|
|
;
|
|
psubusb mm6, Yadd ; unbias Y (sub 16) & clip at 0
|
|
packssdw mm1, mm3 ; mm1 = .G76 .G54 .G32 .G10 (from chroma)
|
|
movq mm7, mm6 ; mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
|
|
punpcklbw mm6, mm0 ; mm6 = ..Y3 ..Y2 ..Y1 ..Y0
|
|
pmullw mm6, Ymul ; mm6 = ..G3 ..G2 ..G1 ..G0 (from luma)
|
|
punpckhbw mm7, mm0 ; mm7 = ..Y7 ..Y6 ..Y5 ..Y4
|
|
pmullw mm7, Ymul ; mm7 = ..G7 ..G6 ..G5 ..G4 (from luma)
|
|
movq mm4, mm1 ; mm4 = .G76 .G54 .G32 .G10 (from chroma)
|
|
movq [temp_mmx+8], mm1 ; stash .G76 .G54 .G32 .G10 (from chroma)
|
|
punpcklwd mm1, mm1 ; mm1 = .G32 .G32 .G10 .G10 (from chroma)
|
|
punpckhwd mm4, mm4 ; mm4 = .G76 .G76 .G54 .G54 (from chroma)
|
|
movq mm0, mm6 ; mm0 = RGB3 RGB2 RGB1 RGB0 (from luma)
|
|
movq mm3, mm7 ; mm3 = RGB7 RGB6 RGB5 RGB4 (from luma)
|
|
; the chroma term for green is subtracted from the luma term
psubw mm6, mm1 ; mm6 = ..G3 ..G2 ..G1 ..G0 (scaled total)
|
|
movq mm1, [temp_mmx+48] ; mm1 = .U76 .U54 .U32 .U10
|
|
psubw mm7, mm4 ; mm7 = ..G7 ..G6 ..G5 ..G4 (scaled total)
|
|
psraw mm6, 6 ; mm6 = ..G3 ..G2 ..G1 ..G0 (total)
|
|
movq mm2, mm1 ; mm2 = .U76 .U54 .U32 .U10
|
|
punpcklwd mm1, mm1 ; mm1 = .U32 .U32 .U10 .U10
|
|
;
|
|
pmullw mm1, UtB ; mm1 = .B32 .B32 .B10 .B10 (from U)
|
|
punpckhwd mm2, mm2 ; mm2 = .U76 .U76 .U54 .U54
|
|
pmullw mm2, UtB ; mm2 = .B76 .B76 .B54 .B54 (from U)
|
|
psraw mm7, 6 ; mm7 = ..G7 ..G6 ..G5 ..G4 (total)
|
|
packuswb mm6, mm7 ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
|
|
; ; -------- green done --------
|
|
movq [temp_mmx+16], mm1 ; stash .B32 .B32 .B10 .B10 (from U)
|
|
;
|
|
movq [temp_mmx+40], mm2 ; stash .B76 .B76 .B54 .B54 (from U)
|
|
paddw mm1, mm0 ; mm1 = ..B3 ..B2 ..B1 ..B0 (scaled total)
|
|
paddw mm2, mm3 ; mm2 = ..B7 ..B6 ..B5 ..B4 (scaled total)
|
|
psraw mm1, 6 ; mm1 = ..B3 ..B2 ..B1 ..B0 (total)
|
|
psraw mm2, 6 ; mm2 = ..B7 ..B6 ..B5 ..B4 (total)
|
|
;
|
|
packuswb mm1, mm2 ; mm1: B7 B6 B5 B4 B3 B2 B1 B0
|
|
; ; -------- blue done --------
|
|
movq mm2, [temp_mmx] ; mm2 = .V76 .V54 .V32 .V10
|
|
;
|
|
movq mm7, mm2 ; mm7 = .V76 .V54 .V32 .V10
|
|
punpcklwd mm2, mm2 ; mm2 = .V32 .V32 .V10 .V10
|
|
pmullw mm2, VtR ; mm2 = .R32 .R32 .R10 .R10 (from V)
|
|
punpckhwd mm7, mm7 ; mm7 = .V76 .V76 .V54 .V54
|
|
pmullw mm7, VtR ; mm7 = .R76 .R76 .R54 .R54 (from V)
|
|
;
|
|
; *** delay for multiply ***
|
|
movq [temp_mmx+24], mm2 ; stash .R32 .R32 .R10 .R10 (from V)
|
|
paddw mm2, mm0 ; mm2 = ..R3 ..R2 ..R1 ..R0 (total scaled)
|
|
psraw mm2, 6 ; mm2 = ..R3 ..R2 ..R1 ..R0 (total)
|
|
;
|
|
movq [temp_mmx+32], mm7 ; stash .R76 .R76 .R54 .R54 (from V)
|
|
paddw mm7, mm3 ; mm7 = ..R7 ..R6 ..R5 ..R4 (total scaled)
|
|
psraw mm7, 6 ; mm7 = ..R7 ..R6 ..R5 ..R4 (total)
|
|
movq mm5, mm1 ; mm5 = B7 B6 B5 B4 B3 B2 B1 B0
|
|
packuswb mm2, mm7 ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
|
|
; ; -------- red done --------
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; shuffle up the results:
|
|
; red = mm2
|
|
; green = mm6
|
|
; blue = mm1
|
|
; into red-green-blue order (B in the lowest byte of each pel) and store
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
punpcklbw mm5, mm6 ; mm5: G3 B3 G2 B2 G1 B1 G0 B0
|
|
movq mm4, mm2 ; mm4 = R7 R6 R5 R4 R3 R2 R1 R0
|
|
punpcklbw mm4, mm4 ; mm4: R3 R3 R2 R2 R1 R1 R0 R0
|
|
movq mm3, mm5 ; mm3 = G3 B3 G2 B2 G1 B1 G0 B0
|
|
punpcklwd mm5, mm4 ; mm5: R1 R1 G1 B1 R0 R0 G0 B0
|
|
;
|
|
movq mm0, mm5 ; mm0 = R1 R1 G1 B1 R0 R0 G0 B0
|
|
;
|
|
pand mm5, lowrgb ; mm5: 0 0 0 0 0 R0 G0 B0
|
|
;
|
|
pand mm0, lowrgbn ; mm0: R1 R1 G1 B1 R0 0 0 0
|
|
;
|
|
psrlq mm0, 8 ; mm0: 0 R1 R1 G1 B1 R0 0 0
|
|
;
|
|
por mm0, mm5 ; mm0: x x R1 G1 B1 R0 G0 B0
|
|
;
|
|
pand mm0, highwn ; mm0: 0 0 R1 G1 B1 R0 G0 B0
|
|
movq mm5, mm3 ; mm5 = G3 B3 G2 B2 G1 B1 G0 B0
|
|
punpckhwd mm5, mm4 ; mm5: R3 R3 G3 B3 R2 R2 G2 B2
|
|
;
|
|
movq mm4, mm5 ; mm4 = R3 R3 G3 B3 R2 R2 G2 B2
|
|
;
|
|
psllq mm4, 48 ; mm4: G2 B2 0 0 0 0 0 0
|
|
;
|
|
por mm0, mm4 ; mm0: G2 B2 R1 G1 B1 R0 G0 B0
|
|
psrlq mm5, 24 ; mm5: 0 0 0 R3 R3 G3 B3 R2
|
|
|
|
punpckhbw mm1, mm6 ; mm1: G7 B7 G6 B6 G5 B5 G4 B4
|
|
;
|
|
punpckhbw mm2, mm2 ; mm2: R7 R7 R6 R6 R5 R5 R4 R4
|
|
;
|
|
; even line, output bytes 0-7
movq [edi], mm0 ; !! aligned
|
|
movq mm7, mm1 ; mm7: G7 B7 G6 B6 G5 B5 G4 B4
|
|
punpcklwd mm1, mm2 ; mm1: R5 R5 G5 B5 R4 R4 G4 B4
|
|
;
|
|
movq mm6, mm1 ; mm6: R5 R5 G5 B5 R4 R4 G4 B4
|
|
punpckldq mm5, mm1 ; mm5: R4 R4 G4 B4 R3 G3 B3 R2
|
|
pand mm5, highpn ; mm5: 0 R4 G4 B4 R3 G3 B3 R2
|
|
psllq mm6, 24 ; mm6: B5 R4 R4 G4 B4 0 0 0
|
|
pand mm6, highp ; mm6: B5 0 0 0 0 0 0 0
|
|
psrlq mm1, 40 ; mm1: 0 0 0 0 0 R5 R5 G5
|
|
mov ebp, [YCursorOdd] ; moved to here to save cycles before odd line
|
|
por mm5, mm6 ; mm5: B5 R4 G4 B4 R3 G3 B3 R2
|
|
punpckhwd mm7, mm2 ; mm7: R7 R7 G7 B7 R6 R6 G6 B6
|
|
;
|
|
punpcklwd mm1, mm7 ; mm1: x x x x G6 B6 R5 G5
|
|
;
|
|
; even line, output bytes 8-15
movq [edi+8], mm5 ; !! aligned
|
|
;
|
|
; even line, output bytes 16-19
movdf [edi+16], mm1 ; !!!! aligned
|
|
;
|
|
;
|
|
; start odd line
|
|
;
|
|
movq mm1, [ebp+2*ebx] ; mm1 has 8 y pixels
|
|
psrlq mm7, 24 ; (even line) mm7: 0 0 0 R7 R7 G7 B7 R6 - placed here to save cycles
|
|
; even line, output bytes 20-23
movdf [edi+20], mm7 ; !!!! aligned
|
|
;
|
|
psubusb mm1, Yadd ; mm1 has 8 pixels y-16
|
|
;
|
|
movq mm5, mm1
|
|
;
|
|
punpcklbw mm1, mzero ; get 4 low y-16 unsign pixels word
|
|
;
|
|
punpckhbw mm5, mzero ; 4 high y-16
|
|
;
|
|
pmullw mm1, Ymul ; low 4 luminance contribution
|
|
;
|
|
pmullw mm5, Ymul ; high 4 luminance contribution
|
|
; The chroma contributions stashed in temp_mmx by the even line are reused
; here, so only the luma term is recomputed for the odd line.
movq mm0, mm1
|
|
paddw mm0, [temp_mmx+24] ; low 4 R
|
|
movq mm6, mm5
|
|
paddw mm5, [temp_mmx+32] ; high 4 R
|
|
psraw mm0, 6
|
|
psraw mm5, 6
|
|
;
|
|
movq mm2, mm1
|
|
packuswb mm0, mm5 ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
|
|
; -------- red done --------
|
|
paddw mm2, [temp_mmx+16] ; low 4 B
|
|
movq mm5, mm6
|
|
paddw mm5, [temp_mmx+40] ; high 4 B
|
|
psraw mm2, 6
|
|
psraw mm5, 6
|
|
;
|
|
packuswb mm2, mm5 ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
|
|
; ; -------- blue done --------
|
|
|
|
movq mm3, [temp_mmx+8] ; mm3 = .G76 .G54 .G32 .G10 (from chroma)
|
|
;
|
|
movq mm4, mm3
|
|
punpcklwd mm3, mm3 ; replicate low 2
|
|
punpckhwd mm4, mm4 ; replicate high 2
|
|
psubw mm1, mm3 ; 4 low G
|
|
psubw mm6, mm4 ; 4 high G values in signed 16 bit
|
|
psraw mm1, 6 ; low G
|
|
psraw mm6, 6 ; high G
|
|
;
|
|
packuswb mm1, mm6 ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
|
|
; ; -------- green done --------
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; shuffle up the results:
|
|
; red = mm0
|
|
; green = mm1
|
|
; blue = mm2
|
|
; into red-green-blue order (B in the lowest byte of each pel) and store
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
movq mm3, mm2 ; B
|
|
;
|
|
punpcklbw mm3, mm1 ; mm3: G3 B3 G2 B2 G1 B1 G0 B0
|
|
movq mm4, mm0 ; R
|
|
punpcklbw mm4, mm4 ; mm4: R3 R3 R2 R2 R1 R1 R0 R0
|
|
movq mm5, mm3 ; BG
|
|
; eax = offset from the even output line to the odd one (0 when the odd
; line is being dropped onto the even line)
mov eax, [tmpCCOPitch]
|
|
punpcklwd mm3, mm4 ; mm3: R1 R1 G1 B1 R0 R0 G0 B0
|
|
movq mm6, mm3 ; save mm3
|
|
;
|
|
pand mm6, lowrgb ; mm6: 0 0 0 0 0 R0 G0 B0
|
|
;
|
|
pand mm3, lowrgbn ; mm3: R1 R1 G1 B1 R0 0 0 0
|
|
;
|
|
psrlq mm3, 8 ; mm3: 0 R1 R1 G1 B1 R0 0 0
|
|
;
|
|
por mm3, mm6 ; mm3: x x R1 G1 B1 R0 G0 B0
|
|
;
|
|
pand mm3, highwn ; mm3: 0 0 R1 G1 B1 R0 G0 B0
|
|
movq mm6, mm5 ; BG
|
|
punpckhwd mm6, mm4 ; mm6: R3 R3 G3 B3 R2 R2 G2 B2
|
|
;
|
|
movq mm4, mm6
|
|
;
|
|
psllq mm4, 48 ; mm4: G2 B2 0 0 0 0 0 0
|
|
;
|
|
por mm3, mm4 ; mm3: G2 B2 R1 G1 B1 R0 G0 B0
|
|
;
|
|
; odd line, output bytes 0-7
movq [edi+eax], mm3
|
|
psrlq mm6, 24 ; mm6: 0 0 0 R3 R3 G3 B3 R2
|
|
punpckhbw mm2, mm1 ; mm2: G7 B7 G6 B6 G5 B5 G4 B4
|
|
punpckhbw mm0, mm0 ; mm0: R7 R7 R6 R6 R5 R5 R4 R4
|
|
movq mm7, mm2 ; mm7: G7 B7 G6 B6 G5 B5 G4 B4
|
|
punpcklwd mm7, mm0 ; mm7: x R5 G5 B5 x R4 G4 B4
|
|
;
|
|
punpckldq mm6, mm7 ; mm6: R4 R4 G4 B4 R3 G3 B3 R2
|
|
movq mm4, mm7
|
|
psllq mm4, 24 ; mm4: B5 R4 R4 G4 B4 0 0 0
|
|
;
|
|
pand mm6, highpn ; mm6: 0 R4 G4 B4 R3 G3 B3 R2
|
|
psrlq mm7, 40 ; mm7: 0 0 0 0 0 R5 R5 G5
|
|
pand mm4, highp ; mm4: B5 0 0 0 0 0 0 0
|
|
punpckhwd mm2, mm0 ; mm2: R7 R7 G7 B7 R6 R6 G6 B6
|
|
por mm6, mm4 ; mm6: B5 R4 G4 B4 R3 G3 B3 R2
|
|
punpcklwd mm7, mm2 ; mm7 x x x x G6 B6 R5 G5
|
|
; mm2: 0 0 0 R7 R7 G7 B7 R6
psrlq mm2, 24
|
|
;
|
|
; mm7: R7 G7 B7 R6 G6 B6 R5 G5
punpckldq mm7, mm2
|
|
;
|
|
; odd line, output bytes 8-15
movq [edi+eax+8], mm6 ; aligned
|
|
;
|
|
; odd line, output bytes 16-23
movq [edi+eax+16], mm7
|
|
add edi, 24 ; advance output by 24 bytes (8 pels * 3 bytes)
|
|
add ebx, 4 ; advance chroma index by 4 (= 8 luma pels)
|
|
jl do_next_8x2_block ; loop while the index is still negative
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; Update:
|
|
; edi: output RGB plane pointer for odd and even line
|
|
; ebp: Y Plane address
|
|
; esi: V Plane address
|
|
; edx: U Plane address
|
|
; YcursorEven: Even Y line address
|
|
; YCursorOdd: Odd Y line address
|
|
; Note: eax, ebx, ecx can be used as scratch registers
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
mov ecx, [CCOSkipDistance]
|
|
mov eax, [localYPitch]
|
|
add edi, ecx ; go to begin of next even line
|
|
mov ecx, [tmpCCOPitch]
|
|
add edi, ecx ; skip odd line
|
|
mov ecx, [localChromaPitch]
|
|
add esi, ecx
|
|
; ebp still holds the odd-line cursor loaded inside the loop
add ebp, eax ; odd line + pitch = next even line
|
|
mov [YCursorEven], ebp ; save even line address
|
|
mov ecx, [localChromaPitch]
|
|
add edx, ecx
|
|
add ebp, eax ; odd line address
|
|
mov [YCursorOdd], ebp ; save odd line address
|
|
mov eax, [YLimit] ; Done with last line?
|
|
; unsigned compare of the next odd-line cursor against the end of the Y array
cmp ebp, eax
|
|
jbe PrepareChromaLine
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; end do 2 lines loop
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
; Epilogue: restore the stack pointer saved before alignment, then the
; registers pushed in the prologue (in reverse order).
finish:
|
|
mov esp, [StashESP]
|
|
;
|
|
pop ebx
|
|
pop ebp
|
|
pop edi
|
|
pop esi
|
|
ret
|
|
|
|
MMX_YUV12ToRGB24 ENDP
|
|
|
|
MMXCODE1 ENDS
|
|
|
|
END
|