; MMX_YUV12ToYUY2 -- MMX routine that interleaves planar Y, U and V bytes into packed YUY2 (Y0 U0 Y1 V0) output.
; ---------------------------------------------------------------------------
; Module preamble (MASM, 32-bit flat code).
;
; PROC prologue generation is switched off, so the procedure in this module
; saves registers and builds its own stack frame by hand.  The epilogue is
; routed through a macro named ReturnAndRelieveEpilogueMacro, which is not
; defined in this file (presumably supplied by memmodel.inc -- confirm).
; ---------------------------------------------------------------------------
OPTION PROLOGUE: None
OPTION EPILOGUE: ReturnAndRelieveEpilogueMacro

; Keep the contents of the include files out of the assembly listing.
.xlist
include iammx.inc               ; presumably the MMX instruction definitions -- confirm
include memmodel.inc            ; presumably defines DIST / LANG used on the PROC line -- confirm
.list

; Declare the segments up front so that their attributes are fixed once;
; later "MMXCODE1 SEGMENT" lines simply re-open the code segment.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

MMXCODE1 SEGMENT

;-----------------------------------------------------------------------------
; MMX_YUV12ToYUY2
;
; Purpose:
;   Interleaves three byte planes (Y, U, V) into packed YUY2 output, i.e.
;   the byte sequence Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...  One U row and one V row
;   are shared by two consecutive Y rows; 8 U and 8 V bytes are consumed
;   for every 16 Y bytes.
;
; Arguments (13 DWORDs on the stack, removed by the callee with "ret 52"):
;   uYPlane          address of the first Y row
;   uVPlane          address of the first V row
;   uUPlane          address of the first U row
;   uWidth           number of Y bytes per row.  Groups of 16 are handled by
;                    the MMX loop, the remainder (4, 8 or 12) by scalar code.
;                    A width that is not a multiple of 4 is NOT handled
;                    (the original comment states dimensions are multiples
;                    of 4x4 -- confirm with callers).
;   uHeight          number of Y rows; uHeight/2 row pairs are processed
;   uYPitch          byte distance between Y rows
;   uUVPitch         byte distance between U rows and between V rows
;   bShapingFlag     row-dropping period N: a countdown is seeded with N-1
;                    and decremented once per input row; when it reaches 0
;                    that input row is skipped (no output row is written
;                    and the output pointer does not advance) and the
;                    countdown is reseeded.  0 effectively disables dropping.
;   uCCOutputBuffer, lOutput, uOffsetToLine0
;                    summed to form the address of the first output row
;   intPitch         byte distance between output rows
;   CCType           not referenced by this routine
;
; Registers:
;   ebx, esi, edi, ebp are saved and restored.
;   eax, ecx, edx, mm0-mm4 and the flags are modified.
;   No meaningful value is returned in eax.
;
; Changes relative to the original implementation:
;   * returns immediately when uHeight/2 <= 0 or uWidth <= 0 instead of
;     running with a wrapped counter or writing outside the frame;
;   * executes EMMS before returning so that x87 code may run afterwards;
;   * the U-plane equate is spelled _uUPlane to match its use, so the file
;     also assembles with case-sensitive symbols.
;-----------------------------------------------------------------------------
MMX_YUV12ToYUY2 proc DIST LANG PUBLIC,
AuYPlane: DWORD,
AuVPlane: DWORD,
AuUPlane: DWORD,
AuWidth: DWORD,
AuHeight: DWORD,
AuYPitch: DWORD,
AUVPitch: DWORD,
AbShapingFlag: DWORD,
AuCCOutputBuffer: DWORD,
AlOutput: DWORD,
AuOffsetToLine0: DWORD,
AintPitch: DWORD,
ACCType: DWORD

LocalFrameSize          = 52            ; 13 DWORD local slots
RegisterStorageSize     = 16            ; 4 registers pushed

; Argument offsets from esp once the registers are pushed and the local
; frame is allocated.  The extra +4 skips the return address.

uYPlane                 = LocalFrameSize + RegisterStorageSize +  4
uVPlane                 = LocalFrameSize + RegisterStorageSize +  8
uUPlane                 = LocalFrameSize + RegisterStorageSize + 12
uWidth                  = LocalFrameSize + RegisterStorageSize + 16
uHeight                 = LocalFrameSize + RegisterStorageSize + 20
uYPitch                 = LocalFrameSize + RegisterStorageSize + 24
uUVPitch                = LocalFrameSize + RegisterStorageSize + 28
bShapingFlag            = LocalFrameSize + RegisterStorageSize + 32
uCCOutputBuffer         = LocalFrameSize + RegisterStorageSize + 36
lOutput                 = LocalFrameSize + RegisterStorageSize + 40
uOffsetToLine0          = LocalFrameSize + RegisterStorageSize + 44
intPitch                = LocalFrameSize + RegisterStorageSize + 48
CCType                  = LocalFrameSize + RegisterStorageSize + 52

; Local offsets from esp (after register pushes and frame allocation).
; Slots marked "unused" are reserved by the frame but never referenced.

ASMTMP1                 = 48            ; 13  unused
Y                       = 44            ; 12  current Y row
U                       = 40            ; 11  current U row
V                       = 36            ; 10  current V row
Outt                    = 32            ;  9  current output row
YTemp                   = 28            ;  8  unused
UTemp                   = 24            ;  7  unused
VTemp                   = 20            ;  6  unused
ASMTMP2                 = 16            ;  5  row-pair countdown
Col                     = 12            ;  4  unused
OutTemp                 =  8            ;  3  unused
VAL                     =  4            ;  2  unused
LineCount               =  0            ;  1  row-dropping countdown

; Arguments relative to esp

_uYPlane                EQU     [esp + uYPlane]
_uVPlane                EQU     [esp + uVPlane]
_uUPlane                EQU     [esp + uUPlane]
_uWidth                 EQU     [esp + uWidth ]
_uHeight                EQU     [esp + uHeight]
_uYPitch                EQU     [esp + uYPitch]
_uUVPitch               EQU     [esp + uUVPitch]
_bShapingFlag           EQU     [esp + bShapingFlag]
_uCCOutputBuffer        EQU     [esp + uCCOutputBuffer]
_lOutput                EQU     [esp + lOutput]
_uOffsetToLine0         EQU     [esp + uOffsetToLine0]
_intPitch               EQU     [esp + intPitch]
_uCCType                EQU     [esp + CCType]

; Locals relative to esp

_ASMTMP1                EQU     [esp + ASMTMP1]
_Y                      EQU     [esp + Y]
_U                      EQU     [esp + U]
_V                      EQU     [esp + V]
_Out                    EQU     [esp + Outt]
_YTemp                  EQU     [esp + YTemp]
_UTemp                  EQU     [esp + UTemp]
_VTemp                  EQU     [esp + VTemp]
_ASMTMP2                EQU     [esp + ASMTMP2]
_Col                    EQU     [esp + Col]
_OutTemp                EQU     [esp + OutTemp]
_VAL                    EQU     [esp + VAL]
_LineCount              EQU     [esp + LineCount]

; Save registers and start working

        push    ebx
        push    esi
        push    edi
        push    ebp

        sub     esp, LocalFrameSize

        mov     eax, DWORD PTR _bShapingFlag    ; eax = bShapingFlag
        mov     ecx, DWORD PTR _uYPlane         ; ecx = uYPlane
        dec     eax                             ; eax = bShapingFlag - 1
        mov     edx, DWORD PTR _uUPlane         ; edx = uUPlane
        mov     DWORD PTR _LineCount, eax       ; LineCount = bShapingFlag - 1
        mov     DWORD PTR _Y, ecx               ; Y = uYPlane

        mov     eax, DWORD PTR _uVPlane         ; eax = uVPlane
        mov     ecx, DWORD PTR _uOffsetToLine0  ; ecx = uOffsetToLine0
        mov     DWORD PTR _U, edx               ; U = uUPlane
        add     ecx, DWORD PTR _lOutput         ; ecx = uOffsetToLine0 + lOutput

        mov     DWORD PTR _V, eax               ; V = uVPlane
        mov     eax, DWORD PTR _uCCOutputBuffer ; eax = uCCOutputBuffer
        add     eax, ecx                        ; eax = uCCOutputBuffer +
                                                ;       uOffsetToLine0 +
                                                ;       lOutput
        mov     DWORD PTR _Out, eax             ; Out = first output row
        mov     eax, DWORD PTR _uHeight         ; eax = uHeight

        sar     eax, 1                          ; eax = uHeight / 2 (row pairs)
        jle     CleanUp                         ; no complete row pair: nothing to
                                                ; do (a 0 counter would wrap below)
        mov     DWORD PTR _ASMTMP2, eax         ; row-pair counter, counts down

        cmp     DWORD PTR _uWidth, 0
        jle     CleanUp                         ; empty rows: the tail code would
                                                ; otherwise still write 24 bytes

RowLoop:                                ; L27704 outer loop, one pass per row pair

        mov     ecx, DWORD PTR _Y               ; ecx = Y row pointer
        mov     edi, DWORD PTR _U               ; edi = U row pointer
        mov     ebp, DWORD PTR _V               ; ebp = V row pointer
        mov     esi, DWORD PTR _Out             ; esi = output row pointer
        mov     eax, DWORD PTR _LineCount       ; eax = LineCount
        test    eax, eax                        ; is LineCount == 0?
        je      SkipEvenRow                     ; L27708 if so, drop the even row
        mov     eax, DWORD PTR _uWidth          ; eax = Y bytes left in this row

        ; Due to the fact that YUV12 non-compressed input files can be
        ; any dimension that is a multiple of 4x4 up to CIF, we must
        ; check for extra bytes that can't be processed in the following
        ; loop.  Here, we don't have the luxury of buffer padding to overrun
        ; the frame size.

        test    eax, 0FFFFFFF0H                 ; fewer than 16 bytes in the row?
        jz      L100                            ; then only the scalar tail runs

EvenRowPels:                            ; L27709 even row: 16 Y bytes -> 32 output bytes per pass

        movq    mm0, [ecx]              ; [ Y07 Y06 Y05 Y04 Y03 Y02 Y01 Y00 ]
        movq    mm1, [edi]              ; [ U07 U06 U05 U04 U03 U02 U01 U00 ]
        movq    mm2, [ebp]              ; [ V07 V06 V05 V04 V03 V02 V01 V00 ]
        movq    mm3, mm1
        punpcklbw mm3, mm2              ; [ V03 U03 V02 U02 V01 U01 V00 U00 ]

        movq    mm4, mm0
        punpcklbw mm4, mm3              ; [ V01 Y03 U01 Y02 V00 Y01 U00 Y00 ]
        movq    [esi], mm4              ; Write out 8 data values.
        psrlq   mm3, 32                 ; [   0   0   0   0 V03 U03 V02 U02 ]
        psrlq   mm0, 32                 ; [   0   0   0   0 Y07 Y06 Y05 Y04 ]
        punpcklbw mm0, mm3              ; [ V03 Y07 U03 Y06 V02 Y05 U02 Y04 ]
        movq    [esi+8], mm0            ; Write out 8 data values.
        movq    mm0, [ecx+8]            ; [ Y15 Y14 Y13 Y12 Y11 Y10 Y09 Y08 ]
        psrlq   mm1, 32                 ; [   0   0   0   0 U07 U06 U05 U04 ]
        psrlq   mm2, 32                 ; [   0   0   0   0 V07 V06 V05 V04 ]
        movq    mm3, mm1
        punpcklbw mm3, mm2              ; [ V07 U07 V06 U06 V05 U05 V04 U04 ]
        movq    mm4, mm0
        punpcklbw mm4, mm3              ; [ V05 Y11 U05 Y10 V04 Y09 U04 Y08 ]
        movq    [esi+16], mm4           ; Write out 8 data values.
        psrlq   mm3, 32                 ; [   0   0   0   0 V07 U07 V06 U06 ]
        psrlq   mm0, 32                 ; [   0   0   0   0 Y15 Y14 Y13 Y12 ]
        punpcklbw mm0, mm3              ; [ V07 Y15 U07 Y14 V06 Y13 U06 Y12 ]
        movq    [esi+24], mm0           ; Write out 8 data values.
        lea     ecx, [ecx+16]           ; Advance Y pointer.
        lea     edi, [edi+8]            ; Advance U pointer.
        lea     ebp, [ebp+8]            ; Advance V pointer.
        lea     esi, [esi+32]           ; Advance Out pointer.
        sub     eax, 16
        test    eax, 0FFFFFFF0H         ; at least 16 bytes still to do?
        jnz     EvenRowPels

        test    eax, eax                ; row finished exactly?
        jz      L101

        ; Scalar tail: eax can be 4, 8 or 12.  Each step below converts
        ; 4 Y bytes plus 2 U and 2 V bytes into 8 output bytes.  The
        ; pointers are not advanced because nothing follows in this row.
L100:
        mov     ebx, [ecx]              ; [ Y03 Y02 Y01 Y00 ]
        mov     dl, [edi]               ; [ U00 ]
        mov     dh, [ebp]               ; [ V00 ]
        mov     [esi], bl
        mov     [esi+1], dl
        mov     [esi+2], bh
        mov     [esi+3], dh
        shr     ebx, 16                 ; bl = Y02, bh = Y03
        mov     dl, [edi+1]             ; [ U01 ]
        mov     dh, [ebp+1]             ; [ V01 ]
        mov     [esi+4], bl
        mov     [esi+5], dl
        mov     [esi+6], bh
        mov     [esi+7], dh
        sub     eax, 4
        jz      L101

        mov     ebx, [ecx+4]            ; [ Y07 Y06 Y05 Y04 ]
        mov     dl, [edi+2]             ; [ U02 ]
        mov     dh, [ebp+2]             ; [ V02 ]
        mov     [esi+8], bl
        mov     [esi+9], dl
        mov     [esi+10], bh
        mov     [esi+11], dh
        shr     ebx, 16                 ; bl = Y06, bh = Y07
        mov     dl, [edi+3]             ; [ U03 ]
        mov     dh, [ebp+3]             ; [ V03 ]
        mov     [esi+12], bl
        mov     [esi+13], dl
        mov     [esi+14], bh
        mov     [esi+15], dh
        sub     eax, 4
        jz      L101

        mov     ebx, [ecx+8]            ; [ Y11 Y10 Y09 Y08 ]
        mov     dl, [edi+4]             ; [ U04 ]
        mov     dh, [ebp+4]             ; [ V04 ]
        mov     [esi+16], bl
        mov     [esi+17], dl
        mov     [esi+18], bh
        mov     [esi+19], dh
        shr     ebx, 16                 ; bl = Y10, bh = Y11
        mov     dl, [edi+5]             ; [ U05 ]
        mov     dh, [ebp+5]             ; [ V05 ]
        mov     [esi+20], bl
        mov     [esi+21], dl
        mov     [esi+22], bh
        mov     [esi+23], dh

L101:
        mov     eax, DWORD PTR _LineCount       ; eax = LineCount
        jmp     SHORT UpdatePointers            ; L27770

SkipEvenRow:                            ; L27708 even row is dropped

        ; Pre-subtract intPitch so that the unconditional "Out += intPitch"
        ; in UpdatePointers leaves Out unchanged for a dropped row.
        mov     eax, DWORD PTR _bShapingFlag    ; eax = bShapingFlag (reseed)
        mov     edx, DWORD PTR _Out             ; edx = Out
        mov     ebx, DWORD PTR _intPitch        ; ebx = intPitch
        sub     edx, ebx                        ; edx = Out - intPitch
        mov     DWORD PTR _Out, edx             ; save Out

UpdatePointers:                         ; L27770 step from the even to the odd row

        mov     ecx, DWORD PTR _Y               ; ecx = Y
        dec     eax                             ; eax = LineCount-1 OR bShapingFlag-1
        mov     edx, DWORD PTR _intPitch        ; edx = intPitch
        mov     esi, DWORD PTR _Out             ; esi = Out
        mov     DWORD PTR _LineCount, eax       ; store decremented LineCount
        add     esi, edx                        ; (esi) Out += intPitch
        mov     eax, DWORD PTR _uYPitch         ; eax = uYPitch
        mov     edi, DWORD PTR _U               ; edi = U (same chroma row again)
        add     ecx, eax                        ; (ecx) Y += uYPitch
        mov     ebp, DWORD PTR _V               ; ebp = V (same chroma row again)
        mov     DWORD PTR _Y, ecx               ; store updated Y

        mov     DWORD PTR _Out, esi             ; store Out
        mov     eax, DWORD PTR _LineCount       ; eax = LineCount

        test    eax, eax                        ; is LineCount == 0?
        je      SkipOddRow                      ; L27714 if so, drop the odd row

        mov     eax, DWORD PTR _uWidth          ; eax = Y bytes left in this row

        ; Same remainder handling as for the even row: rows narrower than
        ; 16 bytes go straight to the scalar tail.

        test    eax, 0FFFFFFF0H
        jz      L102

OddRowPels:                             ; L27715 odd row: 16 Y bytes -> 32 output bytes per pass

        movq    mm0, [ecx]              ; [ Y07 Y06 Y05 Y04 Y03 Y02 Y01 Y00 ]
        movq    mm1, [edi]              ; [ U07 U06 U05 U04 U03 U02 U01 U00 ]
        movq    mm2, [ebp]              ; [ V07 V06 V05 V04 V03 V02 V01 V00 ]
        movq    mm3, mm1
        punpcklbw mm3, mm2              ; [ V03 U03 V02 U02 V01 U01 V00 U00 ]
        movq    mm4, mm0
        punpcklbw mm4, mm3              ; [ V01 Y03 U01 Y02 V00 Y01 U00 Y00 ]
        movq    [esi], mm4              ; Write out 8 data values.
        psrlq   mm3, 32                 ; [   0   0   0   0 V03 U03 V02 U02 ]
        psrlq   mm0, 32                 ; [   0   0   0   0 Y07 Y06 Y05 Y04 ]
        punpcklbw mm0, mm3              ; [ V03 Y07 U03 Y06 V02 Y05 U02 Y04 ]
        movq    [esi+8], mm0            ; Write out 8 data values.
        movq    mm0, [ecx+8]            ; [ Y15 Y14 Y13 Y12 Y11 Y10 Y09 Y08 ]
        psrlq   mm1, 32                 ; [   0   0   0   0 U07 U06 U05 U04 ]
        psrlq   mm2, 32                 ; [   0   0   0   0 V07 V06 V05 V04 ]
        movq    mm3, mm1
        punpcklbw mm3, mm2              ; [ V07 U07 V06 U06 V05 U05 V04 U04 ]
        movq    mm4, mm0
        punpcklbw mm4, mm3              ; [ V05 Y11 U05 Y10 V04 Y09 U04 Y08 ]
        movq    [esi+16], mm4           ; Write out 8 data values.
        psrlq   mm3, 32                 ; [   0   0   0   0 V07 U07 V06 U06 ]
        psrlq   mm0, 32                 ; [   0   0   0   0 Y15 Y14 Y13 Y12 ]
        punpcklbw mm0, mm3              ; [ V07 Y15 U07 Y14 V06 Y13 U06 Y12 ]
        movq    [esi+24], mm0           ; Write out 8 data values.
        lea     ecx, [ecx+16]           ; Advance Y pointer.
        lea     edi, [edi+8]            ; Advance U pointer.
        lea     ebp, [ebp+8]            ; Advance V pointer.
        lea     esi, [esi+32]           ; Advance Out pointer.
        sub     eax, 16
        test    eax, 0FFFFFFF0H         ; at least 16 bytes still to do?
        jnz     OddRowPels

        test    eax, eax                ; row finished exactly?
        jz      L103

        ; Scalar tail: eax can be 4, 8 or 12 (see the even-row tail).
L102:
        mov     ebx, [ecx]              ; [ Y03 Y02 Y01 Y00 ]
        mov     dl, [edi]               ; [ U00 ]
        mov     dh, [ebp]               ; [ V00 ]
        mov     [esi], bl
        mov     [esi+1], dl
        mov     [esi+2], bh
        mov     [esi+3], dh
        shr     ebx, 16                 ; bl = Y02, bh = Y03
        mov     dl, [edi+1]             ; [ U01 ]
        mov     dh, [ebp+1]             ; [ V01 ]
        mov     [esi+4], bl
        mov     [esi+5], dl
        mov     [esi+6], bh
        mov     [esi+7], dh
        sub     eax, 4
        jz      L103

        mov     ebx, [ecx+4]            ; [ Y07 Y06 Y05 Y04 ]
        mov     dl, [edi+2]             ; [ U02 ]
        mov     dh, [ebp+2]             ; [ V02 ]
        mov     [esi+8], bl
        mov     [esi+9], dl
        mov     [esi+10], bh
        mov     [esi+11], dh
        shr     ebx, 16                 ; bl = Y06, bh = Y07
        mov     dl, [edi+3]             ; [ U03 ]
        mov     dh, [ebp+3]             ; [ V03 ]
        mov     [esi+12], bl
        mov     [esi+13], dl
        mov     [esi+14], bh
        mov     [esi+15], dh
        sub     eax, 4
        jz      L103

        mov     ebx, [ecx+8]            ; [ Y11 Y10 Y09 Y08 ]
        mov     dl, [edi+4]             ; [ U04 ]
        mov     dh, [ebp+4]             ; [ V04 ]
        mov     [esi+16], bl
        mov     [esi+17], dl
        mov     [esi+18], bh
        mov     [esi+19], dh
        shr     ebx, 16                 ; bl = Y10, bh = Y11
        mov     dl, [edi+5]             ; [ U05 ]
        mov     dh, [ebp+5]             ; [ V05 ]
        mov     [esi+20], bl
        mov     [esi+21], dl
        mov     [esi+22], bh
        mov     [esi+23], dh

L103:
        mov     eax, DWORD PTR _LineCount       ; eax = LineCount
        jmp     SHORT UpdateAllPointers         ; L27771

SkipOddRow:                             ; L27714 odd row is dropped

        ; Pre-subtract intPitch so that the unconditional "Out += intPitch"
        ; in UpdateAllPointers leaves Out unchanged for a dropped row.
        mov     eax, DWORD PTR _bShapingFlag    ; eax = bShapingFlag (reseed)
        mov     edx, DWORD PTR _Out             ; edx = Out
        mov     ebx, DWORD PTR _intPitch        ; ebx = intPitch
        sub     edx, ebx                        ; edx = Out - intPitch
        mov     DWORD PTR _Out, edx             ; save Out

UpdateAllPointers:                      ; L27771 step to the next row pair

        dec     eax                             ; eax = LineCount-1 OR bShapingFlag-1
        mov     ecx, DWORD PTR _Y               ; ecx = Y
        mov     edx, DWORD PTR _intPitch        ; edx = intPitch
        mov     ebx, DWORD PTR _Out             ; ebx = Out
        add     ebx, edx                        ; ebx = Out + intPitch
        mov     ebp, DWORD PTR _ASMTMP2         ; ebp = row-pair counter
        mov     DWORD PTR _LineCount, eax       ; store updated LineCount
        mov     DWORD PTR _Out, ebx             ; store updated Out
        mov     edx, DWORD PTR _uUVPitch        ; edx = uUVPitch
        mov     eax, DWORD PTR _U               ; eax = U
        mov     esi, DWORD PTR _V               ; esi = V
        add     eax, edx                        ; eax = U + uUVPitch
        add     esi, edx                        ; esi = V + uUVPitch
        mov     DWORD PTR _U, eax               ; store updated U
        mov     DWORD PTR _V, esi               ; store updated V
        add     ecx, DWORD PTR _uYPitch         ; ecx = Y + uYPitch
        dec     ebp                             ; decrement row-pair counter
        mov     DWORD PTR _Y, ecx               ; store updated Y
        mov     DWORD PTR _ASMTMP2, ebp         ; store updated counter

        ; The two stores above do not modify the flags, so this branch
        ; still tests the result of "dec ebp".
        jne     RowLoop                         ; back to L27704 row loop

CleanUp:

        ; Leave the MMX state empty.  The original never executed EMMS;
        ; issuing it here makes it safe for x87 floating-point code to run
        ; after this routine returns.
        emms

        add     esp, LocalFrameSize             ; restore esp to saved registers

        pop     ebp
        pop     edi
        pop     esi
        pop     ebx

        ret     52                              ; 13*4 bytes of arguments

MMX_YUV12ToYUY2 ENDP

MMXCODE1 ENDS

END