|
|
;************************************************************************* ;** INTEL Corporation Proprietary Information ;** ;** This listing is supplied under the terms of a license ;** agreement with INTEL Corporation and may not be copied ;** nor disclosed except in accordance with the terms of ;** that agreement. ;** ;** Copyright (c) 1995 Intel Corporation. ;** All Rights Reserved. ;** ;************************************************************************* ;// ;// $Header: S:\h26x\src\dec\cx512yuv.asv 1.5 30 Dec 1996 20:02:08 MDUDA $ ;// ;// $Log: S:\h26x\src\dec\cx512yuv.asv $ ;// ;// Rev 1.5 30 Dec 1996 20:02:08 MDUDA ;// Fixed problem where buffer boundaries were being over-written. ;// ;// Rev 1.4 11 Dec 1996 14:58:52 JMCVEIGH ;// ;// Changed to support width the are multiples of 4. ;// ;// Rev 1.3 18 Jul 1996 12:52:58 KLILLEVO ;// changed cache heating to speed things up a bit ;// ;// Rev 1.2 18 Jul 1996 09:39:34 KLILLEVO ;// ;// added PVCS header and log
;; Very straightforward implementation of the YUV pitch changer.
;; Luma is copied 16 pels at a time and chroma 8 pels at a time. If the
;; row width is not a multiple of that step, the remainder pels are
;; handled as a special case. We assume that the width is a multiple of 4.
; Assembler setup: no automatic prologue; RET expansion is delegated to the
; epilogue macro named below (not defined in this file).
OPTION PROLOGUE: None
OPTION EPILOGUE: ReturnAndRelieveEpilogueMacro

; Pull in the memory-model definitions with listing output suppressed.
.xlist
include memmodel.inc
.list

.DATA
; any data would go here

.CODE

; All segment registers are assumed to address the flat segment.
ASSUME cs: FLAT, ds: FLAT, es: FLAT, fs: FLAT, gs: FLAT, ss: FLAT

PUBLIC YUV12ToYUV
;--------------------------------------------------------------------------
; YUV12ToYUV
;
; Copies a planar Y/U/V frame whose source rows are uYPitch / uUVPitch bytes
; apart into a packed destination: uHeight rows of uWidth luma bytes written
; back to back, followed by uHeight/2 rows of uWidth/2 U bytes, followed by
; uHeight/2 rows of uWidth/2 V bytes. The destination pointer only ever
; advances by the number of bytes copied, so the output has no row padding.
;
; Destination start = uCCOutputBuffer + uOffsetToLine0 + lOutput.
;
; Stack arguments (13 DWORDs, removed by "ret 52"):
;   uYPlane, uVPlane, uUPlane   source plane pointers (note V precedes U)
;   uWidth, uHeight             luma dimensions in pels
;   uYPitch, uUVPitch           source row pitches in bytes
;   bShapingFlag                not referenced by this routine
;   uCCOutputBuffer, lOutput, uOffsetToLine0
;                               summed to form the destination pointer
;   intPitch, CCType            not referenced by this routine
;
; Width handling: luma copies 16 bytes per iteration, then an optional 8 and
; an optional 4 (bits 3 and 2 of the remaining count). Chroma copies 8 bytes
; per iteration, then an optional 4 and an optional 2. Any remainder in the
; lower bits is not copied, so the luma width is expected to be a multiple
; of 4.
;
; Register roles in the copy loops:
;   esi = source pointer        edi = destination pointer
;   ecx = bytes left in row     edx = rows left in plane
;   eax, ebx = scratch
; ebx, esi, edi and ebp are saved on entry and restored on exit.
;
; Side effect: the uWidth and uHeight argument slots on the stack are
; overwritten with the chroma dimensions before the chroma copy.
;--------------------------------------------------------------------------
YUV12ToYUV proc DIST LANG AuYPlane: DWORD,
AuVPlane: DWORD,
AuUPlane: DWORD,
AuWidth: DWORD,
AuHeight: DWORD,
AuYPitch: DWORD,
AUVPitch: DWORD,
AbShapingFlag: DWORD,
AuCCOutputBuffer: DWORD,
AlOutput: DWORD,
AuOffsetToLine0: DWORD,
AintPitch: DWORD,
ACCType: DWORD

LocalFrameSize      = 12
RegisterStorageSize = 16        ; 4 registers pushed

; Argument offsets from esp once the registers are pushed and the local
; frame is allocated (the extra 4 skips the return address).
uYPlane          = LocalFrameSize + RegisterStorageSize +  4
uVPlane          = LocalFrameSize + RegisterStorageSize +  8
uUPlane          = LocalFrameSize + RegisterStorageSize + 12
uWidth           = LocalFrameSize + RegisterStorageSize + 16
uHeight          = LocalFrameSize + RegisterStorageSize + 20
uYPitch          = LocalFrameSize + RegisterStorageSize + 24
uUVPitch         = LocalFrameSize + RegisterStorageSize + 28
bShapingFlag     = LocalFrameSize + RegisterStorageSize + 32
uCCOutputBuffer  = LocalFrameSize + RegisterStorageSize + 36
lOutput          = LocalFrameSize + RegisterStorageSize + 40
uOffsetToLine0   = LocalFrameSize + RegisterStorageSize + 44
intPitch         = LocalFrameSize + RegisterStorageSize + 48
CCType           = LocalFrameSize + RegisterStorageSize + 52

; Local offsets from esp (three DWORD slots = LocalFrameSize)
LineAdd             = 0         ; 1
LineWidth           = 4         ; 2
uRemainderEdgePels  = 8         ; 3 (was referenced below but never defined)

; Arguments relative to esp
_uYPlane            EQU [esp + uYPlane]
_uVPlane            EQU [esp + uVPlane]
_uUPlane            EQU [esp + uUPlane]
_uWidth             EQU [esp + uWidth ]
_uHeight            EQU [esp + uHeight]
_uYPitch            EQU [esp + uYPitch]
_uUVPitch           EQU [esp + uUVPitch]
_bShapingFlag       EQU [esp + bShapingFlag]
_uCCOutputBuffer    EQU [esp + uCCOutputBuffer]
_lOutput            EQU [esp + lOutput]
_uOffsetToLine0     EQU [esp + uOffsetToLine0]
_intPitch           EQU [esp + intPitch]
_uCCType            EQU [esp + CCType]

; Locals relative to esp
_LineAdd            EQU [esp + LineAdd]
_LineWidth          EQU [esp + LineWidth]
_uRemainderEdgePels EQU [esp + uRemainderEdgePels]

; Save registers and start working
        push    ebx
        push    esi
        push    edi
        push    ebp

        sub     esp, LocalFrameSize

        mov     eax, _uCCOutputBuffer
        add     eax, _uOffsetToLine0
        mov     ecx, _lOutput
        add     eax, ecx                    ; eax = destination start
        mov     ebx, _uYPitch
        mov     ecx, _uWidth
        mov     esi, _uYPlane
        mov     edi, eax

; ---------------------------------------------------------------- luma
        sub     ebx, ecx                    ; ebx = pitch - width
        mov     edx, _uHeight
        mov     _LineAdd, ebx
        test    edx, edx                    ; no rows: the dec/jnz row loop
        jz      CopyDone                    ;  would wrap around from zero

YNextLine:
        test    ecx, 0FFFFFFF0H
        jz      YEdgePels                   ; Width may be less than 16

YCopy16:
        mov     ebx, DWORD PTR [edi]        ; heat cache
        add     edi, 16
        mov     eax, DWORD PTR [esi + 0]
        mov     ebx, DWORD PTR [esi + 4]
        mov     DWORD PTR [edi - 16], eax
        mov     DWORD PTR [edi - 12], ebx
        mov     eax, DWORD PTR [esi + 8]
        mov     ebx, DWORD PTR [esi + 12]
        mov     DWORD PTR [edi - 8], eax
        mov     DWORD PTR [edi - 4], ebx

        add     esi, 16
        sub     ecx, 16

        test    ecx, 0FFFFFFF0H
        jnz     YCopy16

YEdgePels:
; Do edge pels if needed (width a multiple of 4, but not of 16)

        ; Check 8 edge pels
        test    ecx, 08H
        jz      YChk4
        mov     eax, DWORD PTR [esi + 0]    ; Input pels 0-3
        mov     ebx, DWORD PTR [esi + 4]    ; Input pels 4-7
        mov     DWORD PTR [edi + 0], eax    ; Output pels 0-3
        mov     DWORD PTR [edi + 4], ebx    ; Output pels 4-7
        add     esi, 8
        add     edi, 8

YChk4:
        ; Check 4 edge pels
        test    ecx, 04H
        jz      YLineDone
        mov     eax, DWORD PTR [esi + 0]    ; Input pels 0-3
        add     esi, 4
        mov     DWORD PTR [edi + 0], eax    ; Output pels 0-3
        add     edi, 4

YLineDone:
        add     esi, _LineAdd               ; skip source padding to next row
        mov     ecx, _uWidth
        dec     edx
        jnz     YNextLine

; ------------------------------------------------------------ chroma (U)
        mov     esi, _uUPlane
        mov     ecx, _uWidth
        shr     ecx, 1                      ; chroma width  = width / 2
        mov     ebx, _uUVPitch
        sub     ebx, ecx                    ; ebx = pitch - width/2
        mov     edx, _uHeight
        shr     edx, 1                      ; chroma height = height / 2
        mov     _LineAdd, ebx
        mov     _uWidth, ecx                ; argument slots now hold the
        mov     _uHeight, edx               ;  chroma dimensions
        test    edx, edx                    ; height 1 gives no chroma rows
        jz      CopyDone

UNextLine:
        test    ecx, 0FFFFFFF8H
        jz      UEdgePels                   ; Width may be less than 8

UCopy8:
        mov     ebx, DWORD PTR [edi]        ; heat cache
        add     edi, 8
        mov     eax, DWORD PTR [esi + 0]
        mov     ebx, DWORD PTR [esi + 4]
        mov     DWORD PTR [edi - 8], eax
        mov     DWORD PTR [edi - 4], ebx

        add     esi, 8
        sub     ecx, 8

        test    ecx, 0FFFFFFF8H
        jnz     UCopy8

UEdgePels:
; Do edge pels if needed (chroma width not a multiple of 8)

        ; Check 4 edge pels
        test    ecx, 04H
        jz      UChk2
        mov     eax, DWORD PTR [esi + 0]    ; Input pels 0-3
        add     esi, 4
        mov     DWORD PTR [edi + 0], eax    ; Output pels 0-3
        add     edi, 4

UChk2:
        ; Check 2 edge pels
        test    ecx, 02H
        jz      ULineDone
        mov     ax, WORD PTR [esi + 0]      ; Input pels 0-1
        add     esi, 2
        mov     WORD PTR [edi + 0], ax      ; Output pels 0-1
        add     edi, 2

ULineDone:
        add     esi, _LineAdd
        mov     ecx, _uWidth
        dec     edx
        jnz     UNextLine

; ------------------------------------------------------------ chroma (V)
; _LineAdd, _uWidth and _uHeight still hold the chroma values set up above.
        mov     esi, _uVPlane
        mov     ecx, _uWidth
        mov     edx, _uHeight

VNextLine:
        test    ecx, 0FFFFFFF8H
        jz      VEdgePels                   ; Width may be less than 8
                                            ; (was a branch into the U loop)

VCopy8:
        mov     ebx, DWORD PTR [edi]        ; heat cache
        add     edi, 8
        mov     eax, DWORD PTR [esi + 0]
        mov     ebx, DWORD PTR [esi + 4]
        mov     DWORD PTR [edi - 8], eax
        mov     DWORD PTR [edi - 4], ebx

        add     esi, 8
        sub     ecx, 8

        test    ecx, 0FFFFFFF8H
        jnz     VCopy8

VEdgePels:
; Do edge pels if needed (chroma width not a multiple of 8)

        ; Check 4 edge pels
        test    ecx, 04H
        jz      VChk2
        mov     eax, DWORD PTR [esi + 0]    ; Input pels 0-3
        add     esi, 4
        mov     DWORD PTR [edi + 0], eax    ; Output pels 0-3
        add     edi, 4

VChk2:
        ; Check 2 edge pels
        test    ecx, 02H
        jz      VLineDone
        mov     ax, WORD PTR [esi + 0]      ; Input pels 0-1
        add     esi, 2
        mov     WORD PTR [edi + 0], ax      ; Output pels 0-1
        add     edi, 2

VLineDone:
        add     esi, _LineAdd
        mov     ecx, _uWidth
        dec     edx
        jnz     VNextLine

CopyDone:
        add     esp, LocalFrameSize         ; restore esp to registers

        pop     ebp
        pop     edi
        pop     esi
        pop     ebx
        ret     52                          ; 13*4 bytes of arguments

YUV12ToYUV ENDP
END
|