;************************************************************************* ;** INTEL Corporation Proprietary Information ;** ;** This listing is supplied under the terms of a license ;** agreement with INTEL Corporation and may not be copied ;** nor disclosed except in accordance with the terms of ;** that agreement. ;** ;** Copyright (c) 1995 Intel Corporation. ;** All Rights Reserved. ;** ;************************************************************************* ;// ;// $Header: S:\h26x\src\dec\cx512yuv.asv 1.5 30 Dec 1996 20:02:08 MDUDA $ ;// ;// $Log: S:\h26x\src\dec\cx512yuv.asv $ ;// ;// Rev 1.5 30 Dec 1996 20:02:08 MDUDA ;// Fixed problem where buffer boundaries were being over-written. ;// ;// Rev 1.4 11 Dec 1996 14:58:52 JMCVEIGH ;// ;// Changed to support width the are multiples of 4. ;// ;// Rev 1.3 18 Jul 1996 12:52:58 KLILLEVO ;// changed cache heating to speed things up a bit ;// ;// Rev 1.2 18 Jul 1996 09:39:34 KLILLEVO ;// ;// added PVCS header and log ;; Very straightforward implementation of the YUV pitch changer ;; Does 16 pels at a time. If the width is not a multiple of 16 ;; the remainder pels are handled as a special case. We assume ;; that the width is at least a multiple of 4 OPTION PROLOGUE: None OPTION EPILOGUE: ReturnAndRelieveEpilogueMacro .xlist include memmodel.inc .list .DATA ; any data would go here .CODE ASSUME cs: FLAT ASSUME ds: FLAT ASSUME es: FLAT ASSUME fs: FLAT ASSUME gs: FLAT ASSUME ss: FLAT PUBLIC YUV12ToYUV YUV12ToYUV proc DIST LANG AuYPlane: DWORD, AuVPlane: DWORD, AuUPlane: DWORD, AuWidth: DWORD, AuHeight: DWORD, AuYPitch: DWORD, AUVPitch: DWORD, AbShapingFlag: DWORD, AuCCOutputBuffer: DWORD, AlOutput: DWORD, AuOffsetToLine0: DWORD, AintPitch: DWORD, ACCType: DWORD LocalFrameSize = 12 RegisterStorageSize = 16 ; 4 registers pushed ; Argument offsets (after register pushed) uYPlane = LocalFrameSize + RegisterStorageSize + 4 uVPlane = LocalFrameSize + RegisterStorageSize + 8 uUPlane = LocalFrameSize + RegisterStorageSize + 12 uWidth = LocalFrameSize + RegisterStorageSize + 16 uHeight = LocalFrameSize + RegisterStorageSize + 20 uYPitch = LocalFrameSize + RegisterStorageSize + 24 uUVPitch = LocalFrameSize + RegisterStorageSize + 28 bShapingFlag = LocalFrameSize + RegisterStorageSize + 32 uCCOutputBuffer = LocalFrameSize + RegisterStorageSize + 36 lOutput = LocalFrameSize + RegisterStorageSize + 40 uOffsetToLine0 = LocalFrameSize + RegisterStorageSize + 44 intPitch = LocalFrameSize + RegisterStorageSize + 48 CCType = LocalFrameSize + RegisterStorageSize + 52 ; Local offsets (after register pushes) LineAdd = 0 ; 1 LineWidth = 4 ; 2 ; Arguments relative to esp _uYPlane EQU [esp + uYPlane] _uVPlane EQU [esp + uVPlane] _UUPlane EQU [esp + uUPlane] _uWidth EQU [esp + uWidth ] _uHeight EQU [esp + uHeight] _uYPitch EQU [esp + uYPitch] _uUVPitch EQU [esp + uUVPitch] _bShapingFlag EQU [esp + bShapingFlag] _uCCOutputBuffer EQU [esp + uCCOutputBuffer] _lOutput EQU [esp + lOutput] _uOffsetToLine0 EQU [esp + uOffsetToLine0] _intPitch EQU [esp + intPitch] _uCCType EQU [esp + CCType] ; Locals relative to esp _LineAdd EQU [esp + LineAdd] _LineWidth EQU [esp + LineWidth] _uRemainderEdgePels EQU [esp + uRemainderEdgePels] ; Save registers and start working push ebx push esi push edi push ebp sub esp, LocalFrameSize mov eax, _uCCOutputBuffer add eax, _uOffsetToLine0 mov ecx, _lOutput add eax, ecx mov ebx, _uYPitch mov ecx, _uWidth mov esi, _uYPlane mov edi, eax ; luma sub ebx, ecx ; ebx = pitch - width mov edx, _uHeight mov eax, _uWidth mov _LineAdd, ebx L2: test ecx, 0FFFFFFF0H jz LEdgePels ; Width may be less than 16 L1: mov ebx, DWORD PTR [edi] ; heat cache add edi, 16 mov eax, DWORD PTR [esi + 0] mov ebx, DWORD PTR [esi + 4] mov DWORD PTR [edi - 16], eax mov DWORD PTR [edi - 12], ebx mov eax, DWORD PTR [esi + 8] mov ebx, DWORD PTR [esi +12] mov DWORD PTR [edi - 8], eax mov DWORD PTR [edi - 4], ebx add esi, 16 sub ecx, 16 test ecx, 0FFFFFFF0H jnz L1 LEdgePels: ; Do edge pels is needed (if width a multiple of 4, but not 16) ; Check 8 edge pels test ecx, 08H jz Lchk4 mov eax, DWORD PTR [esi + 0] ; Input pels 0-3 mov ebx, DWORD PTR [esi + 4] ; Input pels 4-7 mov DWORD PTR [edi + 0], eax ; Output pels 0-3 mov DWORD PTR [edi + 4], ebx ; Output pels 4-7 add esi, 8 add edi, 8 Lchk4: ; Check 4 edge pels test ecx, 04H jz L2_cont mov eax, DWORD PTR [esi + 0] ; Input pels 0-3 add esi, 4 mov DWORD PTR [edi + 0], eax ; Output pels 0-3 add edi, 4 L2_cont: add esi, _LineAdd mov ecx, _uWidth dec edx jnz L2 ; chroma mov esi, _uUPlane mov ecx, _uWidth shr ecx, 1 mov ebx, _uUVPitch sub ebx, ecx ; ebx = pitch - width/2 mov edx, _uHeight shr edx, 1 mov _LineAdd, ebx mov _uWidth, ecx mov _uHeight, edx U2: test ecx, 0FFFFFFF8H jz UEdgePels ; Width may be less than 16 U1: mov ebx, DWORD PTR [edi] ; heat cache add edi, 8 mov eax, DWORD PTR [esi + 0] mov ebx, DWORD PTR [esi + 4] mov DWORD PTR [edi - 8], eax mov DWORD PTR [edi - 4], ebx add esi, 8 sub ecx, 8 test ecx, 0FFFFFFF8H jnz U1 UEdgePels: ; Do edge pels is needed (if width a multiple of 4, but not 16) ; Check 4 edge pels test ecx, 04H jz Uchk4 mov eax, DWORD PTR [esi + 0] ; Input pels 0-3 add esi, 4 mov DWORD PTR [edi + 0], eax ; Output pels 0-3 add edi, 4 Uchk4: ; Check 2 edge pels test ecx, 02H jz U2_cont mov ax, WORD PTR [esi + 0] ; Input pels 0-3 add esi, 2 mov WORD PTR [edi + 0], ax ; Output pels 0-3 add edi, 2 U2_cont: add esi, _LineAdd mov ecx, _uWidth dec edx jnz U2 ; chroma mov esi, _uVPlane mov ecx, _uWidth mov edx, _uHeight nop V2: test ecx, 0FFFFFFF8H jz UEdgePels ; Width may be less than 16 V1: mov ebx, DWORD PTR [edi] ; heat cache add edi, 8 mov eax, DWORD PTR [esi + 0] mov ebx, DWORD PTR [esi + 4] mov DWORD PTR [edi - 8], eax mov DWORD PTR [edi - 4], ebx add esi, 8 sub ecx, 8 test ecx, 0FFFFFFF8H jnz V1 VEdgePels: ; Do edge pels is needed (if width a multiple of 4, but not 16) ; Check 4 edge pels test ecx, 04H jz Vchk4 mov eax, DWORD PTR [esi + 0] ; Input pels 0-3 add esi, 4 mov DWORD PTR [edi + 0], eax ; Output pels 0-3 add edi, 4 Vchk4: ; Check 2 edge pels test ecx, 02H jz V2_cont mov ax, WORD PTR [esi + 0] ; Input pels 0-3 add esi, 2 mov WORD PTR [edi + 0], ax ; Output pels 0-3 add edi, 2 V2_cont: add esi, _LineAdd mov ecx, _uWidth dec edx jnz V2 add esp, LocalFrameSize ; restore esp to registers pop ebp pop edi pop esi pop ebx ret 52 ; 13*4 bytes of arguments YUV12ToYUV ENDP END