;************************************************************************* ;** INTEL Corporation Proprietary Information ;** ;** This listing is supplied under the terms of a license ;** agreement with INTEL Corporation and may not be copied ;** nor disclosed except in accordance with the terms of ;** that agreement. ;** ;** Copyright (c) 1995 Intel Corporation. ;** All Rights Reserved. ;** ;************************************************************************* ;// ;// $Header: S:\h26x\src\dec\cx51209.asv ;// ;// $Log: S:\h26x\src\dec\cx51209.asv ;// ;//////////////////////////////////////////////////////////////////////////// ; cx1209 -- This function performs YUV12 to IF09 color conversion for H26x. ; IF09 consists of Y, V, U in 8-bit, planar format, plus a plane of ; 4-bit flags, each in 8 bits of storage, with each bit indicative ; of which dwords of Y are unchanged from the previous frame. ; IF09 is only applicable using DCI. ; ; This version is tuned for maximum performance on both the Pentium ; (r) microcprocessor and the Pentium Pro (tm) microprocessor. ; ; Indentation of instructions indicates expected U/V pipe execution ; on Pentium (r) microprocessor; indented instructions are ; expected to execute in V-pipe, outdented instructions in U-pipe. ; Inside loops, blank lines delineate groups of 1, 2, or 3 ; instructions that are expected to be decoded simultaneously ; on the Pentium Pro (tm) microprocessor. ; ; cx1209 ; ^^^^^^ ; ||||++----- Convert to IF09. ; ||++------- Convert from YUV12. ; |+--------- For both H261 and H263. ; +---------- Color convertor. ;------------------------------------------------------------------------------- OPTION PROLOGUE:None OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro include locals.inc include ccinst.inc include decconst.inc IFNDEF DSEGNAME IFNDEF WIN32 DSEGNAME TEXTEQU ENDIF ENDIF IFDEF WIN32 .xlist include memmodel.inc .list .DATA ELSE DSEGNAME SEGMENT WORD PUBLIC 'DATA' ENDIF ; any data would go here IFNDEF WIN32 DSEGNAME ENDS .xlist include memmodel.inc .list ENDIF IFNDEF SEGNAME IFNDEF WIN32 SEGNAME TEXTEQU <_CODE32> ENDIF ENDIF ifdef WIN32 .CODE ASSUME cs : FLAT ASSUME ds : FLAT ASSUME es : FLAT ASSUME fs : FLAT ASSUME gs : FLAT ASSUME ss : FLAT else SEGNAME SEGMENT PARA PUBLIC USE32 'CODE' ASSUME CS : SEGNAME ASSUME DS : Nothing ASSUME ES : Nothing ASSUME FS : Nothing ASSUME GS : Nothing endif PUBLIC YUV12ToIF09 YUV12ToIF09 proc DIST LANG AYPlane: DWORD, AVPlane: DWORD, AUPlane: DWORD, AFrameWidth: DWORD, AFrameHeight: DWORD, AYPitch: DWORD, AUVPitch: DWORD, AAspectAdjustmentCnt: DWORD, AColorConvertedFrame: DWORD, ADCIOffset: DWORD, ACCOffsetToLine0: DWORD, ACCOPitch: DWORD, ACCType: DWORD ; void * YUV12ToIF09 ( ; U8 * YPlane, ; U8 * VPlane, ; U8 * UPlane, ; UN FrameWidth, ; UN FrameHeight, ; UN YPitch, ; UN UVPitch, ; UN AspectAdjustmentCount, ; U8 * ColorConvertedFrame, ; U32 DCIOffset, ; U32 CCOffsetToLine0, ; IN CCOPitch, ; IN CCType) ; ; YPlane and VPlane are offsets relative to InstanceBase. In 16-bit Microsoft ; Windows (tm), space in this segment is used for local variables and tables. ; In 32-bit variants of Microsoft Windows (tm), the local variables are on ; the stack, while the tables are in the one and only data segment. ; ; CCOffsetToLine0 is relative to ColorConvertedFrame. ; IFDEF WIN32 LocalFrameSize = 32 RegisterStorageSize = 16 ; Arguments: YPlane = LocalFrameSize + RegisterStorageSize + 4 VPlane = LocalFrameSize + RegisterStorageSize + 8 FrameWidth = LocalFrameSize + RegisterStorageSize + 12 FrameHeight = LocalFrameSize + RegisterStorageSize + 16 YPitch = LocalFrameSize + RegisterStorageSize + 20 ColorConvertedFrame = LocalFrameSize + RegisterStorageSize + 24 DCIOffset = LocalFrameSize + RegisterStorageSize + 28 CCOffsetToLine0 = LocalFrameSize + RegisterStorageSize + 32 CCOPitch = LocalFrameSize + RegisterStorageSize + 36 CCType = LocalFrameSize + RegisterStorageSize + 40 EndOfArgList = LocalFrameSize + RegisterStorageSize + 44 ; Locals (on local stack frame) CCOCursor = 0 YLimit = 4 CCOVCursor = 8 CCOUCursor = 12 CCOSkipCursor = 16 VLimit = 20 YLine1Limit = 24 CCOUVPitch = 28 LCL EQU ELSE ; Arguments: RegisterStorageSize = 20 ; Put local variables on stack. InstanceBase_zero = RegisterStorageSize + 4 InstanceBase_SegNum = RegisterStorageSize + 6 YPlane_arg = RegisterStorageSize + 8 VPlane_arg = RegisterStorageSize + 12 FrameWidth_arg = RegisterStorageSize + 16 FrameHeight_arg = RegisterStorageSize + 18 YPitch_arg = RegisterStorageSize + 20 ColorConvertedFrame = RegisterStorageSize + 22 ColorConvertedFrame_SegNum = RegisterStorageSize + 24 DCIOffset = RegisterStorageSize + 26 CCOffsetToLine0 = RegisterStorageSize + 30 CCOPitch_arg = RegisterStorageSize + 34 EndOfArgList = RegisterStorageSize + 36 ; Locals (in per-instance data segment) CCOCursor = LocalStorageCC + 0 YLimit = LocalStorageCC + 4 CCOVCursor = LocalStorageCC + 8 CCOUCursor = LocalStorageCC + 12 CCOSkipCursor = LocalStorageCC + 16 VLimit = LocalStorageCC + 20 YLine1Limit = LocalStorageCC + 24 CCOUVPitch = LocalStorageCC + 28 YPlane = LocalStorageCC + 32 VPlane = LocalStorageCC + 36 FrameWidth = LocalStorageCC + 40 FrameHeight = LocalStorageCC + 44 YPitch = LocalStorageCC + 48 CCOPitch = LocalStorageCC + 52 LCL EQU <> ENDIF push esi push edi push ebp push ebx IFDEF WIN32 sub esp,LocalFrameSize mov eax,PD [esp+ColorConvertedFrame] add eax,PD [esp+DCIOffset] add eax,PD [esp+CCOffsetToLine0] mov PD [esp+CCOCursor],eax ELSE xor eax,eax mov eax,ds push eax mov ebp,esp and ebp,00000FFFFH mov ds, PW [ebp+InstanceBase_SegNum] mov es, PW [ebp+ColorConvertedFrame_SegNum] mov ebx,PD [ebp+YPlane_arg] ; Make YPlane accessible mov ds:PD YPlane,ebx mov ebx,PD [ebp+VPlane_arg] ; Make VPlane accessible. mov ds:PD VPlane,ebx mov ax,PW [ebp+FrameWidth_arg] ; Make FrameWidth accessible mov ds:PD FrameWidth,eax mov ax,PW [ebp+FrameHeight_arg] ; Make FrameHeight accessible mov ds:PD FrameHeight,eax mov ax,PW [ebp+YPitch_arg] ; Make YPitch accessible mov ds:PD YPitch,eax mov ax,PW [ebp+ColorConvertedFrame] ; Init CCOCursor add eax,PD [ebp+DCIOffset] mov ds:PD CCOCursor,eax movsx ebx,PW [ebp+CCOPitch_arg] ; Make CCOPitch accessible mov ds:PD CCOPitch,ebx ENDIF Ledx FrameHeight Lebx CCOPitch shr ebx,2 ; UV pitch for the output Lecx YPitch add ebx,3 ; Pitch is always a multiple of 4. Lebp CCOPitch and ebx,0FFFFFFFCH Lesi YPlane ; Fetch cursor over luma plane. Sebx CCOUVPitch Leax CCOCursor imul ecx,edx ; ecx: size of Y input. imul ebp,edx ; ebp: was CCOPitch, now size of Y output. imul ebx,edx ; ebp: size of U/V output (times 4). add ecx,esi ; ecx: Ylimit add eax,ebp ; eax was CCOCursor, now CCOVCursor Secx YLimit Seax CCOVCursor sar ebx,2 ; ebx: UVsize of output Lecx FrameWidth ; ecx: Y frame width add esi,ecx ; esi: end of first input Y add eax,ebx ; eax: now CCOUCursor shr ecx,2 Seax CCOUCursor Lebp VPlane ; ebp Vplane input Ledx YPitch lea esi,[edx+esi] ; End of Y line 1 add ebp,ecx ; end of Vline Sesi YLine1Limit add eax,ebx ; CCO Skip Blocks Sebp VLimit ; UV width for input Seax CCOSkipCursor ; Prepare the UV contribution to decide the skip blocks, and copy chroma ; planes at the same time. ; ; Register usage: ; ; esi: V plane input pointer ; edi; V output pointer ; ebp: U output pointer ; edx: Y plane input pointer ; ecx: V limit ; ebx: Work area for U ; eax: Work area for V ChromaPrep: Ledi CCOVCursor Lebp CCOUCursor Ledx YPlane Leax YPitch Lesi VPlane Lecx VLimit sub edi,esi ; make edi offset to esi. sub ebp,esi ; make ebp offset to esi to save inc in the loop. lea edx,[eax+edx-1296] ; make edx point at place for chroma prep. mov eax,PD [esi] ; fetch four V add eax,eax ; Change to 8-bit. (Low bit undef, usually 0). ChromaLoop: mov Ze PD[esi+edi*1],eax ; Store four V. mov ebx,PD [esi+UOFFSET] ; fetch four U add esi,4 mov PD [edx],eax ; Store four V to chroma-prep line in Y frame. add edx,16 ; Advance chroma-prep cursor. add ebx,ebx ; Change to 8-bit. (Low bit undef, usually 0). mov Ze PD[esi+ebp*1-4],ebx ; Store four U. mov eax,PD [esi] ; fetch next four V. add eax,eax ; Change to 8-bit. (Low bit undef, usually 0). mov PD [edx-12],ebx ; Store four U to chroma-prep line in Y frame. mov bl,Ze PB [esi+edi*1] ; Pre-load output cache line cmp esi,ecx mov bl,Ze PB [esi+ebp*1] ; Pre-load output cache line jb ChromaLoop ; update chroma pointers. add ecx,VPITCH Lebx CCOUVPitch Ledi CCOVCursor Lebp CCOUCursor Secx VLimit add edi,ebx ; update V output ptr to the next line Leax VPlane add ebp,ebx ; update U output ptr to the next line Sedi CCOVCursor add eax,VPITCH Sebp CCOUCursor Seax VPlane ; now do Luma a row of 4x4 blocks ; ; register usage: ; ; esi: Y cursor ; edi: CCOCursor ; ebp: counts down 4 lines of luma. ; ecx: counts down frame width. ; ebx: Y Pitch. ; eax: Work area. ; copy a row of 4x4 luma Lesi YPlane Lecx FrameWidth Ledi CCOCursor add esi,ecx neg ecx Lebx YPitch sub edi,ecx mov eax,PD[esi+ecx] ; Fetch 4 Y pels. add eax,eax ; Make them 8-bit. Low bit undef, but usually 0. mov ebp,4 YLoop: mov Ze PD[edi+ecx],eax ; Store them to IF09 output, Y plane. mov eax,PD[esi+ecx+4] ; Fetch 4 Y pels. add eax,eax ; Make them 8-bit. Low bit undef, but usually 0. add ecx,4 ; Advance induction variable. jl YLoop YLoopDone: Lecx FrameWidth add esi,ebx add edi,ecx neg ecx mov eax,PD[esi+ecx] ; Fetch 4 Y pels. add eax,eax ; Make them 8-bit. Low bit undef, but usually 0. dec ebp jne YLoop add edi,ecx Sedi CCOCursor ; save the output ptr for next four lines ; Build the skip block mask ; ; Register usage: ; ; esi: Y ptr ; edi: Mask Ptr ; ebp: Y Pitch ; edx: mask ; ecx: Archive value ; ebx: UV contribution ; eax: Dword of Y pels ; ; Y starts with Line 1 of 4x4 blocks, since UV pattern has been saved ; relative to line 1. Lesi YPlane Lebp YPitch Ledi CCOSkipCursor add esi,ebp ; esi point at line 1 of luma BuildSkipDescrLoop: mov ebx,PD [esi-1296] ; Fetch 4 U's; byte0 corresponds to this Y. mov eax,PD [esi-1292] ; Fetch 4 V's; byte0 corresponds to this Y. shl ebx,11 ; Position U. and eax,0000000FCH ; Extract 6 bits of V. and ebx,00007E000H ; Extract 6 bits of U. mov edx,PD [esi+ebp*2] ; Line 3 of luma first. and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles. mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame lea ebx,[ebx+eax*8] ; Build UV. mov eax,PD[esi+ebp*1] ; line 2 of luma. add edx,ebx ; combine Y with UV pattern and eax,07E7E7E7EH mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive sub ecx,edx ; compare with the previous archive add ecx,-1 ; CF == 1 iff curr differs from prev lea eax,[eax+ebx] sbb edx,edx ; edx == -1 iff different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] mov PD[esi+ebp*1+YARCHIVEOFFSET],eax sub ecx,eax mov eax,PD[esi] sub esi,ebp ; Gain acces to line 0. and eax,07E7E7E7EH add ecx,-1 adc edx,edx ; edx[0] == 1 if different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] add eax,ebx lea edi,[edi+1] sub ecx,eax mov PD[esi+ebp*1+YARCHIVEOFFSET],eax mov eax,PD[esi] add ecx,-1 adc edx,edx and eax,07E7E7E7EH mov ecx,PD[esi+YARCHIVEOFFSET] add eax,ebx sub ecx,eax Lebx YLine1Limit mov PD[esi+YARCHIVEOFFSET],eax add ecx,-1 lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block adc edx,edx xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword. cmp esi,ebx ; check the end of line 1 of Y mov Ze PB[edi-1],dl ; write to the skip block buffer je BuildSkipDescrLoopDone mov ebx,PD [esi-1300] ; Fetch 4 U's; byte1 corresponds to this Y. mov eax,PD [esi-1296] ; Fetch 4 V's; byte1 corresponds to this Y. shl ebx,11 ; Position U. and eax,00000FC00H ; Extract 6 bits of V. and ebx,007E00000H ; Extract 6 bits of U. mov edx,PD [esi+ebp*2] ; Line 3 of luma first. and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles. mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame lea ebx,[ebx+eax*8] ; Build UV. mov eax,PD[esi+ebp*1] ; line 2 of luma. add edx,ebx ; combine Y with UV pattern and eax,07E7E7E7EH mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive sub ecx,edx ; compare with the previous archive add ecx,-1 ; CF == 1 iff curr differs from prev lea eax,[eax+ebx] sbb edx,edx ; edx == -1 iff different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] mov PD[esi+ebp*1+YARCHIVEOFFSET],eax sub ecx,eax mov eax,PD[esi] sub esi,ebp ; Gain acces to line 0. and eax,07E7E7E7EH add ecx,-1 adc edx,edx ; edx[0] == 1 if different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] add eax,ebx lea edi,[edi+1] sub ecx,eax mov PD[esi+ebp*1+YARCHIVEOFFSET],eax mov eax,PD[esi] add ecx,-1 adc edx,edx and eax,07E7E7E7EH mov ecx,PD[esi+YARCHIVEOFFSET] add eax,ebx sub ecx,eax Lebx YLine1Limit mov PD[esi+YARCHIVEOFFSET],eax add ecx,-1 lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block adc edx,edx xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword. cmp esi,ebx ; check the end of line 1 of Y mov Ze PB[edi-1],dl ; write to the skip block buffer je BuildSkipDescrLoopDone mov ebx,PD [esi-1304] ; Fetch 4 U's; byte2 corresponds to this Y. mov eax,PD [esi-1300] ; Fetch 4 V's; byte2 corresponds to this Y. shr ebx,5 ; Position U. and eax,000FC0000H ; Extract 6 bits of V. and ebx,00007E000H ; Extract 6 bits of U. mov edx,PD [esi+ebp*2] ; Line 3 of luma first. and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles. mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame lea ebx,[ebx+eax*8] ; Build UV. mov eax,PD[esi+ebp*1] ; line 2 of luma. add edx,ebx ; combine Y with UV pattern and eax,07E7E7E7EH mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive sub ecx,edx ; compare with the previous archive add ecx,-1 ; CF == 1 iff curr differs from prev lea eax,[eax+ebx] sbb edx,edx ; edx == -1 iff different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] mov PD[esi+ebp*1+YARCHIVEOFFSET],eax sub ecx,eax mov eax,PD[esi] sub esi,ebp ; Gain acces to line 0. and eax,07E7E7E7EH add ecx,-1 adc edx,edx ; edx[0] == 1 if different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] add eax,ebx lea edi,[edi+1] sub ecx,eax mov PD[esi+ebp*1+YARCHIVEOFFSET],eax mov eax,PD[esi] add ecx,-1 adc edx,edx and eax,07E7E7E7EH mov ecx,PD[esi+YARCHIVEOFFSET] add eax,ebx sub ecx,eax Lebx YLine1Limit mov PD[esi+YARCHIVEOFFSET],eax add ecx,-1 lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block adc edx,edx xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword. cmp esi,ebx ; check the end of line 1 of Y mov Ze PB[edi-1],dl ; write to the skip block buffer je BuildSkipDescrLoopDone mov ebx,PD [esi-1308] ; Fetch 4 U's; byte3 corresponds to this Y. mov eax,PD [esi-1304] ; Fetch 4 V's; byte3 corresponds to this Y. shr ebx,5 ; Position U. mov edx,PD [esi+ebp*2] ; Line 3 of luma first. shr eax,26 ; Extract 6 bits of V. and ebx,007E00000H ; Extract 6 bits of U. and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles. mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame lea ebx,[ebx+eax*8] ; Build UV. mov eax,PD[esi+ebp*1] ; line 2 of luma. add edx,ebx ; combine Y with UV pattern and eax,07E7E7E7EH mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive sub ecx,edx ; compare with the previous archive add ecx,-1 ; CF == 1 iff curr differs from prev lea eax,[eax+ebx] sbb edx,edx ; edx == -1 iff different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] mov PD[esi+ebp*1+YARCHIVEOFFSET],eax sub ecx,eax mov eax,PD[esi] sub esi,ebp ; Gain acces to line 0. and eax,07E7E7E7EH add ecx,-1 adc edx,edx ; edx[0] == 1 if different, else 0. mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET] add eax,ebx lea edi,[edi+1] sub ecx,eax mov PD[esi+ebp*1+YARCHIVEOFFSET],eax mov eax,PD[esi] add ecx,-1 adc edx,edx and eax,07E7E7E7EH mov ecx,PD[esi+YARCHIVEOFFSET] add eax,ebx sub ecx,eax Lebx YLine1Limit mov PD[esi+YARCHIVEOFFSET],eax add ecx,-1 lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block adc edx,edx xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword. cmp esi,ebx ; check the end of line 1 of Y mov Ze PB[edi-1],dl ; write to the skip block buffer jne BuildSkipDescrLoop BuildSkipDescrLoopDone: add edi,3 ; Round to next dword. lea ebx,[ebx+ebp*4] ; update YLine1Limit for next row of blocks and edi,0FFFFFFFCH Lesi YPlane Sedi CCOSkipCursor Sebx YLine1Limit lea esi,[esi+ebp*4] Leax YLimit Sesi YPlane cmp esi,eax jl ChromaPrep IFDEF WIN32 add esp,LocalFrameSize ELSE pop ebx mov ds,ebx ENDIF pop ebx pop ebp pop edi pop esi rturn YUV12ToIF09 endp IFNDEF WIN32 SEGNAME ENDS ENDIF END