|
|
;------------------------------------------------------------------------- ; INTEL Corporation Proprietary Information ; ; This listing is supplied under the terms of a license ; agreement with INTEL Corporation and may not be copied ; nor disclosed except in accordance with the terms of ; that agreement. ; ; Copyright (c) 1996 Intel Corporation. ; All Rights Reserved. ; ;-------------------------------------------------------------------------
;------------------------------------------------------------------------- ;// ;// $Header: S:\h26x\src\dec\cxm12162.asv ;// ;// $Log: S:\h26x\src\dec\cxm12162.asv $ ;// ;// Rev 1.10 01 Apr 1997 12:51:50 BNICKERS ;// Fix bugs # 153 and 156 -- wrong color when U is small; right edge flickeri ;// ;// Rev 1.9 09 Dec 1996 15:20:40 BECHOLS ;// Brian fixed ARC bug #94. ;// ;// Rev 1.8 06 Sep 1996 16:07:58 BNICKERS ;// Re-written to filter new points. ;// ;------------------------------------------------------------------------- ; ; +---------- Color convertor. ; |+--------- For both H261 and H263. ; ||+-------- Version for Intel Microprocessors with MMX Technology ; |||++------ Convert from YUV12. ; |||||++---- Convert to RGB16. ; |||||||+--- Zoom by two. ; |||||||| ; cxm12162 -- This function performs zoom-by-2 YUV12-to-RGB16 color conversion ; for H26x. It is tuned for best performance on Intel ; Microprocessors with MMX Technology. It handles any format in ; which there are three fields, the low order field being B and ; starting in bit 0, the second field being G, and the high order ; field being R. Present support for 555, 565, 655, and 644 ; formats only. This version adds new rows and columns by ; averaging them with the originals to either side. ; ; The YUV12 input is planar, 8 bits per pel. The Y plane may have ; a pitch of up to 768. It may have a width less than or equal ; to the pitch. It must be QWORD aligned. Pitch and Width must ; be a multiple of eight. Height may be any amount, but must be ; a multiple of two. The U and V planes may have a different ; pitch than the Y plane, subject to the same limitations. ; ; The color convertor is non-destructive; the input Y, U, and V ; planes will not be clobbered.
OPTION PROLOGUE:None OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
include ccinst.inc
.xlist include iammx.inc include memmodel.inc .list
MMXCCDATA SEGMENT PAGE ALIGN 16
Luma0020004000200000 LABEL DWORD REPEAT 16 DD 0, 0 ENDM CNT = 0 REPEAT 219 DW 0 DW (CNT*04A7FH)/00200H DW (CNT*04A7FH)/00100H DW (CNT*04A7FH)/00200H CNT = CNT + 1 ENDM REPEAT 21 DW 00000H DW 01FFFH DW 03FFFH DW 01FFFH ENDM
UContribToBandG LABEL DWORD DW -(-128*0C83H)/00040H DW 08000H DW -(-127*0C83H)/00040H DW 08000H CNT = -126 REPEAT 253 DW -(CNT*00C83H)/00040H DW (CNT*0408BH)/00040H CNT = CNT + 1 ENDM DW (127*0C83H)/00040H DW 07FFFH
VContribToRandG LABEL DWORD CNT = -128 REPEAT 256 DW -(CNT*01A04H)/00040H DW (CNT*03312H)/00040H CNT = CNT + 1 ENDM
MMXCCDATA ENDS
.CODE
ASSUME ds : FLAT ASSUME es : FLAT ASSUME fs : FLAT ASSUME gs : FLAT ASSUME ss : FLAT
; void FAR ASM_CALLTYPE YUV12ToRGB16ZoomBy2 (U8 * YPlane, ; U8 * VPlane, ; U8 * UPlane, ; UN FrameWidth, ; UN FrameHeight, ; UN YPitch, ; UN VPitch, ; UN AspectAdjustmentCount, ; U8 * ColorConvertedFrame, ; U32 DCIOffset, ; U32 CCOffsetToLine0, ; IN CCOPitch, ; IN CCType) ; ; CCOffsetToLine0 is relative to ColorConvertedFrame. ;
; due to the need for the ebp reg, these parameter declarations aren't used, ; they are here so the assembler knows how many bytes to relieve from the stack
PUBLIC MMX_YUV12ToRGB16ZoomBy2
MMX_YUV12ToRGB16ZoomBy2 proc DIST LANG AYPlane: DWORD, AVPlane: DWORD, AUPlane: DWORD, AFrameWidth: DWORD, AFrameHeight: DWORD, AYPitch: DWORD, AVPitch: DWORD, AAspectAdjustmentCnt: DWORD, AColorConvertedFrame: DWORD, ADCIOffset: DWORD, ACCOffsetToLine0: DWORD, ACCOPitch: DWORD, ACCType: DWORD
MAXWIDTH = 768 LocalFrameSize = MAXWIDTH*20+128+64 RegisterStorageSize = 16
; Arguments:
YPlane_arg = RegisterStorageSize + 4 VPlane_arg = RegisterStorageSize + 8 UPlane_arg = RegisterStorageSize + 12 FrameWidth_arg = RegisterStorageSize + 16 FrameHeight = RegisterStorageSize + 20 YPitch_arg = RegisterStorageSize + 24 ChromaPitch_arg = RegisterStorageSize + 28 AspectAdjustmentCount_arg = RegisterStorageSize + 32 ColorConvertedFrame = RegisterStorageSize + 36 DCIOffset = RegisterStorageSize + 40 CCOffsetToLine0 = RegisterStorageSize + 44 CCOPitch_arg = RegisterStorageSize + 48 CCType = RegisterStorageSize + 52 EndOfArgList = RegisterStorageSize + 56
; Locals (on local stack frame)
DitherB EQU [esp+ 0] DitherG EQU [esp+ 8] DitherR EQU [esp+ 16] SelectBBits EQU [esp+ 24] SelectGBits EQU [esp+ 32] SelectRBits EQU [esp+ 40]
ShiftCountForB EQU [esp+ 48] ShiftCountForG EQU [esp+ 52] ShiftCountForR EQU [esp+ 56]
CCOCursor EQU [esp+ 60] CCOPitch EQU [esp+MAXWIDTH*20+128+ 0] YCursor EQU [esp+MAXWIDTH*20+128+ 4]
YLimit EQU [esp+MAXWIDTH*20+128+ 8] YPitch EQU [esp+MAXWIDTH*20+128+12] UCursor EQU [esp+MAXWIDTH*20+128+16] DistanceFromUToV EQU [esp+MAXWIDTH*20+128+20] ChromaPitch EQU [esp+MAXWIDTH*20+128+24] AspectCount EQU [esp+MAXWIDTH*20+128+28] AspectAdjustmentCount EQU [esp+MAXWIDTH*20+128+32] StartIndexOfYLine EQU [esp+MAXWIDTH*20+128+36] StashESP EQU [esp+MAXWIDTH*20+128+40]
FiltLine0 EQU [esp+ 64] ; Must be 32 byte aligned. FiltLine1 EQU [esp+ 72] FiltLine2 EQU [esp+ 80] FiltLine3 EQU [esp+ 88] HFiltLinePrev EQU [esp+ 96]
push esi push edi push ebp push ebx
mov edi,esp and esp,0FFFFF000H sub esp,4096 mov eax,[esp] sub esp,4096 mov eax,[esp] sub esp,4096 mov eax,[esp] sub esp,LocalFrameSize-12288 mov eax,[esp]
mov eax,768 sub eax,[edi+FrameWidth_arg] imul eax,20 mov StartIndexOfYLine,eax
mov eax,[edi+YPlane_arg] mov YCursor,eax
mov ebx,[edi+YPitch_arg] mov YPitch,ebx mov ecx,[edi+FrameHeight] imul ebx,ecx add eax,ebx mov YLimit,eax
mov eax,[edi+UPlane_arg] mov ebx,[edi+VPlane_arg] mov UCursor,eax sub ebx,eax mov DistanceFromUToV,ebx
mov eax,[edi+ColorConvertedFrame] add eax,[edi+DCIOffset] add eax,[edi+CCOffsetToLine0] mov CCOCursor,eax
mov eax,[edi+ChromaPitch_arg] mov ChromaPitch,eax
mov eax,[edi+CCOPitch_arg] mov CCOPitch,eax
mov eax,[edi+AspectAdjustmentCount_arg] mov AspectAdjustmentCount,eax mov AspectCount,eax
mov StashESP,edi
mov eax,[edi+CCType] cmp eax,CCTYPE_RGB16555ZoomBy2 je CCTypeIs555 cmp eax,CCTYPE_RGB16555ZoomBy2DCI je CCTypeIs555 cmp eax,CCTYPE_RGB16565ZoomBy2 je CCTypeIs565 cmp eax,CCTYPE_RGB16565ZoomBy2DCI je CCTypeIs565 cmp eax,CCTYPE_RGB16655ZoomBy2 je CCTypeIs655 cmp eax,CCTYPE_RGB16655ZoomBy2DCI je CCTypeIs655 cmp eax,CCTYPE_RGB16664ZoomBy2DCI je CCTypeIs664 cmp eax,CCTYPE_RGB16664ZoomBy2 je CCTypeIs664 mov eax,0DEADBEEFH mov YCursor,eax
CCTypeIs555:
mov eax,000000200H ; Dither pattern. mov ebx,002000000H mov DitherB,eax mov DitherB+4,eax mov DitherG,ebx mov DitherG+4,ebx mov DitherR,eax mov DitherR+4,eax mov eax,003E003E0H ; Bits to extract for fields mov ebx,07C007C00H mov SelectGBits,eax mov SelectGBits+4,eax mov SelectRBits,ebx mov SelectRBits+4,ebx mov eax,0001F001FH xor ecx,ecx ; Left shift count for R mov SelectBBits,eax mov SelectBBits+4,eax mov eax,10 ; Right shift count for B mov ebx,5 ; Right shift count for G mov ShiftCountForB,eax mov ShiftCountForG,ebx mov ShiftCountForR,ecx jmp CCTypeInitialized CCTypeIs565:
mov eax,000000200H mov ebx,004000000H mov DitherB,eax mov DitherB+4,eax mov DitherG,ebx mov DitherG+4,ebx mov DitherR,eax mov DitherR+4,eax mov eax,007E007E0H mov ebx,0F800F800H mov SelectGBits,eax mov SelectGBits+4,eax mov SelectRBits,ebx mov SelectRBits+4,ebx mov eax,0001F001FH mov ecx,1 mov SelectBBits,eax mov SelectBBits+4,eax mov eax,10 mov ebx,4 mov ShiftCountForB,eax mov ShiftCountForG,ebx mov ShiftCountForR,ecx jmp CCTypeInitialized
CCTypeIs655:
mov eax,000000200H ; Dither pattern. mov ebx,004000000H mov DitherB,eax mov DitherB+4,eax mov DitherG,eax mov DitherG+4,eax mov DitherR,ebx mov DitherR+4,ebx mov eax,003E003E0H ; Bits to extract for fields mov ebx,0FC00FC00H mov SelectGBits,eax mov SelectGBits+4,eax mov SelectRBits,ebx mov SelectRBits+4,ebx mov eax,0001F001FH mov ecx,1 ; Left shift count for R mov SelectBBits,eax mov SelectBBits+4,eax mov eax,10 ; Right shift count for B mov ebx,5 ; Right shift count for G mov ShiftCountForB,eax mov ShiftCountForG,ebx mov ShiftCountForR,ecx jmp CCTypeInitialized
CCTypeIs664:
mov eax,000000400H ; Dither pattern. mov ebx,001000000H mov DitherB,ebx mov DitherB+4,ebx mov DitherG,eax mov DitherG+4,eax mov DitherR,eax mov DitherR+4,eax mov eax,003F003F0H ; Bits to extract for fields mov ebx,0FC00FC00H mov SelectGBits,eax mov SelectGBits+4,eax mov SelectRBits,ebx mov SelectRBits+4,ebx mov eax,0000F000FH mov ecx,1 ; Left shift count for R mov SelectBBits,eax mov SelectBBits+4,eax mov eax,11 ; Right shift count for B mov ebx,5 ; Right shift count for G mov ShiftCountForB,eax mov ShiftCountForG,ebx mov ShiftCountForR,ecx
CCTypeInitialized:
mov esi,YCursor mov ebp,YPitch mov edi,StartIndexOfYLine xor eax,eax lea edx,[esi+ebp*2] xor ebx,ebx mov YCursor,edx mov bl,[esi+ebp*1] ; Get Y10 (a of line L3; for left edge). mov al,[esi] ; Get Y00 (A of line L2; for left edge).
movq mm1,Luma0020004000200000[ebx*8] ; L1:< 32a 64a 32a 0 > mov bl,[esi+ebp*1+2] ; Get c. movq mm0,Luma0020004000200000[eax*8] ; L0:< 32A 64A 32A 0 > mov al,[esi+2] ; Get C.
; esi -- Cursor over input line of Y. ; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20. ; ebp -- Pitch from one line of Y to the next. ; al, bl -- Y pels ; mm0 -- For line 0, contribution of pel to left of two pels under cursor now. ; mm1 -- For line 1, contribution of pel to left of two pels under cursor now. ; mm2-mm6 -- Scratch.
Next2PelsOfFirst2LumaLines:
movq mm3,Luma0020004000200000[ebx*8] ; L1:< 32c 64c 32c 0 > psrlq mm1,32 ; L1:< 0 0 32a 64a > movq mm2,Luma0020004000200000[eax*8] ; L0:< 32C 64C 32C 0 > punpckldq mm1,mm3 ; L1:< 32c 0 32a 64a > xor ebx,ebx xor eax,eax mov bl,[esi+ebp*1+1] ; Get b. psrlq mm0,32 ; L0:< 0 0 32A 64A > mov al,[esi+1] ; Get B. add edi,40 ; Inc filtered luma temp stg idx. paddw mm1,Luma0020004000200000[ebx*8] ; L1:< 32b+32c 64b 32a+32b 64a > punpckldq mm0,mm2 ; L0:< 32C 0 32A 64A > paddw mm0,Luma0020004000200000[eax*8] ; L0:< 32B+32C 64B 32A+32B 64A >
movq HFiltLinePrev[edi-40],mm1 ; Save L1 as next iters LPrev. paddw mm1,mm0 ; L0+L1 paddw mm0,mm0 ; 2L0 add esi,2 ; Increment input index. movq FiltLine3[edi-40],mm1 ; Save filtered line L0+L1. movq mm1,mm3 ; Next iters a. movq FiltLine2[edi-40],mm0 ; Save filtered line 2L0. movq mm0,mm2 ; Next iters A. mov bl,[esi+ebp*1+2] ; Get c. cmp edi,MAXWIDTH*20-40 ; Done yet. mov al,[esi+2] ; Get C. jl Next2PelsOfFirst2LumaLines
xor ebx,ebx xor ecx,ecx mov bl,[esi+ebp*1+1] ; Get c. cmp edi,MAXWIDTH*20 ; Done yet. mov al,[esi+1] ; Get C. jl Next2PelsOfFirst2LumaLines
mov ebp,DistanceFromUToV lea eax,FiltLine2 mov esi,UCursor mov edx,StartIndexOfYLine jmp DoOutputLine
Last2OutputLines:
mov ebp,DistanceFromUToV lea esi,[edi+40] ja Done
; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20. ; mm0-mm6 -- Scratch.
movq mm0,HFiltLinePrev[edi] ; Fetch horizontally filtered line LP. paddw mm0,mm0 ; 2LP
Next2PelsOfLast2LumaLines:
movq FiltLine3[edi],mm0 ; Save horz and vert filt line 2LP. movq FiltLine2[edi],mm0 ; Save horz and vert filt line 2LP. movq mm0,HFiltLinePrev[edi+40]; Fetch horizontally filtered line LP. add edi,40 paddw mm0,mm0 ; 2LP cmp edi,MAXWIDTH*20 ; Done yet. jne Next2PelsOfLast2LumaLines
lea eax,FiltLine2 mov edx,StartIndexOfYLine mov esi,UCursor jmp DoOutputLine
Next4OutputLines:
mov esi,YCursor mov ebp,YPitch mov edi,StartIndexOfYLine mov ecx,YLimit lea edx,[esi+ebp*2] xor eax,eax mov YCursor,edx xor ebx,ebx mov al,[esi] ; Get Y00 (A of line L2; for left edge). cmp esi,ecx mov bl,[esi+ebp*1] ; Get Y10 (a of line L3; for left edge). jae Last2OutputLines
movq mm1,Luma0020004000200000[ebx*8] ; L1:< 32a 64a 32a 0 > mov bl,[esi+ebp*1+2] ; Get c. movq mm0,Luma0020004000200000[eax*8] ; L0:< 32A 64A 32A 0 > mov al,[esi+2] ; Get C.
; esi -- Cursor over input line of Y. ; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20. ; ebp -- Pitch from one line of Y to the next. ; al, bl -- Y pels ; mm0 -- For line 0, contribution of pel to left of two pels under cursor now. ; mm1 -- For line 1, contribution of pel to left of two pels under cursor now. ; mm2-mm6 -- Scratch.
Next2PelsOf2LumaLines:
movq mm3,Luma0020004000200000[ebx*8] ; L1:< 32c 64c 32c 0 > psrlq mm1,32 ; L1:< 0 0 32a 64a > movq mm2,Luma0020004000200000[eax*8] ; L0:< 32C 64C 32C 0 > punpckldq mm1,mm3 ; L1:< 32c 0 32a 64a > movq mm4,HFiltLinePrev[edi] ; LP psrlq mm0,32 ; L0:< 0 0 32A 64A > xor ebx,ebx xor eax,eax mov bl,[esi+ebp*1+1] ; Get b. movq mm5,mm4 ; LP mov al,[esi+1] ; Get B. add esi,2 ; Increment input index. paddw mm1,Luma0020004000200000[ebx*8] ; L1:< 32b+32c 64b 32a+32b 64a > punpckldq mm0,mm2 ; L0:< 32C 0 32A 64A > paddw mm0,Luma0020004000200000[eax*8] ; L0:< 32B+32C 64B 32A+32B 64A > paddw mm5,mm5 ; 2LP movq HFiltLinePrev[edi],mm1 ; Save L1 as next iters LPrev. paddw mm4,mm0 ; LP+L0 movq FiltLine0[edi],mm5 ; Save 2LP paddw mm1,mm0 ; L0+L1 movq FiltLine1[edi],mm4 ; Save LP+L0 paddw mm0,mm0 ; 2L0 movq FiltLine3[edi],mm1 ; Save L0+L1 movq mm1,mm3 ; Next iters a. movq FiltLine2[edi],mm0 ; Save 2L0 movq mm0,mm2 ; Next iters A. add edi,40 ; Inc filtered luma temp stg idx. mov bl,[esi+ebp*1+2] ; Get c. cmp edi,MAXWIDTH*20-40 ; Done yet. mov al,[esi+2] ; Get C. jl Next2PelsOf2LumaLines
xor ebx,ebx xor ecx,ecx mov bl,[esi+ebp*1+1] ; Get c. cmp edi,MAXWIDTH*20 ; Done yet. mov al,[esi+1] ; Get C. jl Next2PelsOf2LumaLines
mov ebp,DistanceFromUToV mov esi,UCursor lea eax,FiltLine0 mov edx,StartIndexOfYLine
DoOutputLine:
mov edi,CCOCursor mov ecx,AspectCount dec ecx ; If count is non-zero, we keep the line. mov ebx,CCOPitch mov AspectCount,ecx je SkipOutputLine
add ebx,edi xor ecx,ecx mov cl,[esi] add eax,MAXWIDTH*20 movdt mm3,ShiftCountForB pcmpeqw mm6,mm6 movdt mm0,UContribToBandG[ecx*4] ; < 0 0 Bu Gu > mov cl,[esi+ebp*1] sub edx,MAXWIDTH*20 movdt mm4,ShiftCountForG psllw mm6,15 ; Four words of -32768 movdt mm5,ShiftCountForR punpcklwd mm0,mm0 ; < Bu Bu Gu Gu > movq mm7,SelectBBits mov CCOCursor,ebx jmp StartDoOutputLine
; ebp -- Distance from U to V ; esi -- Cursor over U ; edi -- Cursor over output ; edx -- Index over Y storage area ; eax -- Base address of Y line ; mm6 -- Four words of -32768, to clamp at floor. ; mm3, mm4, mm5 -- Shift counts to apply to R, G, and B.
DoNext4OutputPels:
movq [edi-8],mm2 ; Save 4 output pels. punpcklwd mm0,mm0 ; < Bu Bu Gu Gu >
StartDoOutputLine:
movdt mm2,VContribToRandG[ecx*4] ; < 0 0 Rv Gv > punpcklwd mm2,mm2 ; < Rv Rv Gv Gv > movq mm1,mm0 ; < junk junk Gu Gu > punpckhdq mm0,mm0 ; < Bu Bu Bu Bu > paddsw mm0,[eax+edx] ; < B B B B > with ceiling clamped. paddw mm1,mm2 ; < junk junk Guv Guv > paddsw mm0,DitherB ; B with dither added. punpckldq mm1,mm1 ; < Guv Guv Guv Guv > paddsw mm1,[eax+edx] ; < G G G G > with ceiling clamped. punpckhdq mm2,mm2 ; < Rv Rv Rv Rv > paddsw mm1,DitherG ; G with dither added. paddsw mm0,mm6 ; B with floor clamped. paddsw mm2,[eax+edx] ; < R R R R > with ceiling clamped. paddsw mm1,mm6 ; G with floor clamped. paddsw mm2,DitherR ; R with dither added. psrlw mm0,mm3 ; Position B bits. paddsw mm2,mm6 ; R with floor clamped. psrlw mm1,mm4 ; Position G bits. pand mm1,SelectGBits ; Eliminate fractional bits. psllw mm2,mm5 ; Position R bits. inc esi ; Advance input cursor xor ecx,ecx pand mm2,SelectRBits ; Eliminate fractional bits. pand mm0,mm7 mov cl,[esi] ; Fetch next U. add edi,8 ; Advance output cursor. por mm2,mm0 ; R and B combined. add edx,40 ; Increment Y index. movdt mm0,UContribToBandG[ecx*4] ; < 0 0 Bu Gv > next iter. por mm2,mm1 ; Completed RGB16 for 4 output pels. mov cl,[esi+ebp*1] ; Fetch next V. jne DoNext4OutputPels
movq [edi-8],mm2 ; Save 4 output pels.
movq mm0,DitherB ; Reverse dither patterns. movq mm1,DitherG psrlq mm0,16 movq mm2,DitherR psrlq mm1,16 psrlq mm2,16 punpckldq mm0,mm0 punpckldq mm1,mm1 movq DitherB,mm0 punpckldq mm2,mm2 movq DitherG,mm1 movq DitherR,mm2
PrepareForNextOutputLine:
mov edx,StartIndexOfYLine add eax,8-MAXWIDTH*20 ; Advance to next filtered line of Y. mov esi,UCursor test al,8 ; Jump if just did line 0 or 2. mov ebx,ChromaPitch jne DoOutputLine
add esi,ebx ; Advance to next chroma line. test al,16 ; Jump if about to do line 2. mov UCursor,esi jne DoOutputLine
sub esi,ebx ; Done with 4 lines. Restore UCursor. mov UCursor,esi jmp Next4OutputLines
SkipOutputLine: mov ecx,AspectAdjustmentCount add eax,MAXWIDTH*20 mov AspectCount,ecx jmp PrepareForNextOutputLine
Done:
mov esp,StashESP pop ebx pop ebp pop edi pop esi rturn
MMX_YUV12ToRGB16ZoomBy2 endp
END
|