Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

515 lines
15 KiB

;*************************************************************************
;** INTEL Corporation Proprietary Information
;**
;** This listing is supplied under the terms of a license
;** agreement with INTEL Corporation and may not be copied
;** nor disclosed except in accordance with the terms of
;** that agreement.
;**
;** Copyright (c) 1995 Intel Corporation.
;** All Rights Reserved.
;**
;*************************************************************************
;//
;//
;////////////////////////////////////////////////////////////////////////////
; yuv12enc -- This function performs "color conversion" in the H26X decoder for
; consumption by the H26X encoder. This entails reformatting the decoder's
; YVU data into the shape required by the encoder - including YUV order. It
; also reduces every pel to 7 bits (each byte is shifted right by one).
; $Header: S:\h26x\src\dec\yuv12enc.asv 1.5 30 Oct 1996 14:31:00 mbodart $
; $Log: S:\h26x\src\dec\yuv12enc.asv $
;//
;// Rev 1.5 30 Oct 1996 14:31:00 mbodart
;// Re-checking in changes originally made by Atul, but lost when the server
;// ran out of disk space during a PVCS operation. Atul's original log msg:
;//
;// Removed AGI in IA code. Added MMX code but it is not ready for prime-time.
;//
;// Rev 1.4 08 Mar 1996 15:11:10 AGUPTA2
;// Removed segment register override when compiling for WIN32.
;// Should speed-up this routine substantially.
;//
;
; ---------------------------------------------------------------------------
; Assembler set-up.
; No prologue is generated automatically (the procedures below push their
; own registers).  The epilogue is produced by ReturnAndRelieveEpilogueMacro,
; which is not defined in this file -- presumably it comes from one of the
; include files below (TODO confirm).
; ---------------------------------------------------------------------------
OPTION PROLOGUE:None
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
; Project include files.  Their contents are not visible here; the helper
; names used later in this file that are not defined locally (PD, Lebp,
; Lecx, Lesi, Ledi, Lebx, rturn, DIST, LANG) presumably come from them.
include locals.inc
include decconst.inc
include iammx.inc
; Non-WIN32 builds only: give the data segment a default name unless the
; build already defined DSEGNAME.
IFNDEF DSEGNAME
IFNDEF WIN32
DSEGNAME TEXTEQU <DataH26x_YUV12ForEnc>
ENDIF
ENDIF
; Open the data segment.  WIN32 pulls in memmodel.inc first (with listing
; output switched off around the include) and uses the simplified .DATA
; directive; other builds open the named segment explicitly.
IFDEF WIN32
.xlist
include memmodel.inc
.list
.DATA
ELSE
DSEGNAME SEGMENT WORD PUBLIC 'DATA'
ENDIF
; any data would go here
; Non-WIN32 builds close the explicit data segment and only then include
; memmodel.inc.
IFNDEF WIN32
DSEGNAME ENDS
.xlist
include memmodel.inc
.list
ENDIF
; Non-WIN32 builds only: default name for the code segment unless the build
; already defined SEGNAME.
IFNDEF SEGNAME
IFNDEF WIN32
SEGNAME TEXTEQU <_CODE32>
ENDIF
ENDIF
; Open the code segment: simplified .CODE for WIN32, otherwise an explicit
; paragraph-aligned USE32 segment.
ifdef WIN32
.CODE
else
SEGNAME SEGMENT PARA PUBLIC USE32 'CODE'
endif
; Segment register assumptions.  WIN32: every segment register is assumed
; FLAT, so no segment overrides are emitted.  Otherwise only CS is tied to
; the code segment and nothing is assumed about DS/ES/FS/GS.
ifdef WIN32
ASSUME cs : FLAT
ASSUME ds : FLAT
ASSUME es : FLAT
ASSUME fs : FLAT
ASSUME gs : FLAT
ASSUME ss : FLAT
else
ASSUME CS : SEGNAME
ASSUME DS : Nothing
ASSUME ES : Nothing
ASSUME FS : Nothing
ASSUME GS : Nothing
endif
; void FAR ASM_CALLTYPE H26x_YUV12ForEnc (
; U8 FAR * InstanceBase,
; X32 YPlane,
; X32 VPlane,
; X32 UPlane,
; UN FrameWidth,
; UN FrameHeight,
; UN Pitch,
; U8 FAR * ColorConvertedFrame, // encoder's buffers.
; X32 YOutputPlane,
; X32 VOutputPlane,
; X32 UOutputPlane)
;
; YPlane, VPlane, and UPlane are offsets from InstanceBase; YOutputPlane,
; VOutputPlane, and UOutputPlane are offsets from ColorConvertedFrame. In
; 16-bit Microsoft Windows (tm), space in this segment is used for local
; variables and tables. In 32-bit variants of Microsoft Windows (tm), the
; local variables are on the stack, while the tables are in the one and only
; data segment.
;
PUBLIC H26x_YUV12ForEnc
;----------------------------------------------------------------------------
; H26x_YUV12ForEnc -- integer (non-MMX) version.
;
; Copies the Y, V and U planes from the decoder's frame (InstanceBase + plane
; offset) into the encoder's buffer (ColorConvertedFrame + output offset),
; shifting every byte right by one bit on the way so that each pel is 7 bits.
;
;   Y plane      : FrameWidth   bytes per row, FrameHeight   rows
;   V, U planes  : FrameWidth/2 bytes per row, FrameHeight/2 rows
;
; Source and destination rows both advance by Pitch, for the luma plane and
; for the chroma planes alike.
;
; Stack frame: four registers are pushed (RegisterStorageSize = 16) and there
; are no locals (LocalFrameSize = 0), so the arguments are reached through
; the esp-relative offsets defined below.  The six plane-offset argument
; slots are overwritten in place with absolute addresses.
;
; Registers: esi, edi, ebp and ebx are saved and restored; eax, ecx and edx
; are modified and not restored.
;
; Things the code relies on but does not check:
;   * The Y loop handles 16 bytes per pass and the chroma loops 8 bytes per
;     pass with no tail handling, so a FrameWidth that is not a positive
;     multiple of 16 makes the last pass touch bytes to the left of the row
;     start.  NOTE(review): confirm callers always pass such widths.
;   * Row counters are tested with dec/jne after a row has been processed, so
;     a Y height of 0 (or a chroma height of 0 after halving) is not guarded.
;   * The preload at the end of every row pass reads (never writes) the
;     8 bytes immediately before the start of the source row.
;
; Lebp / Lecx / Lesi / Ledi / Lebx are macros defined outside this file;
; presumably "Lreg name" loads reg from [LCL name], i.e. [esp+name]
; (TODO confirm against locals.inc).
;----------------------------------------------------------------------------
; due to the need for the ebp reg, these parameter declarations aren't used,
; they are here so the assembler knows how many bytes to relieve from the stack
H26x_YUV12ForEnc proc DIST LANG PUBLIC,
AInstanceBase: DWORD,
AYPlane: DWORD,
AVPlane: DWORD,
AUPlane: DWORD,
AFrameWidth: DWORD,
AFrameHeight: DWORD,
APitch: DWORD,
AColorConvertedFrame: DWORD,
AYOutputPlane: DWORD,
AVOutputPLane: DWORD,
AUOutputPLane: DWORD
; No stack locals; four saved registers of 4 bytes each.
LocalFrameSize = 0
RegisterStorageSize = 16
; Arguments:
; esp-relative offsets, valid after the four pushes and the local-frame
; adjustment below (+4 skips the return address).
InstanceBase = LocalFrameSize + RegisterStorageSize + 4
YPlane = LocalFrameSize + RegisterStorageSize + 8
VPlane = LocalFrameSize + RegisterStorageSize + 12
UPlane = LocalFrameSize + RegisterStorageSize + 16
FrameWidth = LocalFrameSize + RegisterStorageSize + 20
FrameHeight = LocalFrameSize + RegisterStorageSize + 24
Pitch = LocalFrameSize + RegisterStorageSize + 28
ColorConvertedFrame = LocalFrameSize + RegisterStorageSize + 32
YOutputPlane = LocalFrameSize + RegisterStorageSize + 36
VOutputPlane = LocalFrameSize + RegisterStorageSize + 40
UOutputPlane = LocalFrameSize + RegisterStorageSize + 44
EndOfArgList = LocalFrameSize + RegisterStorageSize + 48
; Text prefix for esp-relative operands.
LCL EQU <esp+>
; Hand-written prologue: save the registers this routine restores on exit.
push esi
push edi
push ebp
push ebx
sub esp,LocalFrameSize
; Turn the three input plane offsets into absolute addresses, in place in
; the argument slots.
mov eax,PD [esp+InstanceBase]
add PD [esp+YPlane],eax
add PD [esp+VPlane],eax
add PD [esp+UPlane],eax
; Same for the three output plane offsets, relative to ColorConvertedFrame.
mov eax,PD [esp+ColorConvertedFrame]
add PD [esp+YOutputPlane],eax
add PD [esp+VOutputPlane],eax
add PD [esp+UOutputPlane],eax
; ---- Y plane ---------------------------------------------------------------
; We copy 16 pels in one iteration of the inner loop
; Each row is walked from its right-hand end towards its start.
; Register usage:
; edi -- Y plane output cursor
; esi -- Y plane input cursor
; ebp -- Count down Y plane height
; ecx -- Count down Y plane width
; ebx -- Y plane input pitch
; eax,edx -- scratch
Lebp FrameHeight
Lecx FrameWidth
Lesi YPlane
Lebx Pitch
Ledi YOutputPlane
YLoopHeader:
; Prime the pipeline with the right-most 8 pels of the row.
mov eax, PD [esi+ecx-8] ; pels ecx-8 .. ecx-5
mov edx, PD [esi+ecx-4]
ALIGN 4
YLoop:
; Two equivalent ways of halving four packed bytes are used, one per
; register: eax is shifted and then has bit 7 of each byte cleared; edx has
; bit 0 of each byte cleared and is then shifted.
shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
and edx, 0FEFEFEFEH ; clear bit 0 of each byte so the shift cannot carry it into the neighbouring byte
shr edx, 1
and eax, 07F7F7F7Fh ; clear bit 7 of each byte (it now holds the neighbouring byte's old bit 0)
mov PD [edi+ecx-8], eax
mov PD [edi+ecx-4], edx
; NEXT 8 PELS
mov eax, PD [esi+ecx-8-8] ; load the second 8 pels of this 16-pel group
mov edx, PD [esi+ecx-4-8] ; this avoids AGI
shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
and edx, 0FEFEFEFEH ; clear bit 0 of each byte before the shift
shr edx, 1
and eax, 07F7F7F7Fh ; clear bit 7 of each byte after the shift
mov PD [edi+ecx-8-8], eax
mov PD [edi+ecx-4-8], edx
mov eax, PD [esi+ecx-8-16] ; speculatively load next 8 pels
mov edx, PD [esi+ecx-4-16] ; for next iteration
; On the last pass of a row the two loads above fetch the 8 bytes in front of
; the row; the values are discarded because YLoopHeader reloads eax/edx.
sub ecx, 16
jg YLoop
; Row finished: reload the width and step both cursors down one row.
Lecx FrameWidth
add esi, ebx
add edi, ebx
dec ebp
jne YLoopHeader
; ---- V plane ---------------------------------------------------------------
; We copy 8 pels in one iteration of the inner loop
; Register usage:
; edi -- V plane output cursor
; esi -- V plane input cursor
; ebp -- Count down V plane height
; ecx -- Count down V plane width
; ebx -- Pitch
; eax,edx -- scratch
; ebx still holds Pitch from the Y plane set-up.  Width and height are
; halved for the chroma plane.
Lebp FrameHeight
Lecx FrameWidth
sar ecx,1
Lesi VPlane
sar ebp,1
Ledi VOutputPlane
ALIGN 4
VLoopHeader:
; Prime the pipeline with the right-most 8 pels of the row.
mov eax, PD [esi+ecx-8]
mov edx, PD [esi+ecx-4]
VLoop:
shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
and edx, 0FEFEFEFEH ; clear bit 0 of each byte before the shift
shr edx, 1
and eax, 07F7F7F7Fh ; clear bit 7 of each byte after the shift
mov PD [edi+ecx-8], eax
mov PD [edi+ecx-4], edx
mov eax, PD [esi+ecx-8-8] ; speculatively load next 8 pels
mov edx, PD [esi+ecx-4-8] ; this avoids AGI
sub ecx, 8
jg VLoop
; Row finished: reload the halved width and step both cursors by a full Pitch.
Lecx FrameWidth
add esi,ebx
shr ecx,1
add edi,ebx
dec ebp
jne VLoopHeader
; ---- U plane ---------------------------------------------------------------
; We copy 8 pels in one iteration of the inner loop
; Register usage:
; edi -- U plane output cursor
; esi -- U plane input cursor
; ebp -- Count down U plane height
; ecx -- Count down U plane width
; ebx -- Pitch
; eax,edx -- scratch
; Same procedure as for the V plane, on the U plane pointers.
Lebp FrameHeight
Lecx FrameWidth
sar ecx,1
Lesi UPlane
sar ebp,1
Ledi UOutputPlane
ALIGN 4
ULoopHeader:
; Prime the pipeline with the right-most 8 pels of the row.
mov eax,PD [esi+ecx-8]
mov edx,PD [esi+ecx-4]
ULoop:
shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
and edx, 0FEFEFEFEH ; clear bit 0 of each byte before the shift
shr edx, 1
and eax, 07F7F7F7Fh ; clear bit 7 of each byte after the shift
mov PD [edi+ecx-8], eax
mov PD [edi+ecx-4], edx
mov eax, PD [esi+ecx-8-8] ; speculatively load next 8 pels
mov edx, PD [esi+ecx-4-8]
sub ecx, 8
jg ULoop
; Row finished: reload the halved width and step both cursors by a full Pitch.
Lecx FrameWidth
add esi, ebx
shr ecx, 1
add edi, ebx
dec ebp
jne ULoopHeader
; Hand-written epilogue: undo the prologue in reverse order.
add esp,LocalFrameSize
pop ebx
pop ebp
pop edi
pop esi
; rturn is defined outside this file; presumably it emits the return through
; the epilogue macro selected by OPTION EPILOGUE (TODO confirm).
rturn
H26x_YUV12ForEnc endp
IFDEF H263P
; 64-bit byte masks loaded into MMX registers by MMX_H26x_YUV12ForEnc.
; Each label names 8 bytes (two identical dwords), aligned on 8 bytes.
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
        ALIGN 8
; 0FEh in every byte: clears bit 0 of each packed pel.
CLEAR_LOW_BIT_MASK  LABEL DWORD
        DWORD 2 DUP (0FEFEFEFEH)
; 07Fh in every byte: clears bit 7 of each packed pel.
CLEAR_HIGH_BIT_MASK LABEL DWORD
        DWORD 2 DUP (07F7F7F7FH)
MMXDATA1 ENDS
PUBLIC MMX_H26x_YUV12ForEnc
;----------------------------------------------------------------------------
; MMX_H26x_YUV12ForEnc -- MMX variant of H26x_YUV12ForEnc (same arguments).
;
; Copies the Y, V and U planes from the decoder's frame (InstanceBase + plane
; offset) into the encoder's buffer (ColorConvertedFrame + output offset),
; shifting every byte right by one bit so that each pel is 7 bits.
;
;   Y plane      : MMX, 16 pels of two adjacent rows per inner-loop pass
;   V, U planes  : integer code, 8 pels per inner-loop pass,
;                  FrameWidth/2 bytes per row, FrameHeight/2 rows
;
; Source and destination rows both advance by Pitch, for luma and chroma.
;
; Stack frame: four registers are pushed (RegisterStorageSize = 16) and there
; are no locals, so the arguments are reached through the esp-relative
; offsets defined below.  The six plane-offset argument slots are overwritten
; in place with absolute addresses.
;
; Registers: esi, edi, ebp and ebx are saved and restored; eax, ecx, edx and
; mm0-mm3, mm6, mm7 are modified.  The MMX state is emptied (emms) before
; the routine returns.
;
; Things the code relies on but does not check:
;   * No tail handling in any loop: FrameWidth is expected to be a positive
;     multiple of 16.  NOTE(review): confirm with callers.
;   * The Y loop processes FrameHeight/2 row pairs, so an odd FrameHeight
;     leaves the last row unconverted, and a FrameHeight below 2 is not
;     guarded (the row counter is tested with dec/jne after a pass).
;   * The preloads at the end of every row pass read (never write) up to
;     16 bytes immediately before the start of the source row.
;
; Lebp / Lecx / Lesi / Ledi / Lebx are macros defined outside this file;
; presumably "Lreg name" loads reg from [LCL name], i.e. [esp+name]
; (TODO confirm against locals.inc).
;----------------------------------------------------------------------------
; due to the need for the ebp reg, these parameter declarations aren't used,
; they are here so the assembler knows how many bytes to relieve from the stack
MMX_H26x_YUV12ForEnc proc DIST LANG PUBLIC,
AInstanceBase: DWORD,
AYPlane: DWORD,
AVPlane: DWORD,
AUPlane: DWORD,
AFrameWidth: DWORD,
AFrameHeight: DWORD,
APitch: DWORD,
AColorConvertedFrame: DWORD,
AYOutputPlane: DWORD,
AVOutputPLane: DWORD,
AUOutputPLane: DWORD
; No stack locals; four saved registers of 4 bytes each.
LocalFrameSize = 0
RegisterStorageSize = 16
; Arguments:
; esp-relative offsets, valid after the four pushes and the local-frame
; adjustment below (+4 skips the return address).
InstanceBase = LocalFrameSize + RegisterStorageSize + 4
YPlane = LocalFrameSize + RegisterStorageSize + 8
VPlane = LocalFrameSize + RegisterStorageSize + 12
UPlane = LocalFrameSize + RegisterStorageSize + 16
FrameWidth = LocalFrameSize + RegisterStorageSize + 20
FrameHeight = LocalFrameSize + RegisterStorageSize + 24
Pitch = LocalFrameSize + RegisterStorageSize + 28
ColorConvertedFrame = LocalFrameSize + RegisterStorageSize + 32
YOutputPlane = LocalFrameSize + RegisterStorageSize + 36
VOutputPlane = LocalFrameSize + RegisterStorageSize + 40
UOutputPlane = LocalFrameSize + RegisterStorageSize + 44
EndOfArgList = LocalFrameSize + RegisterStorageSize + 48
; Text prefix for esp-relative operands.
LCL EQU <esp+>
; Registers that hold the two byte masks for the whole Y loop.
CLEAR_LOW_BIT EQU mm6
CLEAR_HIGH_BIT EQU mm7
; Hand-written prologue: save the registers this routine restores on exit.
        push    esi
        push    edi
        push    ebp
        push    ebx
        sub     esp,LocalFrameSize
; Turn the three input plane offsets into absolute addresses, in place in
; the argument slots.
        mov     eax,PD [esp+InstanceBase]
        add     PD [esp+YPlane],eax
        add     PD [esp+VPlane],eax
        add     PD [esp+UPlane],eax
; Same for the three output plane offsets, relative to ColorConvertedFrame.
        mov     eax,PD [esp+ColorConvertedFrame]
        add     PD [esp+YOutputPlane],eax
        add     PD [esp+VOutputPlane],eax
        add     PD [esp+UOutputPlane],eax
; ---- Y plane (MMX) ---------------------------------------------------------
; We copy 16 pels of two lines in one iteration of the inner loop
; Each row is walked from its right-hand end towards its start.
; Register usage:
; edi -- Y plane output cursor (line 0)
; edx -- Y plane output cursor (line 1)
; esi -- Y plane input cursor (line 0)
; eax -- Y plane input cursor (line 1)
; ebp -- Count down Y plane height / 2
; ecx -- Count down Y plane width
; ebx -- Y plane input pitch
; mm6 -- 0FEh in every byte (CLEAR_LOW_BIT)
; mm7 -- 07Fh in every byte (CLEAR_HIGH_BIT)
        Lebp    FrameHeight
        Lebx    Pitch
        Lesi    YPlane
        Lecx    FrameWidth
        Ledi    YOutputPlane
        lea     eax, [esi + ebx]        ; line 1 of input
        movq    mm6, CLEAR_LOW_BIT_MASK
        lea     edx, [edi + ebx]        ; line 1 of output
        movq    mm7, CLEAR_HIGH_BIT_MASK
        shr     ebp, 1                  ; two lines in one iteration
; Halving eight packed bytes with a 64-bit shift lets bit 0 of each byte
; leak into bit 7 of the byte below it.  Two equivalent remedies are used:
;   shift first, then clear bit 7 of every byte (CLEAR_HIGH_BIT), or
;   clear bit 0 of every byte first (CLEAR_LOW_BIT), then shift.
; The mask must match the order; tags 00/01 are the two quadwords of line 0,
; tags 10/11 the two quadwords of line 1.
YLoopHeader:
; Prime the pipeline with the right-most 16 pels of line 0.
        movq    mm0, [esi+ecx-16]       ;00
        movq    mm1, [esi+ecx-8]        ;01
        psrlq   mm0, 1                  ;00 Shift packed pel by 1 to convert to 7-bit
YLoop:
        movq    mm2, [eax+ecx-16]       ;10
        pand    mm0, CLEAR_HIGH_BIT     ;00 already shifted: clear leaked bit 7
        movq    mm3, [eax+ecx-8]        ;11
        psrlq   mm1, 1                  ;01 shift first ...
        movq    [edi+ecx-16], mm0       ;00
; FIX: mm1 has already been shifted, so the leaked bit sits in bit 7 of each
; byte and has to be removed with CLEAR_HIGH_BIT.  The previous CLEAR_LOW_BIT
; mask left that stray bit in place and discarded a valid bit 0 instead.
        pand    mm1, CLEAR_HIGH_BIT     ;01 ... then clear leaked bit 7
        movq    mm0, [esi+ecx-16-16]    ; speculatively load next 8 pels
        psrlq   mm2, 1                  ;10 Shift packed pel by 1 to convert to 7-bit
        movq    [edi+ecx-8 ], mm1       ;01
        pand    mm2, CLEAR_HIGH_BIT     ;10 already shifted: clear leaked bit 7
        movq    mm1, [esi+ecx-8 -16]    ; for next iteration
        pand    mm3, CLEAR_LOW_BIT      ;11 clear bit 0 first ...
        movq    [edx+ecx-16], mm2       ;10
        psrlq   mm3, 1                  ;11 ... then shift
        psrlq   mm0, 1                  ;00 Shift packed pel by 1 to convert to 7-bit
        movq    [edx+ecx-8 ], mm3       ;11
; On the last pass of a row pair the mm0/mm1 preloads fetch bytes in front of
; the row; they are discarded because YLoopHeader reloads both registers.
        sub     ecx, 16
        jg      YLoop
; Row pair finished: reload the width and move all four cursors down two rows.
        Lecx    FrameWidth
        lea     esi, [esi + 2*ebx]
        lea     edi, [edi + 2*ebx]
        lea     eax, [eax + 2*ebx]      ; line 1 of input
        lea     edx, [edx + 2*ebx]      ; line 1 of output
        dec     ebp
        jne     YLoopHeader
; The rest of the routine is integer-only.  Empty the MMX state here so that
; x87 floating-point code executed after this routine returns finds a clean
; tag word.
        emms
; ---- V plane (integer) -----------------------------------------------------
; We copy 8 pels in one iteration of the inner loop
; Register usage:
; edi -- V plane output cursor
; esi -- V plane input cursor
; ebp -- Count down V plane height
; ecx -- Count down V plane width
; ebx -- Pitch
; eax,edx -- scratch
; ebx still holds Pitch from the Y plane set-up.  Width and height are
; halved for the chroma plane.
        Lebp    FrameHeight
        Lecx    FrameWidth
        sar     ecx,1
        Lesi    VPlane
        sar     ebp,1
        Ledi    VOutputPlane
        ALIGN 4
VLoopHeader:
; Prime the pipeline with the right-most 8 pels of the row.
        mov     eax, PD [esi+ecx-8]
        mov     edx, PD [esi+ecx-4]
VLoop:
        shr     eax, 1                  ; Shift packed pel by 1 to convert to 7-bit
        and     edx, 0FEFEFEFEH         ; clear bit 0 of each byte before the shift
        shr     edx, 1
        and     eax, 07F7F7F7Fh         ; clear bit 7 of each byte after the shift
        mov     PD [edi+ecx-8], eax
        mov     PD [edi+ecx-4], edx
        mov     eax, PD [esi+ecx-8-8]   ; speculatively load next 8 pels
        mov     edx, PD [esi+ecx-4-8]   ; this avoids AGI
        sub     ecx, 8
        jg      VLoop
; Row finished: reload the halved width and step both cursors by a full Pitch.
        Lecx    FrameWidth
        add     esi,ebx
        shr     ecx,1
        add     edi,ebx
        dec     ebp
        jne     VLoopHeader
; ---- U plane (integer) -----------------------------------------------------
; We copy 8 pels in one iteration of the inner loop
; Register usage:
; edi -- U plane output cursor
; esi -- U plane input cursor
; ebp -- Count down U plane height
; ecx -- Count down U plane width
; ebx -- Pitch
; eax,edx -- scratch
; Same procedure as for the V plane, on the U plane pointers.
        Lebp    FrameHeight
        Lecx    FrameWidth
        sar     ecx,1
        Lesi    UPlane
        sar     ebp,1
        Ledi    UOutputPlane
        ALIGN 4
ULoopHeader:
; Prime the pipeline with the right-most 8 pels of the row.
        mov     eax,PD [esi+ecx-8]
        mov     edx,PD [esi+ecx-4]
ULoop:
        shr     eax, 1                  ; Shift packed pel by 1 to convert to 7-bit
        and     edx, 0FEFEFEFEH         ; clear bit 0 of each byte before the shift
        shr     edx, 1
        and     eax, 07F7F7F7Fh         ; clear bit 7 of each byte after the shift
        mov     PD [edi+ecx-8], eax
        mov     PD [edi+ecx-4], edx
        mov     eax, PD [esi+ecx-8-8]   ; speculatively load next 8 pels
        mov     edx, PD [esi+ecx-4-8]
        sub     ecx, 8
        jg      ULoop
; Row finished: reload the halved width and step both cursors by a full Pitch.
        Lecx    FrameWidth
        add     esi, ebx
        shr     ecx, 1
        add     edi, ebx
        dec     ebp
        jne     ULoopHeader
; Hand-written epilogue: undo the prologue in reverse order.
        add     esp,LocalFrameSize
        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
; rturn is defined outside this file; presumably it emits the return through
; the epilogue macro selected by OPTION EPILOGUE (TODO confirm).
        rturn
MMX_H26x_YUV12ForEnc endp
ENDIF ;H263P
; Non-WIN32 builds opened the code segment explicitly with SEGNAME SEGMENT,
; so it is closed explicitly here; WIN32 builds used .CODE and need no ENDS.
IFNDEF WIN32
SEGNAME ENDS
ENDIF
; End of the assembly module.
END