Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

686 lines
31 KiB

;--------------------------------------------------------------------------
; INTEL Corporation Proprietary Information
;
; This listing is supplied under the terms of a license
; agreement with INTEL Corporation and may not be copied
; nor disclosed except in accordance with the terms of
; that agreement.
;
; Copyright (c) 1996 Intel Corporation.
; All Rights Reserved.
;
;--------------------------------------------------------------------------
;--------------------------------------------------------------------------
;
; $Author: SCDAY $
; $Date: 31 Oct 1996 09:00:56 $
; $Archive: S:\h26x\src\dec\d3mbkadd.asv $
; $Header: S:\h26x\src\dec\d3mbkadd.asv 1.8 31 Oct 1996 09:00:56 SCDAY $
; $Log: S:\h26x\src\dec\d3mbkadd.asv $
;//
;// Rev 1.8 31 Oct 1996 09:00:56 SCDAY
;// Raj added IFDEF H261 MMX_BlockAddSpecial and MMX_BlockCopySpecial
;//
;// Rev 1.7 09 Jul 1996 16:50:42 AGUPTA2
;// DC value for INTRA blocks is added back in ClipAndMove routine.
;// Cleaned-up code.
;//
;// Rev 1.6 04 Apr 1996 13:42:58 AGUPTA2
;// Removed a store stall from MMX_BlockAdd
;//
;// Rev 1.5 03 Apr 1996 17:42:30 AGUPTA2
;// Added MMX version of BlockCopy routine.
;//
;// Rev 1.4 03 Apr 1996 11:08:22 RMCKENZX
;// Added clearing of IDCT output. Cleaned comments.
;//
;// Rev 1.3 22 Mar 1996 15:43:30 AGUPTA2
;// Fixed fastcall bug: return from rtns with more than 2 params.
;//
;// Rev 1.2 14 Mar 1996 17:15:14 AGUPTA2
;//
;// Included Bob's MMX_ClipAndMove rtn. This rtn works on INTRA output.
;//
;// Rev 1.1 27 Feb 1996 16:48:52 RMCKENZX
;// Added rounding of IDCT output.
;
;--------------------------------------------------------------------------
;==========================================================================
;
; d3mbkadd.asm
;
; Routines:
; MMX_BlockAdd
; MMX_BlockAddSpecial (H261 builds only)
; MMX_ClipAndMove
; MMX_BlockCopy
; MMX_BlockCopySpecial (H261 builds only)
;
; Prototypes in d3mblk.h:
; extern "C" {
; void __fastcall MMX_BlockAdd(
; U32 uResidual, // pointer to IDCT output
; U32 uRefBlock, // pointer to predicted values
; U32 uDstBlock); // pointer to destination
;
; void __fastcall MMX_ClipAndMove(
; U32 uResidual, // pointer to IDCT output
; U32 uDstBlock, // pointer to destination
; U32 ScaledDC); // scaled DC
; }
;
;==========================================================================
;--------------------------------------------------------------------------
;
; MMX_BlockAdd
;
; Description:
; This routine performs block addition of the IDCT output with the
; predicted value to find the final value. The IDCT values are converted
; to integers then added to the prediction. The result of the addition is
; then clipped to 0...255. The routine is called with the __fastcall option,
; with the first two parameters in ecx and edx and the third on the stack.
;
; The routine clears the IDCT output after reading it.
; Parameters:
; ecx = uSrc1 pointer to IDCT output. Values are signed, 16 bit values with
; 6 fractional bits. They are not clipped to -256 ... +255.
; They are packed into a qword aligned 8x8 array of words.
;
; edx = uSrc2 pointer to prediction values. Values are unsigned, 8-bit
; values. They are read as 8 rows of 8 bytes (possibly unaligned), with
; a PITCH of 384 in between rows.
; esp+4 = uDst pointer to output values. Values will be unsigned, 8-bit
; values. They will be written into a qword aligned 8x8 array
; of bytes with a PITCH of 384 in between rows.
;
;--------------------------------------------------------------------------
; Assembler setup: Pentium instruction set, 32-bit flat memory model.
.586
.MODEL FLAT
; Keep identifiers exactly as written (the procedure names in this file are
; mixed-case __fastcall decorated names such as @MMX_BlockAdd@12).
OPTION CASEMAP:NONE
; No assembler-generated entry/exit code: every routine in this file reads its
; own arguments and issues its own RET.
OPTION PROLOGUE:None
OPTION EPILOGUE:None
; Include iammx.inc with listing output switched off around it.
; NOTE(review): presumably this supplies the MMX mnemonics (movq, paddw, ...)
; used below -- confirm against the include file.
.xlist
include iammx.inc
.list
; Declare both segments once with their full attributes so that they can be
; reopened by name alone further down.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS
MMXDATA1 SEGMENT
; Qword-align the constant; it is read with a single 8-byte movq/paddw.
ALIGN 8
; Four packed words of 0020h (32). The routines add this to the IDCT output
; before the arithmetic right shift by 6, i.e. it is one half in the
; 6-fraction-bit fixed-point format and makes the shift round to nearest.
MMX_Round32 DWORD 000200020H, 000200020H
MMXDATA1 ENDS
; Reopen the code segment; it stays open until the end of the file.
MMXCODE1 SEGMENT
ALIGN 4
@MMX_BlockAdd@12 PROC
;
; dst[row] = clip_0_255( pred[row] + ((resid[row] + 32) >> 6) ), rows 0..7,
; and the residual buffer is zeroed as it is consumed.
;
; Register roles
;   pSrc1 (ecx)  residual: 8 rows of 8 signed words, 16 bytes per row
;   pSrc2 (edx)  prediction: 8 rows of 8 bytes, PITCH bytes per row
;   pDst  (eax)  destination: loaded from the stack argument at [esp+4],
;                8 rows of 8 bytes, PITCH bytes per row
;   mm6          rounding constant (MMX_Round32)
;   mm7          zero (unpack source and value stored to clear the residual)
;   mm0..mm3     working registers
;
; The stack argument is released by the closing "ret 4".
;
pSrc1 EQU ecx
pSrc2 EQU edx
pDst EQU eax
PITCH EQU 384
;
; The code is software pipelined across two rows: while the sum for one row
; is being unpacked, added, packed and stored, the residual and prediction
; for the following row are already being loaded, cleared, rounded and
; shifted. The steady state is identical for rows 1..7 and is generated
; by the FOR block below; the instruction order inside it is the same as in
; a hand-unrolled version.
;
; ---- prime the pipeline: load, clear, round and shift row 0 ----
;
        movq      mm6, [MMX_Round32]            ; rounding for IDCT output
        movq      mm3, [pSrc1 + 8]              ; row 0 residual, words 4..7
        pxor      mm7, mm7                      ; mm7 = 0
        movq      mm1, [pSrc1]                  ; row 0 residual, words 0..3
        movq      [pSrc1 + 8], mm7              ; clear residual words 4..7
        paddw     mm3, mm6                      ; + rounding
        movq      [pSrc1], mm7                  ; clear residual words 0..3
        paddw     mm1, mm6                      ; + rounding
        mov       pDst, [esp+4]                 ; third argument: destination
        psraw     mm3, 6                        ; drop the 6 fraction bits
        movq      mm2, [pSrc2]                  ; row 0 prediction, 8 bytes
        psraw     mm1, 6                        ; drop the 6 fraction bits
;
; ---- steady state: finish row (row-1) while starting row ----
;
        FOR row, <1,2,3,4,5,6,7>
        movq      mm0, mm2                      ;; prev: copy of prediction
        punpckhbw mm2, mm7                      ;; prev: bytes 4..7 to words
        paddw     mm2, mm3                      ;; prev: add residual 4..7
        punpcklbw mm0, mm7                      ;; prev: bytes 0..3 to words
        movq      mm3, [pSrc1 + row*16 + 8]     ;; next: residual words 4..7
        paddw     mm0, mm1                      ;; prev: add residual 0..3
        movq      mm1, [pSrc1 + row*16]         ;; next: residual words 0..3
        packuswb  mm0, mm2                      ;; prev: saturate to 8 bytes
        movq      [pSrc1 + row*16 + 8], mm7     ;; next: clear residual 4..7
        paddw     mm3, mm6                      ;; next: + rounding
        movq      [pSrc1 + row*16], mm7         ;; next: clear residual 0..3
        paddw     mm1, mm6                      ;; next: + rounding
        movq      mm2, [pSrc2 + row*PITCH]      ;; next: prediction, 8 bytes
        psraw     mm3, 6                        ;; next: drop fraction bits
        movq      [pDst + (row-1)*PITCH], mm0   ;; prev: store finished row
        psraw     mm1, 6                        ;; next: drop fraction bits
        ENDM
;
; ---- drain the pipeline: finish row 7 ----
;
        movq      mm0, mm2                      ; copy of prediction
        punpckhbw mm2, mm7                      ; bytes 4..7 to words
        paddw     mm2, mm3                      ; add residual 4..7
        punpcklbw mm0, mm7                      ; bytes 0..3 to words
        paddw     mm0, mm1                      ; add residual 0..3
        packuswb  mm0, mm2                      ; saturate to 8 bytes
        movq      [pDst + 7*PITCH], mm0         ; store row 7
        ret       4
@MMX_BlockAdd@12 ENDP
IFDEF H261
ALIGN 4
@MMX_BlockAddSpecial@12 PROC
;
; Same operation as MMX_BlockAdd, except that the prediction rows are
; 8 bytes apart (a packed 8x8 byte block) instead of PITCH bytes apart:
;
; dst[row] = clip_0_255( pred[row] + ((resid[row] + 32) >> 6) ), rows 0..7,
; and the residual buffer is zeroed as it is consumed.
;
; Register roles
;   pSrc1 (ecx)  residual: 8 rows of 8 signed words, 16 bytes per row
;   pSrc2 (edx)  prediction: 8 rows of 8 bytes, 8 bytes per row
;   pDst  (eax)  destination: loaded from the stack argument at [esp+4],
;                8 rows of 8 bytes, PITCH bytes per row
;   mm6          rounding constant (MMX_Round32)
;   mm7          zero (unpack source and value stored to clear the residual)
;   mm0..mm3     working registers
;
; The stack argument is released by the closing "ret 4".
;
pSrc1 EQU ecx
pSrc2 EQU edx
pDst EQU eax
PITCH EQU 384
;
; The code is software pipelined across two rows: while the sum for one row
; is being unpacked, added, packed and stored, the residual and prediction
; for the following row are already being loaded, cleared, rounded and
; shifted. The steady state is identical for rows 1..7 and is generated
; by the FOR block below; the instruction order inside it is the same as in
; a hand-unrolled version.
;
; ---- prime the pipeline: load, clear, round and shift row 0 ----
;
        movq      mm6, [MMX_Round32]            ; rounding for IDCT output
        movq      mm3, [pSrc1 + 8]              ; row 0 residual, words 4..7
        pxor      mm7, mm7                      ; mm7 = 0
        movq      mm1, [pSrc1]                  ; row 0 residual, words 0..3
        movq      [pSrc1 + 8], mm7              ; clear residual words 4..7
        paddw     mm3, mm6                      ; + rounding
        movq      [pSrc1], mm7                  ; clear residual words 0..3
        paddw     mm1, mm6                      ; + rounding
        mov       pDst, [esp+4]                 ; third argument: destination
        psraw     mm3, 6                        ; drop the 6 fraction bits
        movq      mm2, [pSrc2]                  ; row 0 prediction, 8 bytes
        psraw     mm1, 6                        ; drop the 6 fraction bits
;
; ---- steady state: finish row (row-1) while starting row ----
;
        FOR row, <1,2,3,4,5,6,7>
        movq      mm0, mm2                      ;; prev: copy of prediction
        punpckhbw mm2, mm7                      ;; prev: bytes 4..7 to words
        paddw     mm2, mm3                      ;; prev: add residual 4..7
        punpcklbw mm0, mm7                      ;; prev: bytes 0..3 to words
        movq      mm3, [pSrc1 + row*16 + 8]     ;; next: residual words 4..7
        paddw     mm0, mm1                      ;; prev: add residual 0..3
        movq      mm1, [pSrc1 + row*16]         ;; next: residual words 0..3
        packuswb  mm0, mm2                      ;; prev: saturate to 8 bytes
        movq      [pSrc1 + row*16 + 8], mm7     ;; next: clear residual 4..7
        paddw     mm3, mm6                      ;; next: + rounding
        movq      [pSrc1 + row*16], mm7         ;; next: clear residual 0..3
        paddw     mm1, mm6                      ;; next: + rounding
        movq      mm2, [pSrc2 + row*8]          ;; next: prediction, 8 bytes
        psraw     mm3, 6                        ;; next: drop fraction bits
        movq      [pDst + (row-1)*PITCH], mm0   ;; prev: store finished row
        psraw     mm1, 6                        ;; next: drop fraction bits
        ENDM
;
; ---- drain the pipeline: finish row 7 ----
;
        movq      mm0, mm2                      ; copy of prediction
        punpckhbw mm2, mm7                      ; bytes 4..7 to words
        paddw     mm2, mm3                      ; add residual 4..7
        punpcklbw mm0, mm7                      ; bytes 0..3 to words
        paddw     mm0, mm1                      ; add residual 0..3
        packuswb  mm0, mm2                      ; saturate to 8 bytes
        movq      [pDst + 7*PITCH], mm0         ; store row 7
        ret       4
@MMX_BlockAddSpecial@12 ENDP
ENDIF
;----------------------------------------------------------------------------
;
; MMX_ClipAndMove
;
; Description:
; This routine takes the MMx IDCT output, adds the scaled DC value,
; converts (with round) to integer, and clips to 0...255. Routine is
; called with the __fastcall option, with the first two parameters in
; ecx and edx and the third on the stack.
;
; The routine clears the IDCT output after reading it.
;
; MMx version.
;
; Parameters:
; ecx = uSrc1 pointer to IDCT output. Values are signed, 16 bit values
; with 6 fractional bits. They are not clipped to -256 ...
; +255. They are packed into a qword aligned 8x8 array
; of words.
;
; edx = uDst pointer to output values. Values will be unsigned, 8-bit
; values. They will be written into a qword aligned 8x8 array
; of bytes with a PITCH of 384 in between rows.
; esp + 4 = Scaled DC value with 7 fraction bits
;----------------------------------------------------------------------------
ALIGN 4
@MMX_ClipAndMove@12 PROC
;
; dst[row] = clip_0_255( (resid[row] + (DC >> 1) + 32) >> 6 ), rows 0..7,
; and the residual buffer is zeroed as it is consumed.
;
; In:    ecx       = IDCT output, 8 rows of 8 signed words (16 bytes per row)
;        edx       = destination, 8 rows of 8 bytes, PITCH bytes per row
;        [esp + 4] = scaled DC with 7 fraction bits; only bits 0..15 are used
; Uses:  eax (loop counter), ecx and edx (both advanced), mm0-mm3, mm6, mm7
; Exit:  "ret 4" releases the stack argument
;
; Parameters
pSrc1 EQU ecx
pDst EQU edx
ScaledDC EQU DWORD PTR [esp + 4]
;
; preamble
;
; The DC is replicated into all four words with unpack instructions so that
; only its low word is broadcast. (A shift-and-OR replication would OR any
; bits above bit 15 of the 32-bit argument into the neighbouring words.)
;
        movd      mm7, ScaledDC         ; low dword = scaled DC value
        pxor      mm6, mm6              ; zero
        movq      mm2, [ecx]            ; 3: fetch first 4 words
        punpcklwd mm7, mm7              ; lower 2 WORDS have ScaledDC
        movq      mm3, [ecx+8]          ; 3: fetch last 4 words
        punpckldq mm7, mm7              ; all 4 WORDS have ScaledDC
        mov       eax, 3                ; loop control
        psrlw     mm7, 1                ; DC with 6 bits of fraction
        paddw     mm7, [MMX_Round32]    ; rounding+DC for IDCT output
;
        movq      [ecx], mm6            ; 3: zero first 4 words
        paddw     mm2, mm7              ; 3: add in round
        movq      [ecx+8], mm6          ; 3: zero last 4 words
        paddw     mm3, mm7              ; 3: add in round
        psraw     mm2, 6                ; 3: convert to integer
;
;
; main loop:
; This loop is 3-folded and 2-unrolled. 3-folded means that it
; works on 3 different results per iteration. 2-unrolled that
; it produces 2 results per iteration.
;
; The result which each instruction works on is identified by a
; number (1:, 2:, or 3:) at the start of the comment field. These
; identify 3 stages as follows:
;
; Stage Description
; ----- -----------
; 1 Convert the last 4 words of a line to integer, pack together
; into 8 bytes, and write the result.
; 2 Do all processing for the next line: load and clear 8 words,
; add in round, convert to integer, pack to bytes, and write
; the result.
; 3 Load and zero all 8 words of a line, add in round,
; and convert the first 4 of them to integers. (Processing
; of this stage is completed as stage 1 of the next pass.)
;
; Three iterations write rows 0..5 and leave row 6 half processed in
; mm2/mm3; the postamble finishes row 6 and handles row 7.
;
MainLoop:
        movq      mm0, [ecx+16]         ; 2: fetch first 4 words
        psraw     mm3, 6                ; 1: convert to integer
        movq      mm1, [ecx+24]         ; 2: fetch last 4 words
        packuswb  mm2, mm3              ; 1: pack and clip
        movq      [ecx+16], mm6         ; 2: zero first 4 words
        paddw     mm0, mm7              ; 2: add in round
        movq      [ecx+24], mm6         ; 2: zero last 4 words
        paddw     mm1, mm7              ; 2: add in round
        movq      [edx], mm2            ; 1: store result
        psraw     mm0, 6                ; 2: convert to integer
        movq      mm2, [ecx+32]         ; 3: fetch first 4 words
        psraw     mm1, 6                ; 2: convert to integer
        movq      mm3, [ecx+40]         ; 3: fetch last 4 words
        packuswb  mm0, mm1              ; 2: pack and clip
        movq      [ecx+32], mm6         ; 3: zero first 4 words
        paddw     mm2, mm7              ; 3: add in round
        movq      [ecx+40], mm6         ; 3: zero last 4 words
        paddw     mm3, mm7              ; 3: add in round
        movq      [edx+PITCH], mm0      ; 2: store result
        psraw     mm2, 6                ; 3: convert to integer
        add       ecx, 32               ; increment source pointer
        add       edx, 2*PITCH          ; increment destination pointer
        dec       eax                   ; decrement loop control
        jne       MainLoop              ; repeat three times
;
; postamble
;
        movq      mm0, [ecx+16]         ; 2: fetch first 4 words
        psraw     mm3, 6                ; 1: convert to integer
        movq      mm1, [ecx+24]         ; 2: fetch last 4 words
        packuswb  mm2, mm3              ; 1: pack and clip
        paddw     mm0, mm7              ; 2: add in round
        paddw     mm1, mm7              ; 2: add in round
        movq      [edx], mm2            ; 1: store result
        psraw     mm0, 6                ; 2: convert to integer
        movq      [ecx+16], mm6         ; 2: zero first 4 words
        psraw     mm1, 6                ; 2: convert to integer
        movq      [ecx+24], mm6         ; 2: zero last 4 words
        packuswb  mm0, mm1              ; 2: pack and clip
        movq      [edx+PITCH], mm0      ; 2: store result
        ret       4
@MMX_ClipAndMove@12 ENDP
;----------------------------------------------------------------------------
;
; MMX_BlockCopy
; Copies an 8x8 block of bytes, four rows at a time as suggested in the
; MMX guide. Source and destination rows are both PITCH bytes apart.
; Parameters:
; ecx = Pointer to output values
;
; edx = Pointer to input values
;----------------------------------------------------------------------------
ALIGN 4
@MMX_BlockCopy@8 PROC
;
; Copies 8 rows of 8 bytes. Both the source and the destination rows are
; PITCH bytes apart.
;
; In:    ecx = destination pointer
;        edx = source pointer
; Uses:  mm0-mm7 (row N travels through mmN)
;
; The rows are moved in two groups of four: all four loads of a group are
; issued before any of its stores.
;
pDst EQU ecx
pSrc EQU edx
;
; ---- rows 0..3 ----
;
        FOR row, <0,1,2,3>
        movq    mm&row, [pSrc + row*PITCH]      ;; read one source row
        ENDM
        FOR row, <0,1,2,3>
        movq    [pDst + row*PITCH], mm&row      ;; write it to the destination
        ENDM
;
; ---- rows 4..7 ----
;
        FOR row, <4,5,6,7>
        movq    mm&row, [pSrc + row*PITCH]      ;; read one source row
        ENDM
        FOR row, <4,5,6,7>
        movq    [pDst + row*PITCH], mm&row      ;; write it to the destination
        ENDM
;
        ret
@MMX_BlockCopy@8 ENDP
IFDEF H261
;----------------------------------------------------------------------------
;
; MMX_BlockCopySpecial
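; Copies an 8x8 block of bytes, four rows at a time as suggested in the
; MMX guide. Source rows are 8 bytes apart (a packed 8x8 block);
; destination rows are PITCH bytes apart.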
; Parameters:
; ecx = Pointer to output values
;
; edx = Pointer to input values
;----------------------------------------------------------------------------
ALIGN 4
@MMX_BlockCopySpecial@8 PROC
;
; Copies 8 rows of 8 bytes from a packed source (rows PITCH8 = 8 bytes
; apart) to a destination whose rows are PITCH bytes apart.
;
; In:    ecx = destination pointer
;        edx = source pointer
; Uses:  mm0-mm7 (row N travels through mmN)
;
; The rows are moved in two groups of four: all four loads of a group are
; issued before any of its stores.
;
pDst EQU ecx
pSrc EQU edx
PITCH8 EQU 8
;
; ---- rows 0..3 ----
;
        FOR row, <0,1,2,3>
        movq    mm&row, [pSrc + row*PITCH8]     ;; read one packed source row
        ENDM
        FOR row, <0,1,2,3>
        movq    [pDst + row*PITCH], mm&row      ;; write it to the destination
        ENDM
;
; ---- rows 4..7 ----
;
        FOR row, <4,5,6,7>
        movq    mm&row, [pSrc + row*PITCH8]     ;; read one packed source row
        ENDM
        FOR row, <4,5,6,7>
        movq    [pDst + row*PITCH], mm&row      ;; write it to the destination
        ENDM
;
        ret
@MMX_BlockCopySpecial@8 ENDP
ENDIF
MMXCODE1 ENDS
END