;-------------------------------------------------------------------------- ; INTEL Corporation Proprietary Information ; ; This listing is supplied under the terms of a license ; agreement with INTEL Corporation and may not be copied ; nor disclosed except in accordance with the terms of ; that agreement. ; ; Copyright (c) 1996 Intel Corporation. ; All Rights Reserved. ; ;-------------------------------------------------------------------------- ;-------------------------------------------------------------------------- ; ; $Author: SCDAY $ ; $Date: 31 Oct 1996 09:00:56 $ ; $Archive: S:\h26x\src\dec\d3mbkadd.asv $ ; $Header: S:\h26x\src\dec\d3mbkadd.asv 1.8 31 Oct 1996 09:00:56 SCDAY $ ; $Log: S:\h26x\src\dec\d3mbkadd.asv $ ;// ;// Rev 1.8 31 Oct 1996 09:00:56 SCDAY ;// Raj added IFDEF H261 MMX_BlockAddSpecial and MMX_BlockCopySpecial ;// ;// Rev 1.7 09 Jul 1996 16:50:42 AGUPTA2 ;// DC value for INTRA blocks is added back in ClipAndMove routine. ;// Cleaned-up code. ;// ;// Rev 1.6 04 Apr 1996 13:42:58 AGUPTA2 ;// Removed a store stall from MMX_BlockAdd ;// ;// Rev 1.5 03 Apr 1996 17:42:30 AGUPTA2 ;// Added MMX version of BlockCopy routine. ;// ;// Rev 1.4 03 Apr 1996 11:08:22 RMCKENZX ;// Added clearing of IDCT output. Cleaned comments. ;// ;// Rev 1.3 22 Mar 1996 15:43:30 AGUPTA2 ;// Fixed fastcall bug: return from rtns with more than 2 params. ;// ;// Rev 1.2 14 Mar 1996 17:15:14 AGUPTA2 ;// ;// Included Bob's MMX_ClipAndMove rtn. This rtn works on INTRA output. ;// ;// Rev 1.1 27 Feb 1996 16:48:52 RMCKENZX ;// Added rounding of IDCT output. 
;
;--------------------------------------------------------------------------
;==========================================================================
;
;  d3mbkadd.asm
;
;  Routines:
;    MMX_BlockAdd
;    MMX_BlockAddSpecial   (H261 builds only)
;    MMX_ClipAndMove
;    MMX_BlockCopy
;    MMX_BlockCopySpecial  (H261 builds only)
;
;  Prototypes in d3mblk.h:
;    extern "C" {
;      void __fastcall MMX_BlockAdd(
;        U32 uResidual,    // pointer to IDCT output
;        U32 uRefBlock,    // pointer to predicted values
;        U32 uDstBlock);   // pointer to destination
;
;      void __fastcall MMX_ClipAndMove(
;        U32 uResidual,    // pointer to IDCT output
;        U32 uDstBlock,    // pointer to destination
;        U32 ScaledDC);    // scaled DC
;    }
;
;==========================================================================
;--------------------------------------------------------------------------
;
;  MMX_BlockAdd
;
;  Description:
;    This routine performs block addition of the IDCT output with the
;    predicted value to find the final value.  The IDCT values are rounded
;    and converted to integers, then added to the prediction.  The result of
;    the addition is then clipped to 0...255.  The routine is called with
;    the __fastcall option, with the first two parameters in ecx and edx and
;    the third on the stack.
;
;    The routine clears the IDCT output after reading it.
;
;  Parameters:
;    ecx = uSrc1   pointer to IDCT output.  Values are signed, 16 bit values
;                  with 6 fractional bits.  They are not clipped to
;                  -256 ... +255.  They are packed into a qword aligned 8x8
;                  array of words.
;
;    edx = uSrc2   pointer to prediction values.  Values are unsigned, 8-bit
;                  values.  They are packed into a (possibly unaligned) 8x8
;                  array of bytes.  Rows are read PITCH (384) bytes apart.
;
;    esp+4 = uDst  pointer to output values.  Values will be unsigned, 8-bit
;                  values.  They will be written into a qword aligned 8x8
;                  array of bytes with a PITCH of 384 in between rows.
;
;--------------------------------------------------------------------------

.586
.MODEL FLAT
OPTION CASEMAP:NONE
OPTION PROLOGUE:None
OPTION EPILOGUE:None

.xlist
include iammx.inc
.list

MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

MMXDATA1 SEGMENT
ALIGN 8
; 0020h in each of the four words: one half, expressed with 6 fraction bits.
; Added to the IDCT output before the arithmetic shift right by 6.
MMX_Round32 DWORD 000200020H, 000200020H
MMXDATA1 ENDS

; Register roles used by the block-add routines, and the row stride of the
; frame buffers.  PITCH is also used by the routines further down the file.
pSrc1           EQU      ecx                    ; IDCT output (residual), 16 bytes per row
pSrc2           EQU      edx                    ; prediction bytes
pDst            EQU      eax                    ; destination, loaded from [esp+4]
PITCH           EQU      384

;--------------------------------------------------------------------------
;
;  BLOCKADD_BODY refPitch
;
;  Complete body (including the final "ret 4") shared by MMX_BlockAdd and
;  MMX_BlockAddSpecial.  The two routines differ only in the distance
;  between successive rows of the prediction block, which is passed as
;  refPitch.
;
;  The code is 2-folded and fully unrolled: there is no loop, and each
;  "pass" finishes one 8-pixel row while starting the next one.  The
;  number at the start of each comment is the row (1 to 8) that the
;  instruction belongs to.  Keep the instruction order when editing.
;
;  Per row:  dst = clip0_255( pred + ((residual + 32) SAR 6) )
;  and the 16 residual bytes of the row are overwritten with zero.
;
;  In:       ecx = residual, edx = prediction, [esp+4] = destination
;  Uses:     eax, mm0, mm1, mm2, mm3, mm6, mm7
;  Note:     no EMMS is issued here.
;
;--------------------------------------------------------------------------
BLOCKADD_BODY MACRO refPitch:REQ
        movq      mm6, [MMX_Round32]            ; rounding constant for the IDCT output

        ; prologue: fetch, clear, round and shift residual row 1
        movq      mm3, [pSrc1 + 8]              ; 1: residual words 4 to 7
        pxor      mm7, mm7                      ; zero: unpack filler and clearing value
        movq      mm1, [pSrc1]                  ; 1: residual words 0 to 3
        movq      [pSrc1 + 8], mm7              ; 1: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 1: add rounding
        movq      [pSrc1], mm7                  ; 1: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 1: add rounding
        mov       pDst, [esp + 4]               ; third argument: destination pointer
        psraw     mm3, 6                        ; 1: drop the 6 fraction bits
        movq      mm2, [pSrc2]                  ; 1: 8 prediction bytes
        psraw     mm1, 6                        ; 1: drop the 6 fraction bits

        ; pass 1: finish row 1, start row 2
        movq      mm0, mm2                      ; 1: second copy of the prediction
        punpckhbw mm2, mm7                      ; 1: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 1: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 1: prediction bytes 0 to 3 as words
        movq      mm3, [pSrc1 + 24]             ; 2: residual words 4 to 7
        paddw     mm0, mm1                      ; 1: add residual 0 to 3
        movq      mm1, [pSrc1 + 16]             ; 2: residual words 0 to 3
        packuswb  mm0, mm2                      ; 1: pack to bytes, saturating to 0...255
        movq      [pSrc1 + 24], mm7             ; 2: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 2: add rounding
        movq      [pSrc1 + 16], mm7             ; 2: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 2: add rounding
        movq      mm2, [pSrc2 + refPitch]       ; 2: 8 prediction bytes
        psraw     mm3, 6                        ; 2: to integer
        movq      [pDst], mm0                   ; 1: store the finished row
        psraw     mm1, 6                        ; 2: to integer

        ; pass 2: finish row 2, start row 3
        movq      mm0, mm2                      ; 2: second copy of the prediction
        punpckhbw mm2, mm7                      ; 2: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 2: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 2: prediction bytes 0 to 3 as words
        movq      mm3, [pSrc1 + 40]             ; 3: residual words 4 to 7
        paddw     mm0, mm1                      ; 2: add residual 0 to 3
        movq      mm1, [pSrc1 + 32]             ; 3: residual words 0 to 3
        packuswb  mm0, mm2                      ; 2: pack and clip
        movq      [pSrc1 + 40], mm7             ; 3: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 3: add rounding
        movq      [pSrc1 + 32], mm7             ; 3: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 3: add rounding
        movq      mm2, [pSrc2 + refPitch*2]     ; 3: 8 prediction bytes
        psraw     mm3, 6                        ; 3: to integer
        movq      [pDst + PITCH], mm0           ; 2: store the finished row
        psraw     mm1, 6                        ; 3: to integer

        ; pass 3: finish row 3, start row 4
        movq      mm0, mm2                      ; 3: second copy of the prediction
        punpckhbw mm2, mm7                      ; 3: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 3: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 3: prediction bytes 0 to 3 as words
        movq      mm3, [pSrc1 + 56]             ; 4: residual words 4 to 7
        paddw     mm0, mm1                      ; 3: add residual 0 to 3
        movq      mm1, [pSrc1 + 48]             ; 4: residual words 0 to 3
        packuswb  mm0, mm2                      ; 3: pack and clip
        movq      [pSrc1 + 56], mm7             ; 4: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 4: add rounding
        movq      [pSrc1 + 48], mm7             ; 4: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 4: add rounding
        movq      mm2, [pSrc2 + refPitch*3]     ; 4: 8 prediction bytes
        psraw     mm3, 6                        ; 4: to integer
        movq      [pDst + PITCH*2], mm0         ; 3: store the finished row
        psraw     mm1, 6                        ; 4: to integer

        ; pass 4: finish row 4, start row 5
        movq      mm0, mm2                      ; 4: second copy of the prediction
        punpckhbw mm2, mm7                      ; 4: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 4: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 4: prediction bytes 0 to 3 as words
        movq      mm3, [pSrc1 + 72]             ; 5: residual words 4 to 7
        paddw     mm0, mm1                      ; 4: add residual 0 to 3
        movq      mm1, [pSrc1 + 64]             ; 5: residual words 0 to 3
        packuswb  mm0, mm2                      ; 4: pack and clip
        movq      [pSrc1 + 72], mm7             ; 5: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 5: add rounding
        movq      [pSrc1 + 64], mm7             ; 5: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 5: add rounding
        movq      mm2, [pSrc2 + refPitch*4]     ; 5: 8 prediction bytes
        psraw     mm3, 6                        ; 5: to integer
        movq      [pDst + PITCH*3], mm0         ; 4: store the finished row
        psraw     mm1, 6                        ; 5: to integer

        ; pass 5: finish row 5, start row 6
        movq      mm0, mm2                      ; 5: second copy of the prediction
        punpckhbw mm2, mm7                      ; 5: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 5: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 5: prediction bytes 0 to 3 as words
        movq      mm3, [pSrc1 + 88]             ; 6: residual words 4 to 7
        paddw     mm0, mm1                      ; 5: add residual 0 to 3
        movq      mm1, [pSrc1 + 80]             ; 6: residual words 0 to 3
        packuswb  mm0, mm2                      ; 5: pack and clip
        movq      [pSrc1 + 88], mm7             ; 6: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 6: add rounding
        movq      [pSrc1 + 80], mm7             ; 6: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 6: add rounding
        movq      mm2, [pSrc2 + refPitch*5]     ; 6: 8 prediction bytes
        psraw     mm3, 6                        ; 6: to integer
        movq      [pDst + PITCH*4], mm0         ; 5: store the finished row
        psraw     mm1, 6                        ; 6: to integer

        ; pass 6: finish row 6, start row 7
        movq      mm0, mm2                      ; 6: second copy of the prediction
        punpckhbw mm2, mm7                      ; 6: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 6: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 6: prediction bytes 0 to 3 as words
        movq      mm3, [pSrc1 + 104]            ; 7: residual words 4 to 7
        paddw     mm0, mm1                      ; 6: add residual 0 to 3
        movq      mm1, [pSrc1 + 96]             ; 7: residual words 0 to 3
        packuswb  mm0, mm2                      ; 6: pack and clip
        movq      [pSrc1 + 104], mm7            ; 7: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 7: add rounding
        movq      [pSrc1 + 96], mm7             ; 7: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 7: add rounding
        movq      mm2, [pSrc2 + refPitch*6]     ; 7: 8 prediction bytes
        psraw     mm3, 6                        ; 7: to integer
        movq      [pDst + PITCH*5], mm0         ; 6: store the finished row
        psraw     mm1, 6                        ; 7: to integer

        ; pass 7: finish row 7, start row 8
        movq      mm0, mm2                      ; 7: second copy of the prediction
        punpckhbw mm2, mm7                      ; 7: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 7: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 7: prediction bytes 0 to 3 as words
        movq      mm3, [pSrc1 + 120]            ; 8: residual words 4 to 7
        paddw     mm0, mm1                      ; 7: add residual 0 to 3
        movq      mm1, [pSrc1 + 112]            ; 8: residual words 0 to 3
        packuswb  mm0, mm2                      ; 7: pack and clip
        movq      [pSrc1 + 120], mm7            ; 8: clear residual words 4 to 7
        paddw     mm3, mm6                      ; 8: add rounding
        movq      [pSrc1 + 112], mm7            ; 8: clear residual words 0 to 3
        paddw     mm1, mm6                      ; 8: add rounding
        movq      mm2, [pSrc2 + refPitch*7]     ; 8: 8 prediction bytes
        psraw     mm3, 6                        ; 8: to integer
        movq      [pDst + PITCH*6], mm0         ; 7: store the finished row
        psraw     mm1, 6                        ; 8: to integer

        ; pass 8: wrap up, nothing left to prefetch
        movq      mm0, mm2                      ; 8: second copy of the prediction
        punpckhbw mm2, mm7                      ; 8: prediction bytes 4 to 7 as words
        paddw     mm2, mm3                      ; 8: add residual 4 to 7
        punpcklbw mm0, mm7                      ; 8: prediction bytes 0 to 3 as words
        paddw     mm0, mm1                      ; 8: add residual 0 to 3
        packuswb  mm0, mm2                      ; 8: pack and clip
        movq      [pDst + PITCH*7], mm0         ; 8: store the finished row

        ret       4                             ; __fastcall: callee removes the stack argument
ENDM

MMXCODE1 SEGMENT

;--------------------------------------------------------------------------
;  MMX_BlockAdd (uResidual in ecx, uRefBlock in edx, uDstBlock at [esp+4])
;  Prediction rows and destination rows are both PITCH bytes apart.
;--------------------------------------------------------------------------
ALIGN 4
@MMX_BlockAdd@12 PROC
        BLOCKADD_BODY PITCH
@MMX_BlockAdd@12 ENDP

IFDEF H261
;--------------------------------------------------------------------------
;  MMX_BlockAddSpecial (same register and stack arguments as MMX_BlockAdd)
;  Identical to MMX_BlockAdd except that the prediction block is read as
;  eight consecutive 8-byte rows; destination rows stay PITCH bytes apart.
;--------------------------------------------------------------------------
ALIGN 4
@MMX_BlockAddSpecial@12 PROC
        BLOCKADD_BODY 8
@MMX_BlockAddSpecial@12 ENDP
ENDIF

;----------------------------------------------------------------------------
;
;  MMX_ClipAndMove
;
;  Description:
;    This routine takes the MMx IDCT output, converts (with round)
;    to integer, and clips to 0...255.  Routine is called with the
;    __fastcall option, with the first two parameters in ecx and edx
;    and the third on the stack.
;
;    The routine clears the IDCT output after reading it.
;
;    MMx version.
;
;  Parameters:
;    ecx = uSrc1  pointer to IDCT output.  Values are signed, 16 bit values
;                 with 6 fractional bits.  They are not clipped to -256 ...
;                 +255.
;                 They are packed into a qword aligned 8x8 array
;                 of words.
;
;    edx = uDst   pointer to output values.  Values will be unsigned, 8-bit
;                 values.  They will be written into a qword aligned 8x8 array
;                 of bytes with a PITCH of 384 in between rows.
;
;    esp + 4 = ScaledDC
;                 scaled DC value with 7 fraction bits.  It is shifted right
;                 by one and added to every coefficient before the
;                 conversion to integer.
;                 NOTE(review): the replication below assumes the upper 16
;                 bits of this dword are zero - confirm against the caller.
;
;  Uses: eax (loop counter), mm0, mm1, mm2, mm3, mm6, mm7.  ecx and edx are
;  advanced.  No EMMS is issued here.
;----------------------------------------------------------------------------
ALIGN 4
@MMX_ClipAndMove@12 PROC
; Parameters
pSrc1           EQU      ecx
pDst            EQU      edx
ScaledDC        EQU      DWORD PTR [esp + 4]

;
; preamble: broadcast the DC term, merge it with the rounding constant, and
; run stage 3 (see below) for row 0
;
        movd      mm0, ScaledDC                 ; scaled DC in the low dword
        pxor      mm6, mm6                      ; zero, used to clear the IDCT output
        movq      mm1, mm0
        psllq     mm0, 16
        movq      mm2, [pSrc1]                  ; 3: fetch first 4 words
        por       mm0, mm1                      ; DC in the two low words
        movq      mm7, mm0
        psllq     mm0, 32
        por       mm7, mm0                      ; DC in all four words
        mov       eax, 3                        ; loop trips; each trip stores two rows
        movq      mm3, [pSrc1 + 8]              ; 3: fetch last 4 words
        psrlw     mm7, 1                        ; 7 fraction bits down to 6
        paddw     mm7, [MMX_Round32]            ; mm7 = DC + rounding, added to every word
        movq      [pSrc1], mm6                  ; 3: zero first 4 words
        paddw     mm2, mm7                      ; 3: add DC and rounding
        movq      [pSrc1 + 8], mm6              ; 3: zero last 4 words
        paddw     mm3, mm7                      ; 3: add DC and rounding
        psraw     mm2, 6                        ; 3: first 4 words to integer

;
; main loop
;   The loop is 3-folded and 2-unrolled: every trip touches three different
;   rows and completes two of them.  The number at the start of each
;   comment names the stage an instruction belongs to:
;
;   Stage   Work
;   -----   ----
;     1     Finish the row begun by stage 3 of the previous trip (or by
;           the preamble): convert its last 4 words to integer, pack and
;           clip all 8 values to bytes, store them at [edx].
;     2     Process the following row from start to finish: load and clear
;           8 words, add DC and rounding, convert, pack and clip, store at
;           [edx+PITCH].
;     3     Begin the row after that: load and clear 8 words, add DC and
;           rounding, and convert only the first 4 words.  The rest is
;           done as stage 1 of the next trip (or of the postamble).
;
;   Each trip advances ecx by 32 bytes (two rows of words) and edx by two
;   destination rows.
;
RowPairLoop:
        movq      mm0, [pSrc1 + 16]             ; 2: fetch first 4 words
        psraw     mm3, 6                        ; 1: last 4 words to integer
        movq      mm1, [pSrc1 + 24]             ; 2: fetch last 4 words
        packuswb  mm2, mm3                      ; 1: pack and clip to 0...255
        movq      [pSrc1 + 16], mm6             ; 2: zero first 4 words
        paddw     mm0, mm7                      ; 2: add DC and rounding
        movq      [pSrc1 + 24], mm6             ; 2: zero last 4 words
        paddw     mm1, mm7                      ; 2: add DC and rounding
        movq      [pDst], mm2                   ; 1: store row
        psraw     mm0, 6                        ; 2: to integer
        movq      mm2, [pSrc1 + 32]             ; 3: fetch first 4 words
        psraw     mm1, 6                        ; 2: to integer
        movq      mm3, [pSrc1 + 40]             ; 3: fetch last 4 words
        packuswb  mm0, mm1                      ; 2: pack and clip to 0...255
        movq      [pSrc1 + 32], mm6             ; 3: zero first 4 words
        paddw     mm2, mm7                      ; 3: add DC and rounding
        movq      [pSrc1 + 40], mm6             ; 3: zero last 4 words
        paddw     mm3, mm7                      ; 3: add DC and rounding
        movq      [pDst + PITCH], mm0           ; 2: store row
        psraw     mm2, 6                        ; 3: first 4 words to integer

        add       pSrc1, 32                     ; two source rows consumed
        add       pDst, PITCH*2                 ; two destination rows written
        dec       eax
        jnz       RowPairLoop                   ; three trips in total

;
; postamble: stage 1 for row 6 and stage 2 for row 7; no stage 3 remains
;
        movq      mm0, [pSrc1 + 16]             ; 2: fetch first 4 words
        psraw     mm3, 6                        ; 1: last 4 words to integer
        movq      mm1, [pSrc1 + 24]             ; 2: fetch last 4 words
        packuswb  mm2, mm3                      ; 1: pack and clip to 0...255
        paddw     mm0, mm7                      ; 2: add DC and rounding
        paddw     mm1, mm7                      ; 2: add DC and rounding
        movq      [pDst], mm2                   ; 1: store row
        psraw     mm0, 6                        ; 2: to integer
        movq      [pSrc1 + 16], mm6             ; 2: zero first 4 words
        psraw     mm1, 6                        ; 2: to integer
        movq      [pSrc1 + 24], mm6             ; 2: zero last 4 words
        packuswb  mm0, mm1                      ; 2: pack and clip to 0...255
        movq      [pDst + PITCH], mm0           ; 2: store row

        ret       4                             ; __fastcall: callee removes ScaledDC
@MMX_ClipAndMove@12 ENDP

;----------------------------------------------------------------------------
;
;  MMX_BlockCopy
;    Copy in chunks of 4 as suggested in MMX guide.
;
;    Copies eight 8-byte rows.  Source and destination rows are both PITCH
;    bytes apart.  Four rows are loaded, then stored, then the next four.
;
;  Parameters:
;    ecx = Pointer to output values
;
;    edx = Pointer to input values
;
;  Uses: mm0 to mm7.  Both arguments are in registers, so plain "ret".
;  No EMMS is issued here.
;----------------------------------------------------------------------------
ALIGN 4
@MMX_BlockCopy@8 PROC
; Parameters
pDst            EQU      ecx
pSrc            EQU      edx

        ; rows 0 to 3
        movq      mm0, [pSrc]
        movq      mm1, [pSrc + 1*PITCH]
        movq      mm2, [pSrc + 2*PITCH]
        movq      mm3, [pSrc + 3*PITCH]
        movq      [pDst], mm0
        movq      [pDst + 1*PITCH], mm1
        movq      [pDst + 2*PITCH], mm2
        movq      [pDst + 3*PITCH], mm3

        ; rows 4 to 7
        movq      mm4, [pSrc + 4*PITCH]
        movq      mm5, [pSrc + 5*PITCH]
        movq      mm6, [pSrc + 6*PITCH]
        movq      mm7, [pSrc + 7*PITCH]
        movq      [pDst + 4*PITCH], mm4
        movq      [pDst + 5*PITCH], mm5
        movq      [pDst + 6*PITCH], mm6
        movq      [pDst + 7*PITCH], mm7

        ret
@MMX_BlockCopy@8 ENDP

IFDEF H261
;----------------------------------------------------------------------------
;
;  MMX_BlockCopySpecial
;    Copy in chunks of 4 as suggested in MMX guide.
;
;    Same as MMX_BlockCopy, except that the source is read as eight
;    consecutive 8-byte rows.  Destination rows stay PITCH bytes apart.
;
;  Parameters:
;    ecx = Pointer to output values
;
;    edx = Pointer to input values
;
;  Uses: mm0 to mm7.  No EMMS is issued here.
;----------------------------------------------------------------------------
ALIGN 4
@MMX_BlockCopySpecial@8 PROC
; Parameters
pDst            EQU      ecx
pSrc            EQU      edx
PACKED_PITCH    EQU      8                      ; source row stride in bytes

        ; rows 0 to 3
        movq      mm0, [pSrc]
        movq      mm1, [pSrc + 1*PACKED_PITCH]
        movq      mm2, [pSrc + 2*PACKED_PITCH]
        movq      mm3, [pSrc + 3*PACKED_PITCH]
        movq      [pDst], mm0
        movq      [pDst + 1*PITCH], mm1
        movq      [pDst + 2*PITCH], mm2
        movq      [pDst + 3*PITCH], mm3

        ; rows 4 to 7
        movq      mm4, [pSrc + 4*PACKED_PITCH]
        movq      mm5, [pSrc + 5*PACKED_PITCH]
        movq      mm6, [pSrc + 6*PACKED_PITCH]
        movq      mm7, [pSrc + 7*PACKED_PITCH]
        movq      [pDst + 4*PITCH], mm4
        movq      [pDst + 5*PITCH], mm5
        movq      [pDst + 6*PITCH], mm6
        movq      [pDst + 7*PITCH], mm7

        ret
@MMX_BlockCopySpecial@8 ENDP
ENDIF

MMXCODE1 ENDS

END