Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

991 lines
22 KiB

;--------------------------------------------------------------------------;
;
; INTEL Corporation Proprietary Information
;
; This listing is supplied under the terms of a license
; agreement with INTEL Corporation and may not be copied
; nor disclosed except in accordance with the terms of
; that agreement.
;
; Copyright (c) 1996 Intel Corporation.
; All Rights Reserved.
;
;--------------------------------------------------------------------------;
;
; $Header: S:\h26x\src\dec\d3madvpr.asv 1.6 01 Oct 1996 16:45:38 KLILLEVO $
; $Log: S:\h26x\src\dec\d3madvpr.asv $
;//
;// Rev 1.6 01 Oct 1996 16:45:38 KLILLEVO
;// removed unnecessary local variable and added code to verify
;// PITCH is 384 at compile-time
;//
;// Rev 1.5 01 Oct 1996 11:57:52 KLILLEVO
;// pairing done, saved about 5*4 = 20 cycles per block = 11880 cycles
;// per QCIF picture
;//
;// Rev 1.4 27 Sep 1996 17:28:40 KLILLEVO
;// added clipping of extended motion vectors, but pairing is horrible and
;// needs to be improved
;//
;// Rev 1.3 01 Apr 1996 12:35:14 RMCKENZX
;//
;// Added MMXCODE1 and MMXDATA1 segments, moved global data
;// to MMXDATA1 segment.
;//
;// Rev 1.2 07 Mar 1996 18:32:16 RMCKENZX
;//
;// Re-organized and optimized routine. Interpolators now
;// interpolate & weight, driver accumulates and averages. Interpolators
;// return results in mm4-mm7. Eliminated include file.
;//
;// Rev 1.0 27 Feb 1996 15:03:42 RMCKENZX
;// Initial revision.
;
;--------------------------------------------------------------------------;
;
; File:
; d3madvpr.asm
;
; Routines:
; MMX_AdvancePredict Driver
; MMxInterpolateAndAccumulate Assembly-called interpolate accumulate
;
;--------------------------------------------------------------------------;
; Assembler setup: instruction set, memory model, symbol case handling, the
; MMX include file, and forward declarations of the two segments used below.
.586
.MODEL FLAT
; make all symbols case sensitive
OPTION CASEMAP:NONE
; listing is switched off while the include file is read, then back on
.xlist
include iammx.inc
.list
; Both segments are declared empty here and reopened further down:
; MMXDATA1 receives the weight tables and rounding constants,
; MMXCODE1 receives the two routines.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS
;--------------------------------------------------------------------------;
;
; MMX_AdvancePredict
;
; Description:
; This routine performs advanced prediction, including overlapped
; block motion compensation. It uses the assembly routine
; MMxInterpolateAndAccumulate.
;
; This routine is the assembly equivalent of NewAdvancePredict.
;
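; Inputs: (dwords pushed onto stack by caller, listed in order of
; increasing stack address; see the "C input parameters" equates)
; fpBlockAction flat pointer to block action stream.
; iNext flat pointer to offsets for 4 neighboring blocks.
; 0 = left
; 1 = right
; 2 = above
; 3 = below
; pDst flat pointer to the destination block (8 rows of 8 bytes,
; PITCH bytes apart).
; pClipX flat pointer to x vector clipping table.
; pClipY flat pointer to y vector clipping table.
; NOTE(review): this list used to begin with "DC flat pointer to decoder
; catalog", but the parameter equates define no such argument and the
; first stack argument is read as fpBlockAction -- confirm with the caller.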
;
;
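; Register Usage:
; ebp block action pointer (after the prologue has saved the frame pointer)
; edi destination pointer
; esi motion compensated reference pointer passed to the interpolator
; ebx neighbor offset / clipping table pointer, then weight pointer
; eax clipped motion vector: al = x component, ah = y component
; ecx, edx scratch for the address computation
; mm0 zero, mm3 rounding constant, mm4-mm7 interpolator results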
; Notes:
;
;--------------------------------------------------------------------------;
; register storage (pushed by the prologue once esp has been aligned)
; ebp esp+00
; ebx esp+04
; edi esp+08
; esi esp+12
; local variable definitions (offsets from the aligned esp)
; Every name here must be unique: a second definition of the same equate
; would replace the first, and the two "next" slots are different things
; (lpNext is a single pointer slot, lNext is a 16 byte array).
lpBlockAction EQU esp+16 ; local block action stream pointer (slot reserved; MMX_AdvancePredict keeps this pointer in ebp)
lpNext EQU esp+20 ; local block action offsets pointer (slot reserved; MMX_AdvancePredict reads the pointer into edx instead)
lClipX EQU esp+24 ; local copy of pointer to x vector clipping table
lClipY EQU esp+28 ; local copy of pointer to y vector clipping table
lNext EQU esp+32 ; local offsets (4 DWORDS = 16 bytes), esp+32..esp+47
lAccum EQU esp+64 ; accumulator (64 WORDS = 128 bytes), esp+64..esp+191
zero EQU mm0 ; MMX register holding zero, used to unpack bytes to words
lDst EQU edi ; local destination pointer
; C input parameters
; All of these are relative to ebp as set up by "push ebp / mov ebp, esp".
; They must not be esp-relative: esp is realigned and lowered by the
; prologue, so an esp-based offset is only valid for the first few
; instructions of the routine.
fpBlockAction EQU ebp+08 ; block action stream pointer
iNext EQU ebp+12 ; block action offsets pointer
pDst EQU ebp+16 ; destination pointer
pClipX EQU ebp+20 ; x vector clipping table
pClipY EQU ebp+24 ; y vector clipping table
; MMX globals
; the weight tables are each 64 WORDS stored in Quadrant ascending order
WtCt EQU gMMX_WeightCenter
WtLR EQU gMMX_WeightLeftRight
WtAB EQU gMMX_WeightAboveBelow
Round1 EQU gMMX_Round1
Round2 EQU gMMX_Round2
Round4 EQU gMMX_Round4
; PITCH is the byte distance between rows of the reference and destination
; blocks. The address arithmetic in MMX_AdvancePredict hard-codes 384
; (128 * 3), which is verified at assembly time.
PITCH = 384
; FRAMESIZE is subtracted from the aligned esp before the four register
; pushes; the locals above extend to esp+191, so 256 bytes covers them.
FRAMESIZE = 256
; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****
;
; ANY CHANGES TO THE BLOCK ACTION STRUCTURE
; IN d3dec.h MUST BE ECHOED HERE!!!!
;
; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****
; Offsets into Block Action structure T_BlkAction of length 20
; see the definition in d3dec.h
; The length of 20 is also baked into the driver: each neighbor offset is
; multiplied by 5 when it is copied to lNext and by 4 again when it is used
; as an index, giving 20 bytes per structure.
; The two motion vector bytes are biased by 64 and used as indices into the
; clipping tables; bit 0 of the clipped value selects interpolation in the
; helper routine (presumably half-pel units -- confirm against d3dec.h).
i8MVx2 = 1 ; I8 = signed byte
i8MVy2 = 2 ; I8 = signed byte
; address of the reference block, before the motion vector is applied
pRefBlock = 8 ; U32 = unsigned dword
MMXDATA1 SEGMENT
ALIGN 8
; Weighting tables for the overlapped prediction.
; Each table is 64 WORDS = 4 quadrants of 4 rows x 4 WORDS, i.e. 32 bytes
; per quadrant and 8 bytes per row, so the driver addresses a quadrant as
; table + 0/32/64/96 and the interpolator addresses its rows at +0/+8/+16/+24.
; As used by the driver, within the 8x8 block:
; Quadrant I = rows 0-3, right four pixels
; Quadrant II = rows 0-3, left four pixels
; Quadrant III = rows 4-7, left four pixels
; Quadrant IV = rows 4-7, right four pixels
; For every pixel the center weight plus one left/right weight plus one
; above/below weight add up to 8; the driver therefore adds a rounding term
; of 4 and shifts the accumulated sum right by 3.

; weights applied to the prediction made with the block's own motion vector
gMMX_WeightCenter LABEL DWORD
WORD 5, 5, 5, 4, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6, 5, 5 ; Quadrant I
WORD 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6 ; Quadrant II
WORD 5, 5, 6, 6, 5, 5, 6, 6, 5, 5, 5, 5, 4, 5, 5, 5 ; Quadrant III
WORD 6, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4 ; Quadrant IV
; weights for the left neighbor's vector (Quadrants II, III) and the right
; neighbor's vector (Quadrants I, IV)
gMMX_WeightLeftRight LABEL DWORD
WORD 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 ; Quadrant I
WORD 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 ; Quadrant II
WORD 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1 ; Quadrant III
WORD 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2 ; Quadrant IV
; weights for the above neighbor's vector (Quadrants I, II) and the below
; neighbor's vector (Quadrants III, IV)
gMMX_WeightAboveBelow LABEL DWORD
WORD 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ; Quadrant I
WORD 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 ; Quadrant II
WORD 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 ; Quadrant III
WORD 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2 ; Quadrant IV
; rounding constants, each replicated into four packed WORDs:
; Round1 goes with a shift by 1 (two-sample average), Round2 with a shift
; by 2 (four-sample average), Round4 with the final shift by 3
gMMX_Round1 DWORD 00010001h, 00010001h
gMMX_Round2 DWORD 00020002h, 00020002h
gMMX_Round4 DWORD 00040004h, 00040004h
MMXDATA1 ENDS
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
; Reopen the code segment and export the driver entry point.
MMXCODE1 SEGMENT
PUBLIC C MMX_AdvancePredict
; Compile-time guard: the driver computes row offsets as 128 * 3, so PITCH
; must be exactly 384. When it is not, the non-instruction line inside the
; IF is assembled and stops the build with an error.
IF PITCH-384
** error: this code assumes PITCH is 384
ENDIF
;--------------------------------------------------------------------------;
; Start Code
;--------------------------------------------------------------------------;
; MMX_AdvancePredict
;
; Builds one 8x8 block of overlapped prediction. Five predictions are made
; from the same reference block address (pRefBlock of the current block
; action entry): one with the block's own motion vector and one each with
; the vectors of the left, right, above and below neighbors. Each is
; weighted per pixel by MMxInterpolateAndAccumulate and summed in lAccum;
; the sum (which includes a rounding term of 4) is shifted right by 3,
; packed to unsigned bytes with saturation and stored as 8 rows of 8 bytes
; at lDst + row*PITCH.
;
; Accumulator layout: row r (0-7) occupies 16 bytes at lAccum + 16*r, the
; left four pixels as WORDs at +0 and the right four at +8.
;
; Arguments are on the stack (see the "C input parameters" equates).
; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and the MMX
; registers are not preserved.
; NOTE(review): no EMMS is executed here; presumably the caller resets the
; MMX state -- confirm.
MMX_AdvancePredict:
push ebp
mov ebp, esp
; fetch the neighbor offsets pointer before esp is realigned
mov edx, [iNext]
and esp, -32 ; align stack on cache boundary
sub esp, FRAMESIZE
pxor zero, zero ; zero for unpacking
; saved registers end up at esp+12 (esi), esp+08 (edi), esp+04 (ebx) and
; esp+00 (ebp, the frame pointer, which is needed again in the epilogue)
push esi
push edi
push ebx
push ebp
; copy the clipping table pointers to locals while ebp still addresses the
; arguments
mov eax, [pClipX]
mov ebx, [pClipY]
mov [lClipX], eax
mov [lClipY], ebx
mov lDst, [pDst]
; load the four neighbor offsets and scale each by 5; they are scaled by 4
; again when used, giving offset * 20 = offset * sizeof(T_BlkAction)
mov eax, 00[edx]
; from here on ebp is the block action pointer, not the frame pointer
mov ebp, [fpBlockAction]
mov ebx, 04[edx]
lea eax, [eax+4*eax]
mov ecx, 08[edx]
lea ebx, [ebx+4*ebx]
mov edx, 12[edx]
lea ecx, [ecx+4*ecx]
mov 00[lNext], eax
lea edx, [edx+4*edx]
mov 04[lNext], ebx
mov 08[lNext], ecx
mov 12[lNext], edx
;-----------------------------------------------------------------------;
; ;
; Center ;
; ;
;-----------------------------------------------------------------------;
; Motion vector lookup and source address computation. The same sequence
; is repeated for every neighbor below, differing only in which block
; action entry supplies the vector, which weights are selected and which
; quadrant of the block esi starts at.
xor ecx, ecx
mov esi, [lClipY]
mov cl, i8MVy2[ebp]
xor edx, edx
add cl, 64 ; bias the y vector into a clipping table index
mov dl, i8MVx2[ebp]
add dl, 64 ; bias the x vector into a clipping table index
mov ebx, [lClipX]
mov ah, [ecx + esi] ; ah = clipped y vector
mov esi, pRefBlock[ebp]
mov al, [edx + ebx] ; al = clipped x vector
mov dl, ah
shl edx, 24 ; y vector to the top byte ...
mov cl, al
sar edx, 18 ; ... and back down with sign: edx = 64 * mvy
xor cl, 080H ; cl = mvx + 128 (mod 256), so the shift below is unsigned
shr ecx, 1 ; ecx = (mvx >> 1) + 64
and edx, 0FFFFFF80H ; edx = 128 * (mvy >> 1)
lea ebx, [WtCt + 32] ; center weights, Quadrant II
add esi, ecx
lea edx, [edx + edx*2 - 64] ; edx = 384 * (mvy >> 1) - 64; the -64 cancels the x bias
add esi, edx ; esi = pRefBlock + (mvx >> 1) + PITCH * (mvy >> 1)
; The center pass initializes the accumulator: weighted values plus the
; rounding term of 4 are stored, not added.
; Quadrant II
call MMxInterpolateAndAccumulate
movq mm3, [Round4]
paddw mm4, mm3
add esi, 4 ; move right to Quadrant I
paddw mm5, mm3
sub ebx, 32 ; WtCt + 0
movq [lAccum+00], mm4
paddw mm6, mm3
movq [lAccum+16], mm5
paddw mm7, mm3
movq [lAccum+32], mm6
movq [lAccum+48], mm7
; Quadrant I
call MMxInterpolateAndAccumulate
movq mm3, [Round4]
paddw mm4, mm3
add esi, 4*PITCH-4 ; move down four rows and back left to Quadrant III
paddw mm5, mm3
add ebx, 64 ; WtCt + 64
movq [lAccum+08], mm4
paddw mm6, mm3
movq [lAccum+24], mm5
paddw mm7, mm3
movq [lAccum+40], mm6
movq [lAccum+56], mm7
; Quadrant III
call MMxInterpolateAndAccumulate
movq mm3, [Round4]
paddw mm4, mm3
add esi, 4 ; move right to Quadrant IV
paddw mm5, mm3
add ebx, 32 ; WtCt + 96
movq [lAccum+64], mm4
paddw mm6, mm3
movq [lAccum+80], mm5
paddw mm7, mm3
movq [lAccum+96], mm6
movq [lAccum+112], mm7
; Quadrant IV
call MMxInterpolateAndAccumulate
movq mm3, [Round4]
paddw mm4, mm3
mov ebx, 00[lNext] ; scaled offset of the left neighbor, for the next section
paddw mm5, mm3
movq [lAccum+72], mm4
paddw mm6, mm3
movq [lAccum+88], mm5
paddw mm7, mm3
movq [lAccum+104], mm6
movq [lAccum+120], mm7
;-----------------------------------------------------------------------;
; ;
; Left ;
; ;
;-----------------------------------------------------------------------;
; The vector comes from the left neighbor's block action entry
; (ebp + 4*ebx); the reference address is still the current block's.
; Only the left quadrants (II and III) receive a left contribution.
xor ecx, ecx
mov esi, [lClipY]
mov cl, i8MVy2[ebp + 4*ebx]
xor edx, edx
add cl, 64
mov dl, i8MVx2[ebp + 4*ebx]
add dl, 64
mov ebx, [lClipX]
mov ah, [ecx + esi]
mov esi, pRefBlock[ebp]
mov al, [edx + ebx]
mov dl, ah
shl edx, 24
mov cl, al
sar edx, 18
xor cl, 080H
shr ecx, 1
and edx, 0FFFFFF80H
lea ebx, [WtLR + 32] ; left/right weights, Quadrant II
add esi, ecx
lea edx, [edx + edx*2 - 64]
add esi, edx
; Quadrant II
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+00]
paddw mm5, [lAccum+16]
paddw mm6, [lAccum+32]
paddw mm7, [lAccum+48]
movq [lAccum+00], mm4
movq [lAccum+16], mm5
movq [lAccum+32], mm6
movq [lAccum+48], mm7
; Quadrant III
add esi, 4*PITCH ; down four rows
add ebx, 32 ; WtLR + 64
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+64]
paddw mm5, [lAccum+80]
paddw mm6, [lAccum+96]
paddw mm7, [lAccum+112]
movq [lAccum+64], mm4
movq [lAccum+80], mm5
movq [lAccum+96], mm6
mov ebx, 04[lNext] ; scaled offset of the right neighbor
movq [lAccum+112], mm7
;-----------------------------------------------------------------------;
; ;
; Right ;
; ;
;-----------------------------------------------------------------------;
; Right neighbor's vector; only the right quadrants (I and IV) are updated,
; so esi starts four pixels to the right.
xor ecx, ecx
mov esi, [lClipY]
mov cl, i8MVy2[ebp + 4*ebx]
xor edx, edx
add cl, 64
mov dl, i8MVx2[ebp + 4*ebx]
add dl, 64
mov ebx, [lClipX]
mov ah, [ecx + esi]
mov esi, pRefBlock[ebp]
mov al, [edx + ebx]
mov dl, ah
shl edx, 24
mov cl, al
sar edx, 18
xor cl, 080H
shr ecx, 1
and edx, 0FFFFFF80H
lea ebx, [WtLR] ; left/right weights, Quadrant I
add esi, ecx
lea edx, [edx + edx*2 - 64]
add esi, 4
add esi, edx
; Quadrant I
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+08]
paddw mm5, [lAccum+24]
paddw mm6, [lAccum+40]
paddw mm7, [lAccum+56]
movq [lAccum+08], mm4
movq [lAccum+24], mm5
movq [lAccum+40], mm6
movq [lAccum+56], mm7
; Quadrant IV
add esi, 4*PITCH ; down four rows
add ebx, 96 ; WtLR + 96
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+72]
paddw mm5, [lAccum+88]
paddw mm6, [lAccum+104]
paddw mm7, [lAccum+120]
movq [lAccum+72], mm4
movq [lAccum+88], mm5
movq [lAccum+104], mm6
mov ebx, 08[lNext] ; scaled offset of the above neighbor
movq [lAccum+120], mm7
;-----------------------------------------------------------------------;
; ;
; Above ;
; ;
;-----------------------------------------------------------------------;
; Above neighbor's vector; this is the last contribution to the top
; quadrants (I and II), so the sums are shifted right by 3 here and rows
; 0-3 of the destination are written.
xor ecx, ecx
mov esi, [lClipY]
mov cl, i8MVy2[ebp + 4*ebx]
xor edx, edx
add cl, 64
mov dl, i8MVx2[ebp + 4*ebx]
add dl, 64
mov ebx, [lClipX]
mov ah, [ecx + esi]
mov esi, pRefBlock[ebp]
mov al, [edx + ebx]
mov dl, ah
shl edx, 24
mov cl, al
sar edx, 18
xor cl, 080H
shr ecx, 1
and edx, 0FFFFFF80H
lea ebx, [WtAB] ; above/below weights, Quadrant I
add esi, ecx
lea edx, [edx + edx*2 - 64]
add esi, 4
add esi, edx
; Quadrant I
; finished right halves of rows 0-3 are parked in the accumulator as WORDs
; until the left halves are ready
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+08]
paddw mm5, [lAccum+24]
psraw mm4, 3
paddw mm6, [lAccum+40]
psraw mm5, 3
paddw mm7, [lAccum+56]
psraw mm6, 3
movq [lAccum+08], mm4
psraw mm7, 3
movq [lAccum+24], mm5
movq [lAccum+40], mm6
movq [lAccum+56], mm7
; Quadrant II
sub esi, 4 ; back to the left half
add ebx, 32 ; WtAB + 32
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+00]
paddw mm5, [lAccum+16]
paddw mm6, [lAccum+32]
psraw mm4, 3
paddw mm7, [lAccum+48]
psraw mm5, 3
; pack: low four bytes from the left half in the register, high four bytes
; from the parked right half, saturated to unsigned bytes
packuswb mm4, [lAccum+08]
packuswb mm5, [lAccum+24]
movq [lDst+00], mm4
psraw mm6, 3
movq [lDst+PITCH], mm5
psraw mm7, 3
packuswb mm6, [lAccum+40]
packuswb mm7, [lAccum+56]
movq [lDst+2*PITCH], mm6
mov ebx, 12[lNext] ; scaled offset of the below neighbor
movq [lDst+3*PITCH], mm7
;-----------------------------------------------------------------------;
; ;
; Below ;
; ;
;-----------------------------------------------------------------------;
; Below neighbor's vector; completes the bottom quadrants (IV and III) and
; writes rows 4-7 of the destination in the same way as the Above section.
xor ecx, ecx
mov esi, [lClipY]
mov cl, i8MVy2[ebp + 4*ebx]
xor edx, edx
add cl, 64
mov dl, i8MVx2[ebp + 4*ebx]
add dl, 64
mov ebx, [lClipX]
mov ah, [ecx + esi]
mov esi, pRefBlock[ebp]
mov al, [edx + ebx]
mov dl, ah
shl edx, 24
mov cl, al
sar edx, 18
xor cl, 080H
shr ecx, 1
and edx, 0FFFFFF80H
lea ebx, [WtAB + 96] ; above/below weights, Quadrant IV
add esi, ecx
lea edx, [edx + edx*2 - 64]
add esi, 4*PITCH+4 ; start at the lower right quadrant
add esi, edx
; Quadrant IV
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+72]
paddw mm5, [lAccum+88]
psraw mm4, 3
paddw mm6, [lAccum+104]
psraw mm5, 3
paddw mm7, [lAccum+120]
psraw mm6, 3
movq [lAccum+72], mm4
psraw mm7, 3
movq [lAccum+88], mm5
movq [lAccum+104], mm6
movq [lAccum+120], mm7
; Quadrant III
sub esi, 4 ; back to the left half
sub ebx, 32 ; WtAB + 64
call MMxInterpolateAndAccumulate
paddw mm4, [lAccum+64]
paddw mm5, [lAccum+80]
paddw mm6, [lAccum+96]
psraw mm4, 3
paddw mm7, [lAccum+112]
psraw mm5, 3
packuswb mm4, [lAccum+72]
packuswb mm5, [lAccum+88]
movq [lDst+4*PITCH], mm4
psraw mm6, 3
movq [lDst+5*PITCH], mm5
psraw mm7, 3
packuswb mm6, [lAccum+104]
packuswb mm7, [lAccum+120]
movq [lDst+6*PITCH], mm6
movq [lDst+7*PITCH], mm7
; Epilogue: esp still points at the four saved registers. The first pop
; restores the frame pointer into ebp, which is then used to discard the
; aligned local frame.
pop ebp
pop ebx
pop edi
pop esi
mov esp, ebp
pop ebp
ret
;--------------------------------------------------------------------------;
;
; Routine:
; MMxInterpolateAndAccumulate
;
; Inputs:
; esi flat pointer to Reference Block Source.
; it is already adjusted by the motion vector.
; al x component of motion vector (only bit 0 is examined).
; ah y component of motion vector (only bit 0 is examined).
; ebx flat pointer to Weighting values (4 rows of 4 WORDS).
; mm0 must be zero (the "zero" equate); it is the unpacking operand.
;
; Outputs
; mm4-mm7 Weighted, interpolated values for rows 0-3.
; Values are in packed word format.
;
; Description:
; This routine performs motion compensation interpolation, weights the
; results, and returns them in mmx registers 4-7.
; It works on a single 4x4 Quadrant per call. It is an assembly
; callable routine with its parameters in registers.
;
; Bit 0 of each vector component selects one of four paths:
; mvx even, mvy even IAAint_int copy
; mvx odd, mvy even IAAhalf_int (a + right + 1) >> 1
; mvx even, mvy odd IAAint_half (a + below + 1) >> 1
; mvx odd, mvy odd IAAhalf_half (a + right + below + diagonal + 2) >> 2
;
; Source bytes read, starting at esi, rows PITCH bytes apart:
; IAAint_int 4 rows of 4 bytes
; IAAhalf_int 4 rows of 8 bytes (5 are used)
; IAAint_half 5 rows of 4 bytes
; IAAhalf_half 5 rows of 8 bytes (5 are used)
;
; Register Usage:
; This routine modifies no integer registers (the flags are modified).
; mm4-mm7 return results; mm1-mm3 are scratch on the interpolating paths.
; mm0 is read but never written.
;
; Notes:
; The caller adds the results into its accumulator; nothing is
; accumulated here despite the name.
;
;--------------------------------------------------------------------------;
; asm input parameters
lpSrc EQU esi ; motion compensated source pointer
lpWt EQU ebx ; pointer to matrix of weights 4x4xWORD
MMxInterpolateAndAccumulate:
test eax, 100h ; test mvy's parity bit
jnz IAAhalf ; jump when it was odd
test eax, 1 ; test mvx's parity bit
jnz IAAhalf_int ; jump when it was odd
; no interpolation: unpack four bytes per row and weight them.
; Every row is fetched with movd: only the low four bytes survive the
; unpack, so a wider load would read source bytes that are never used.
IAAint_int:
movd mm4, [lpSrc] ; 0 - fetch row
movd mm5, [PITCH+lpSrc] ; 1 - fetch row
punpcklbw mm4, zero ; 0 - unpack row
pmullw mm4, 00[lpWt] ; 0 - multiply by weights
movd mm6, [PITCH*2+lpSrc] ; 2 - fetch row
punpcklbw mm5, zero ; 1 - unpack row
pmullw mm5, 08[lpWt] ; 1 - multiply by weights
punpcklbw mm6, zero ; 2 - unpack row
movd mm7, [PITCH*3+lpSrc] ; 3 - fetch row
pmullw mm6, 16[lpWt] ; 2 - multiply by weights
punpcklbw mm7, zero ; 3 - unpack row
pmullw mm7, 24[lpWt] ; 3 - multiply by weights
ret
; horizontal interpolation: each row is averaged with a copy of itself
; shifted right by one byte, so eight bytes per row are fetched with movq
IAAhalf_int:
movq mm4, [lpSrc] ; 0 - fetch row
movq mm1, mm4 ; 0 - copy row
psrlq mm4, 8 ; 0 - shift row
movq mm5, [PITCH+lpSrc] ; 1 - fetch row
punpcklbw mm4, zero ; 0 - unpack shifted row
movq mm6, [PITCH*2+lpSrc] ; 2 - fetch row
punpcklbw mm1, zero ; 0 - unpack row
movq mm2, mm5 ; 1 - copy row
psrlq mm5, 8 ; 1 - shift row
paddw mm4, [Round1] ; 0 - add in Round
punpcklbw mm5, zero ; 1 - unpack shifted row
paddw mm4, mm1 ; 0 - sum copies of row
punpcklbw mm2, zero ; 1 - unpack row
movq mm3, mm6 ; 2 - copy row
psrlq mm6, 8 ; 2 - shift row
paddw mm5, [Round1] ; 1 - add in Round
punpcklbw mm6, zero ; 2 - unpack shifted row
paddw mm5, mm2 ; 1 - sum copies of row
punpcklbw mm3, zero ; 2 - unpack row
movq mm7, [PITCH*3+lpSrc] ; 3 - fetch row
psraw mm4, 1 ; 0 - divide by 2
pmullw mm4, 00[lpWt] ; 0 - multiply by weights
psraw mm5, 1 ; 1 - divide by 2
movq mm1, mm7 ; 3 - copy row (mm1 is free again)
psrlq mm7, 8 ; 3 - shift row
paddw mm6, [Round1] ; 2 - add in Round
punpcklbw mm7, zero ; 3 - unpack shifted row
paddw mm6, mm3 ; 2 - sum copies of row
punpcklbw mm1, zero ; 3 - unpack row
paddw mm7, [Round1] ; 3 - add in Round
psraw mm6, 1 ; 2 - divide by 2
pmullw mm5, 08[lpWt] ; 1 - multiply by weights
paddw mm7, mm1 ; 3 - sum copies of row
pmullw mm6, 16[lpWt] ; 2 - multiply by weights
psraw mm7, 1 ; 3 - divide by 2
pmullw mm7, 24[lpWt] ; 3 - multiply by weights
ret
IAAhalf:
test eax, 1 ; test mvx's parity bit
jnz IAAhalf_half ; jump when it was odd
; vertical interpolation: each row is averaged with the row below it, so a
; fifth row is fetched (into mm3)
IAAint_half:
movd mm4, [lpSrc] ; 0 - fetch row
movd mm5, [PITCH+lpSrc] ; 1 - fetch row
punpcklbw mm4, zero ; 0 - unpack row
movd mm6, [PITCH*2+lpSrc] ; 2 - fetch row
punpcklbw mm5, zero ; 1 - unpack row
paddw mm4, [Round1] ; 0 - add in Round
punpcklbw mm6, zero ; 2 - unpack row
paddw mm4, mm5 ; 0 - sum rows
paddw mm5, [Round1] ; 1 - add in Round
movd mm7, [PITCH*3+lpSrc] ; 3 - fetch row
psraw mm4, 1 ; 0 - divide by 2
pmullw mm4, 00[lpWt] ; 0 - multiply by weights
paddw mm5, mm6 ; 1 - sum rows
movd mm3, [PITCH*4+lpSrc] ; 4 - fetch row
punpcklbw mm7, zero ; 3 - unpack row
paddw mm6, [Round1] ; 2 - add in Round
psraw mm5, 1 ; 1 - divide by 2
pmullw mm5, 08[lpWt] ; 1 - multiply by weights
punpcklbw mm3, zero ; 4 - unpack row
paddw mm6, mm7 ; 2 - sum rows
paddw mm7, [Round1] ; 3 - add in Round
paddw mm7, mm3 ; 3 - sum rows
psraw mm6, 1 ; 2 - divide by 2
pmullw mm6, 16[lpWt] ; 2 - multiply by weights
psraw mm7, 1 ; 3 - divide by 2
pmullw mm7, 24[lpWt] ; 3 - multiply by weights
ret
; horizontal and vertical interpolation: a horizontal pair sum is formed
; for each of five rows, then adjacent row sums are added, rounded with 2
; and divided by 4. Each row's pair sum is used twice (once as the lower
; and once as the upper row), which is why the Round is added after the
; first use.
IAAhalf_half:
movq mm4, [lpSrc] ; 0 - fetch row
movq mm5, [PITCH+lpSrc] ; 1 - fetch row
movq mm1, mm4 ; 0 - copy row
movq mm2, mm5 ; 1 - copy row
psrlq mm4, 8 ; 0 - shift row
movq mm6, [PITCH*2+lpSrc] ; 2 - fetch row
punpcklbw mm4, zero ; 0 - unpack shifted row
movq mm3, mm6 ; 2 - copy row
punpcklbw mm1, zero ; 0 - unpack row
paddw mm4, mm1 ; 0 - partial sum both copies of row
psrlq mm5, 8 ; 1 - shift row
paddw mm4, [Round2] ; 0 - add in Round
punpcklbw mm5, zero ; 1 - unpack shifted row
movq mm7, [PITCH*3+lpSrc] ; 3 - fetch row
punpcklbw mm2, zero ; 1 - unpack row
paddw mm5, mm2 ; 1 - partial sum both copies of row
psrlq mm6, 8 ; 2 - shift row
paddw mm4, mm5 ; 0 - add partial sums
punpcklbw mm6, zero ; 2 - unpack shifted row
paddw mm5, [Round2] ; 1 - add in Round
punpcklbw mm3, zero ; 2 - unpack row
paddw mm6, mm3 ; 2 - partial sum both copies of row
movq mm1, mm7 ; 3 - copy row
movq mm2, [PITCH*4+lpSrc] ; 4 - fetch row
psraw mm4, 2 ; 0 - divide by 4
paddw mm5, mm6 ; 1 - add partial sums
psrlq mm7, 8 ; 3 - shift row
paddw mm6, [Round2] ; 2 - add in Round
punpcklbw mm7, zero ; 3 - unpack shifted row
movq mm3, mm2 ; 4 - copy row
punpcklbw mm1, zero ; 3 - unpack row
paddw mm7, mm1 ; 3 - partial sum both copies of row
psrlq mm2, 8 ; 4 - shift row
pmullw mm4, 00[lpWt] ; 0 - multiply by weights
punpcklbw mm2, zero ; 4 - unpack shifted row
paddw mm6, mm7 ; 2 - add partial sums
punpcklbw mm3, zero ; 4 - unpack row
paddw mm7, [Round2] ; 3 - add in Round
psraw mm5, 2 ; 1 - divide by 4
pmullw mm5, 08[lpWt] ; 1 - multiply by weights
paddw mm2, mm3 ; 4 - partial sum both copies of row
paddw mm7, mm2 ; 3 - add partial sums
psraw mm6, 2 ; 2 - divide by 4
pmullw mm6, 16[lpWt] ; 2 - multiply by weights
psraw mm7, 2 ; 3 - divide by 4
pmullw mm7, 24[lpWt] ; 3 - multiply by weights
ret
; close the code segment opened for the two routines above
MMXCODE1 ENDS
; column ruler (columns 1-80) kept from the original listing
; 11111111112222222222333333333344444444445555555555666666666677777777778
;2345678901234567890123456789012345678901234567890123456789012345678901234567890
END