windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/i386/d3madvpr.asm

;--------------------------------------------------------------------------;
;
;    INTEL Corporation Proprietary Information
;
;    This listing is supplied under the terms of a license
;    agreement with INTEL Corporation and may not be copied
;    nor disclosed except in accordance with the terms of
;    that agreement.
;
;    Copyright (c) 1996 Intel Corporation.
;    All Rights Reserved.
;
;--------------------------------------------------------------------------;
;
;	$Header:   S:\h26x\src\dec\d3madvpr.asv   1.6   01 Oct 1996 16:45:38   KLILLEVO  $
;	$Log:   S:\h26x\src\dec\d3madvpr.asv  $
;// 
;//    Rev 1.6   01 Oct 1996 16:45:38   KLILLEVO
;// removed unnecessary local variable and added code to verify
;// PITCH is 384 at compile-time
;// 
;//    Rev 1.5   01 Oct 1996 11:57:52   KLILLEVO
;// pairing done, saved about 5*4 = 20 cycles per block = 11880 cycles
;// per QCIF picture
;// 
;//    Rev 1.4   27 Sep 1996 17:28:40   KLILLEVO
;// added clipping of extended motion vectors, but pairing is horrible and
;// needs to be improved
;// 
;//    Rev 1.3   01 Apr 1996 12:35:14   RMCKENZX
;// 
;// Added MMXCODE1 and MMXDATA1 segments, moved global data
;// to MMXDATA1 segment.
;// 
;//    Rev 1.2   07 Mar 1996 18:32:16   RMCKENZX
;// 
;// Re-organized and optimized routine.  Interpolaters now
;// interpolate & weight, driver accumulates and averages.  Interpolaters
;// return results in mm4-mm7.  Eliminated include file.
;// 
;//    Rev 1.0   27 Feb 1996 15:03:42   RMCKENZX
;// Initial revision.
;
;--------------------------------------------------------------------------;
;
; File:
;	d3madvpr.asm
;
; Routines:
;	MMX_AdvancePredict				Driver
;	MMxInterpolateAndAccumulate		Assembly-called interpolate accumulate
;
;--------------------------------------------------------------------------;

; Assemble for the Pentium (.586) instruction set using the flat 32-bit
; memory model.
.586
.MODEL FLAT

;  make all symbols case sensitive
OPTION CASEMAP:NONE

; Include iammx.inc with listing output switched off around it.
; NOTE(review): presumably this supplies the MMX instruction definitions
; used by the code below -- confirm against the include file.
.xlist
include iammx.inc
.list

; Declare the code and data segments (paragraph aligned, 32-bit, public).
; They are left empty here and reopened later in the file where the
; tables and routines are actually emitted.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

;--------------------------------------------------------------------------;
;
; MMX_AdvancePredict
;
; Description:
;	This routine performs advanced prediction, including overlapped
;	block motion compensation.  It uses the assembly routine 
;	MMxInterpolateAndAccumulate.
;
;	This routine is the assembly equivalent of NewAdvancePredict.
;
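; Inputs:			(dwords pushed onto stack by caller, in this order
;					 from lowest address: ebp+08 .. ebp+24)
;	fpBlockAction	flat pointer to block action stream.
;	iNext			flat pointer to offsets for 4 neighboring blocks.
;						0 = left
;						1 = right
;						2 = above
;						3 = below
;	pDst			flat pointer to the destination block.
;	pClipX			flat pointer to the x vector clipping table.
;	pClipY			flat pointer to the y vector clipping table.
;
;	(An earlier revision of this comment listed a decoder catalog
;	pointer "DC" as the first input; the routine as written takes no
;	such parameter.)
;
; Register Usage:
;	ebp, ebx, edi and esi are saved on entry and restored on exit.
;	eax, ecx and edx are used as scratch and are not preserved.
;	The MMX registers are used and not restored; no EMMS is executed
;	before returning.
;
; Notes:
;	The address arithmetic assumes PITCH is 384; this is checked at
;	assembly time.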
;--------------------------------------------------------------------------;

; register storage (the four pushes made after the local frame is carved
; out, so these sit at the bottom of the frame)
;	ebp						esp+00
;	ebx						esp+04
;	edi						esp+08
;	esi						esp+12

; local variable definitions
;	All of these are relative to esp AFTER the stack has been realigned,
;	the frame reserved and the four registers above pushed.  Each name
;	must be defined exactly once: lNext receives four dwords, so it has
;	to sit clear of lClipX/lClipY.
	lpBlockAction	EQU		esp+16		; local block action stream pointer (not referenced in this file)
	lClipX			EQU		esp+24		; local copy of pointer to x vector clipping table
	lClipY			EQU		esp+28		; local copy of pointer to y vector clipping table
	lNext			EQU		esp+32		; local offsets (4 DWORDS = 16 bytes)
	lAccum			EQU		esp+64		; accumulator (64 WORDS = 128 bytes)

	zero			EQU		mm0			; MMX register held at zero for byte->word unpacking
	lDst			EQU		edi			; local destination pointer

; C input parameters
;	All parameters are addressed through the frame pointer (ebp) so that
;	they stay valid no matter how esp is realigned in the prologue.
	fpBlockAction	EQU		ebp+08		; block action stream pointer
	iNext			EQU		ebp+12		; block action offsets pointer
	pDst			EQU		ebp+16		; destination pointer
	pClipX			EQU		ebp+20		; x vector clipping table
	pClipY			EQU		ebp+24		; y vector clipping table

; MMX globals
;  the weight tables are each 64 WORDS stored in Quadrant ascending order
	WtCt			EQU		gMMX_WeightCenter
	WtLR			EQU		gMMX_WeightLeftRight
	WtAB			EQU		gMMX_WeightAboveBelow
	Round1			EQU		gMMX_Round1
	Round2			EQU		gMMX_Round2
	Round4			EQU		gMMX_Round4

PITCH = 384								; bytes between successive rows of a block
FRAMESIZE = 256							; bytes of stack reserved for the locals above

; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****
;
;  		ANY CHANGES TO THE BLOCK ACTION STRUCTURE
;  		IN d3dec.h MUST BE ECHOED HERE!!!!
;
; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****

; Offsets into Block Action structure T_BlkAction of length 20
;	see the definition in d3dec.h
i8MVx2				=	1		; I8 = signed byte
i8MVy2				=	2		; I8 = signed byte
pRefBlock			=	8		; U32 = unsigned dword

MMXDATA1 SEGMENT
ALIGN 8

; Each weight table holds four 4x4 quadrants of WORD weights, 32 bytes per
; quadrant, in the order Quadrant I, II, III, IV (byte offsets 0, 32, 64,
; 96).  Within a quadrant the four rows are stored top to bottom, four
; WORDs (8 bytes) per row.

; ---- weights applied to the prediction from the block's own vector ----
gMMX_WeightCenter LABEL DWORD
; Quadrant I
WORD 5, 5, 5, 4
WORD 5, 5, 5, 5
WORD 6, 6, 5, 5
WORD 6, 6, 5, 5
; Quadrant II
WORD 4, 5, 5, 5
WORD 5, 5, 5, 5
WORD 5, 5, 6, 6
WORD 5, 5, 6, 6
; Quadrant III
WORD 5, 5, 6, 6
WORD 5, 5, 6, 6
WORD 5, 5, 5, 5
WORD 4, 5, 5, 5
; Quadrant IV
WORD 6, 6, 5, 5
WORD 6, 6, 5, 5
WORD 5, 5, 5, 5
WORD 5, 5, 5, 4

; ---- weights applied to the left / right neighbor predictions ----
gMMX_WeightLeftRight LABEL DWORD
; Quadrant I
WORD 1, 1, 1, 2
WORD 1, 1, 2, 2
WORD 1, 1, 2, 2
WORD 1, 1, 2, 2
; Quadrant II
WORD 2, 1, 1, 1
WORD 2, 2, 1, 1
WORD 2, 2, 1, 1
WORD 2, 2, 1, 1
; Quadrant III
WORD 2, 2, 1, 1
WORD 2, 2, 1, 1
WORD 2, 2, 1, 1
WORD 2, 1, 1, 1
; Quadrant IV
WORD 1, 1, 2, 2
WORD 1, 1, 2, 2
WORD 1, 1, 2, 2
WORD 1, 1, 1, 2

; ---- weights applied to the above / below neighbor predictions ----
gMMX_WeightAboveBelow LABEL DWORD
; Quadrant I
WORD 2, 2, 2, 2
WORD 2, 2, 1, 1
WORD 1, 1, 1, 1
WORD 1, 1, 1, 1
; Quadrant II
WORD 2, 2, 2, 2
WORD 1, 1, 2, 2
WORD 1, 1, 1, 1
WORD 1, 1, 1, 1
; Quadrant III
WORD 1, 1, 1, 1
WORD 1, 1, 1, 1
WORD 1, 1, 2, 2
WORD 2, 2, 2, 2
; Quadrant IV
WORD 1, 1, 1, 1
WORD 1, 1, 1, 1
WORD 2, 2, 1, 1
WORD 2, 2, 2, 2

; Rounding constants: the same value replicated into all four WORD lanes
; of a quadword.
gMMX_Round1 DWORD 2 DUP (00010001h)
gMMX_Round2 DWORD 2 DUP (00020002h)
gMMX_Round4 DWORD 2 DUP (00040004h)
MMXDATA1 ENDS
;--------------------------------------------------------------------------;


;--------------------------------------------------------------------------;
MMXCODE1 SEGMENT

; Export the driver under the C naming convention.
PUBLIC C MMX_AdvancePredict

; Assembly-time guard.  MMX_AdvancePredict builds the row stride as
; 3 * 128 (an AND with 0FFFFFF80h followed by LEA reg,[reg+reg*2]), so the
; code is only correct when PITCH is exactly 384.  Stop the build with an
; explicit diagnostic for any other value.
IF PITCH-384
   .ERR <this code assumes PITCH is 384>
ENDIF

;--------------------------------------------------------------------------;
;	Start Code
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
; MMX_AdvancePredict
;
; Builds one 8x8 block of prediction as a weighted sum of five motion
; compensated predictions: one using the block's own vector (Center) and
; one each using the vectors of the Left, Right, Above and Below neighbor
; blocks.  All five read from the current block's pRefBlock; only the
; motion vector differs.
;
; Stack arguments (see the EQUs near the top of the file):
;	fpBlockAction, iNext, pDst, pClipX, pClipY
; Returns with a plain RET, so the caller removes the arguments.
;
; Working registers inside the body:
;	ebp		block action pointer (the frame pointer is kept on the stack)
;	edi		destination pointer (lDst)
;	esi		motion compensated source pointer handed to the interpolator
;	ebx		weight table pointer handed to the interpolator
;	al/ah	clipped x/y vector; their low bits select the interpolator
;	mm0		zero, mm3 scratch, mm4-mm7 one quadrant (4 rows of 4 WORDs)
;
; The block is processed as four 4x4 quadrants: II = upper left,
; I = upper right, III = lower left, IV = lower right.  Weighted words are
; summed in lAccum, where row r of the 8x8 block occupies bytes
; 16*r .. 16*r+15 (left half first).  Center contributes to all four
; quadrants, Left to II/III, Right to I/IV, Above to I/II and Below to
; III/IV.  The Center pass also adds Round4; the Above and Below passes
; finish their quadrants with a shift right by 3, pack to bytes and store.
;
; NOTE(review): no EMMS is issued before returning -- presumably the
; caller is responsible for it; confirm.
;--------------------------------------------------------------------------;
MMX_AdvancePredict:
	push	ebp
	mov 	ebp, esp

	; fetch the neighbor offset table pointer before esp is realigned
	mov 	edx, [iNext]
	and 	esp, -32					; align stack on cache boundary

	sub 	esp, FRAMESIZE
	pxor	zero, zero					; zero for unpacking

	; save the registers this routine must preserve; the ebp pushed here
	; is the frame pointer, recovered in the epilogue
	push	esi
	push	edi

	push	ebx
	push	ebp

	; copy the clipping table pointers into locals, because ebp is about
	; to be reused as the block action pointer
    mov eax, [pClipX]
    mov ebx, [pClipY]

    mov [lClipX], eax
    mov [lClipY], ebx

	mov 	lDst, [pDst]
	mov 	eax, 00[edx]				; eax = left offset

	mov 	ebp, [fpBlockAction]		; ebp no longer addresses the arguments
	mov 	ebx, 04[edx]				; ebx = right offset

	; store each neighbor offset multiplied by 5; it is later used as
	; [ebp + 4*ebx], i.e. offset * 20 = the T_BlkAction length
	lea 	eax, [eax+4*eax]
	mov 	ecx, 08[edx]				; ecx = above offset

	lea 	ebx, [ebx+4*ebx]
	mov 	edx, 12[edx]				; edx = below offset

	lea 	ecx, [ecx+4*ecx]
	mov 	00[lNext], eax

	lea 	edx, [edx+4*edx]
	mov 	04[lNext], ebx

	mov 	08[lNext], ecx
	mov 	12[lNext], edx


;-----------------------------------------------------------------------;
;																		;
;								Center                                  ;
;																		;
;-----------------------------------------------------------------------;

	; Clip the block's own motion vector through the two tables and turn
	; it into a source address.  The same sequence is repeated, with a
	; different vector, for each of the four neighbors below.
	xor ecx, ecx
	mov esi, [lClipY]

	mov cl, i8MVy2[ebp]         ; cl = y vector byte of this block
	xor edx, edx

	add cl, 64					; table index = vector + 64
	mov dl, i8MVx2[ebp]			; dl = x vector byte of this block

	add dl, 64
	mov ebx, [lClipX]

	mov ah, [ecx + esi]			; ah = y vector after the clipping table
	mov esi, pRefBlock[ebp]  ; esi = reference block pointer

	mov al, [edx + ebx]			; al = x vector after the clipping table
	mov dl, ah

	shl edx, 24					; y into the top byte so SAR sign-extends it
	mov cl, al

	sar edx, 18                  ; edx = y * 64, signed
	xor cl, 080H				; cl = x + 128 taken as an unsigned byte

	shr ecx, 1					; ecx = (x >> 1) + 64
	and edx, 0FFFFFF80H			; clear the half-pel bit: edx = (y >> 1) * 128

	lea ebx, [WtCt + 32]		; center weights, Quadrant II
	add esi, ecx

	lea edx, [edx + edx*2 - 64]	; (y >> 1) * 384, minus the 64 bias carried in ecx

	add esi, edx				; esi = pRefBlock + (y >> 1) * PITCH + (x >> 1)

	; Quadrant II
	call      MMxInterpolateAndAccumulate

	; add the rounding constant and start the accumulator (left half of
	; rows 0-3)
	movq      mm3, [Round4]

	paddw     mm4, mm3
	add       esi, 4					; source: 4 pixels right -> Quadrant I

	paddw     mm5, mm3
	sub       ebx, 32					; weights: Quadrant I

	movq      [lAccum+00], mm4
	paddw     mm6, mm3

	movq      [lAccum+16], mm5
	paddw     mm7, mm3

	movq      [lAccum+32], mm6

	movq      [lAccum+48], mm7


	; Quadrant I
	call	  MMxInterpolateAndAccumulate
 
	; right half of rows 0-3
	movq      mm3, [Round4]

	paddw     mm4, mm3
	add       esi, 4*PITCH-4			; source: 4 rows down, 4 pixels left -> Quadrant III

	paddw     mm5, mm3
	add       ebx, 64					; weights: Quadrant III

	movq      [lAccum+08], mm4
	paddw     mm6, mm3

	movq      [lAccum+24], mm5
	paddw     mm7, mm3

	movq      [lAccum+40], mm6

	movq      [lAccum+56], mm7


	; Quadrant III
	call	  MMxInterpolateAndAccumulate

	; left half of rows 4-7
	movq      mm3, [Round4]

	paddw     mm4, mm3
	add       esi, 4					; source: 4 pixels right -> Quadrant IV

	paddw     mm5, mm3
	add       ebx, 32					; weights: Quadrant IV

	movq      [lAccum+64], mm4
	paddw     mm6, mm3

	movq      [lAccum+80], mm5
	paddw     mm7, mm3

	movq      [lAccum+96], mm6

	movq      [lAccum+112], mm7


	; Quadrant IV
	call	  MMxInterpolateAndAccumulate

	; right half of rows 4-7
	movq      mm3, [Round4]

	paddw     mm4, mm3
	mov       ebx, 00[lNext]			; ebx = 5 * left neighbor offset, for the next section

	paddw     mm5, mm3

	movq      [lAccum+72], mm4
	paddw     mm6, mm3

	movq      [lAccum+88], mm5
	paddw     mm7, mm3

	movq      [lAccum+104], mm6

	movq      [lAccum+120], mm7


;-----------------------------------------------------------------------;
;																		;
;								Left                                    ;
;																		;
;-----------------------------------------------------------------------;

	; Same address computation as Center, but the vector is read from the
	; left neighbor's block action entry (ebp + 20 * offset).  The source
	; is still this block's pRefBlock.
	xor ecx, ecx
	mov esi, [lClipY]

	mov cl, i8MVy2[ebp + 4*ebx]         ; cl = left neighbor's y vector
	xor edx, edx

	add cl, 64
	mov dl, i8MVx2[ebp + 4*ebx]			; dl = left neighbor's x vector

	add dl, 64
	mov ebx, [lClipX]

	mov ah, [ecx + esi]
	mov esi, pRefBlock[ebp]  

	mov al, [edx + ebx]
	mov dl, ah

	shl edx, 24
	mov cl, al

	sar edx, 18                  
	xor cl, 080H

	shr ecx, 1
	and edx, 0FFFFFF80H

	lea ebx, [WtLR + 32]		; left/right weights, Quadrant II
	add esi, ecx

	lea edx, [edx + edx*2 - 64]

	add esi, edx

	; Quadrant II
	call      MMxInterpolateAndAccumulate

	; add into the left half of rows 0-3
	paddw     mm4, [lAccum+00]

	paddw     mm5, [lAccum+16]

	paddw     mm6, [lAccum+32]

	paddw     mm7, [lAccum+48]

	movq      [lAccum+00], mm4

	movq      [lAccum+16], mm5

	movq      [lAccum+32], mm6

	movq      [lAccum+48], mm7


	; Quadrant III
	add       esi, 4*PITCH				; source: 4 rows down
	add       ebx, 32					; weights: Quadrant III

	call	  MMxInterpolateAndAccumulate

	; add into the left half of rows 4-7
	paddw     mm4, [lAccum+64]

	paddw     mm5, [lAccum+80]

	paddw     mm6, [lAccum+96]

	paddw     mm7, [lAccum+112]

	movq      [lAccum+64], mm4

	movq      [lAccum+80], mm5

	movq      [lAccum+96], mm6
	mov       ebx, 04[lNext]			; ebx = 5 * right neighbor offset, for the next section

	movq      [lAccum+112], mm7


;-----------------------------------------------------------------------;
;																		;
;								Right                                   ;
;																		;
;-----------------------------------------------------------------------;
	; Vector of the right neighbor; contributes to Quadrants I and IV.
	xor ecx, ecx
	mov esi, [lClipY]

	mov cl, i8MVy2[ebp + 4*ebx]         ; cl = right neighbor's y vector
	xor edx, edx

	add cl, 64
	mov dl, i8MVx2[ebp + 4*ebx]			; dl = right neighbor's x vector

	add dl, 64
	mov ebx, [lClipX]

	mov ah, [ecx + esi]
	mov esi, pRefBlock[ebp]  

	mov al, [edx + ebx]
	mov dl, ah

	shl edx, 24
	mov cl, al

	sar edx, 18                  
	xor cl, 080H

	shr ecx, 1
	and edx, 0FFFFFF80H

	lea ebx, [WtLR]				; left/right weights, Quadrant I
	add esi, ecx

	lea edx, [edx + edx*2 - 64]
	add esi, 4					; start at the right half of the block

	add esi, edx

	; Quadrant I
	call      MMxInterpolateAndAccumulate

	; add into the right half of rows 0-3
	paddw     mm4, [lAccum+08]

	paddw     mm5, [lAccum+24]

	paddw     mm6, [lAccum+40]

	paddw     mm7, [lAccum+56]

	movq      [lAccum+08], mm4

	movq      [lAccum+24], mm5

	movq      [lAccum+40], mm6

	movq      [lAccum+56], mm7


	; Quadrant IV
	add       esi, 4*PITCH				; source: 4 rows down
	add       ebx, 96					; weights: Quadrant IV

	call	  MMxInterpolateAndAccumulate

	; add into the right half of rows 4-7
	paddw     mm4, [lAccum+72]

	paddw     mm5, [lAccum+88]

	paddw     mm6, [lAccum+104]

	paddw     mm7, [lAccum+120]

	movq      [lAccum+72], mm4

	movq      [lAccum+88], mm5

	movq      [lAccum+104], mm6
	mov       ebx, 08[lNext]			; ebx = 5 * above neighbor offset, for the next section

	movq      [lAccum+120], mm7


;-----------------------------------------------------------------------;
;																		;
;								Above                                   ;
;																		;
;-----------------------------------------------------------------------;

	; Vector of the above neighbor; contributes to Quadrants I and II.
	; This is the last contribution to the top four rows, so they are
	; shifted, packed and written to the destination here.
	xor ecx, ecx
	mov esi, [lClipY]

	mov cl, i8MVy2[ebp + 4*ebx]         ; cl = above neighbor's y vector
	xor edx, edx

	add cl, 64
	mov dl, i8MVx2[ebp + 4*ebx]			; dl = above neighbor's x vector

	add dl, 64
	mov ebx, [lClipX]

	mov ah, [ecx + esi]
	mov esi, pRefBlock[ebp]  

	mov al, [edx + ebx]
	mov dl, ah

	shl edx, 24
	mov cl, al

	sar edx, 18                  
	xor cl, 080H

	shr ecx, 1
	and edx, 0FFFFFF80H

	lea ebx, [WtAB]				; above/below weights, Quadrant I
	add esi, ecx

	lea edx, [edx + edx*2 - 64]
	add esi, 4					; start at the right half of the block

	add esi, edx

	; Quadrant I
	call      MMxInterpolateAndAccumulate

	; complete the right half of rows 0-3: add the running sums, shift
	; right by 3, and park the words for the pack below
	paddw     mm4, [lAccum+08]

	paddw     mm5, [lAccum+24]
	psraw     mm4, 3

	paddw     mm6, [lAccum+40]
	psraw     mm5, 3

	paddw     mm7, [lAccum+56]
	psraw     mm6, 3

	movq      [lAccum+08], mm4
	psraw     mm7, 3

	movq      [lAccum+24], mm5

	movq      [lAccum+40], mm6

	movq      [lAccum+56], mm7


	; Quadrant II
	sub       esi, 4					; source: back to the left half
	add       ebx, 32					; weights: Quadrant II
	call	  MMxInterpolateAndAccumulate

	; complete the left half of rows 0-3, then pack with unsigned
	; saturation: low 4 bytes from the register (left pixels), high 4
	; bytes from the parked Quadrant I words (right pixels)
	paddw     mm4, [lAccum+00]

	paddw     mm5, [lAccum+16]

	paddw     mm6, [lAccum+32]
	psraw     mm4, 3

	paddw     mm7, [lAccum+48]
	psraw     mm5, 3

	packuswb  mm4, [lAccum+08]

	packuswb  mm5, [lAccum+24]

	movq      [lDst+00], mm4			; destination row 0
	psraw     mm6, 3

	movq      [lDst+PITCH], mm5			; destination row 1
	psraw     mm7, 3

	packuswb  mm6, [lAccum+40]

	packuswb  mm7, [lAccum+56]

	movq      [lDst+2*PITCH], mm6		; destination row 2
	mov       ebx, 12[lNext]			; ebx = 5 * below neighbor offset, for the next section

	movq      [lDst+3*PITCH], mm7		; destination row 3


;-----------------------------------------------------------------------;
;																		;
;								Below                                   ;
;																		;
;-----------------------------------------------------------------------;

	; Vector of the below neighbor; contributes to Quadrants IV and III
	; and finishes the bottom four rows.
	xor ecx, ecx
	mov esi, [lClipY]

	mov cl, i8MVy2[ebp + 4*ebx]         ; cl = below neighbor's y vector
	xor edx, edx

	add cl, 64
	mov dl, i8MVx2[ebp + 4*ebx]			; dl = below neighbor's x vector

	add dl, 64
	mov ebx, [lClipX]

	mov ah, [ecx + esi]
	mov esi, pRefBlock[ebp]  

	mov al, [edx + ebx]
	mov dl, ah

	shl edx, 24
	mov cl, al

	sar edx, 18                  
	xor cl, 080H

	shr ecx, 1
	and edx, 0FFFFFF80H

	lea ebx, [WtAB + 96]		; above/below weights, Quadrant IV
	add esi, ecx

	lea edx, [edx + edx*2 - 64]
	add esi, 4*PITCH+4			; start at the lower right quadrant

	add esi, edx

	; Quadrant IV
	call      MMxInterpolateAndAccumulate

	; complete the right half of rows 4-7 and park it for the pack below
	paddw     mm4, [lAccum+72]

	paddw     mm5, [lAccum+88]
	psraw     mm4, 3

	paddw     mm6, [lAccum+104]
	psraw     mm5, 3

	paddw     mm7, [lAccum+120]
	psraw     mm6, 3

	movq      [lAccum+72], mm4
	psraw     mm7, 3

	movq      [lAccum+88], mm5

	movq      [lAccum+104], mm6

	movq      [lAccum+120], mm7


	; Quadrant III
	sub       esi, 4					; source: back to the left half
	sub       ebx, 32					; weights: Quadrant III

	call	  MMxInterpolateAndAccumulate

	; complete the left half of rows 4-7, pack both halves to bytes and
	; write destination rows 4-7
	paddw     mm4, [lAccum+64]

	paddw     mm5, [lAccum+80]

	paddw     mm6, [lAccum+96]
	psraw     mm4, 3

	paddw     mm7, [lAccum+112]
	psraw     mm5, 3

	packuswb  mm4, [lAccum+72]

	packuswb  mm5, [lAccum+88]

	movq      [lDst+4*PITCH], mm4
	psraw     mm6, 3

	movq      [lDst+5*PITCH], mm5
	psraw     mm7, 3

	packuswb  mm6, [lAccum+104]

	packuswb  mm7, [lAccum+120]

	movq      [lDst+6*PITCH], mm6

	movq      [lDst+7*PITCH], mm7


	; restore the saved registers in reverse push order; the first pop
	; brings back the frame pointer, which then unwinds the aligned frame
	pop 	ebp
	pop 	ebx

	pop 	edi
	pop 	esi

	mov 	esp, ebp

	pop 	ebp

	ret


;--------------------------------------------------------------------------;
;
; Routine:
;	MMxInterpolateAndAccumulate
;
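; Inputs:
;	esi			flat pointer to Reference Block Source.
;               it is already adjusted by the motion vector.
;	 al			x component of motion vector (only bit 0, the half-pel
;               bit, is examined).
;	 ah			y component of motion vector (only bit 0, the half-pel
;               bit, is examined).
;	ebx			flat pointer to Weighting values.
;	mm0			must contain zero; it is used to unpack bytes to words.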
;
; Outputs
;	mm4-mm7     Weighted, interpolated values for rows 0-3.
;               Values are in packed word format.
;
; Description:
;	This routine performs motion compensation interpolation, weights the
;	results, and returns them in mmx registers 4-7.
;	It works on a single 4x4 Quadrant per call.  It is an assembly
;	callable routine with its parameters in registers.
;
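; Register Usage:
;	This routine modifies no integer registers; the flags are changed.
;	mm4-mm7 always receive the results.  mm1-mm3 are used as scratch by
;	the half-pel paths.  mm0 is read but never written.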
;
; Notes:
;    
;--------------------------------------------------------------------------;

; asm input parameters
	lpSrc		EQU		esi				; motion compensated source pointer
	lpWt		EQU		ebx				; pointer to matrix of weights 4x4xWORD

;--------------------------------------------------------------------------;
; MMxInterpolateAndAccumulate
;
; Register-called helper: produces one 4x4 quadrant of weighted prediction.
;
;	In:		esi (lpSrc)	source pointer, already offset by the integer
;						part of the motion vector
;			ebx (lpWt)	16 WORD weights, one 8-byte row after another
;			al bit 0	set when the x vector has a half-pel part
;			ah bit 0	set when the y vector has a half-pel part
;			mm0 (zero)	must be zero (used to widen bytes to words)
;	Out:	mm4-mm7		rows 0-3, four WORDs each: pixel * weight
;	Clobb:	flags; mm1-mm3 on the half-pel paths
;
; Four variants are selected by the two half-pel bits:
;	IAAint_int		no interpolation
;	IAAhalf_int		horizontal: (p[x] + p[x+1] + 1) >> 1
;	IAAint_half		vertical:   (p[y] + p[y+1] + 1) >> 1
;	IAAhalf_half	both:       (four neighbors + 2) >> 2
;
; Rows are PITCH bytes apart.  The paths that only need the four pixels of
; a row load exactly four bytes (movd); the horizontal paths load eight
; (movq) because they also need the fifth pixel.
;--------------------------------------------------------------------------;
MMxInterpolateAndAccumulate:
	test      eax, 100h					; test mvy's parity bit
	jnz       IAAhalf					; jump when it was odd

	test      eax, 1					; test mvx's parity bit
	jnz       IAAhalf_int				; jump when it was odd


; ---- integer x, integer y: weight the source pixels directly ----
IAAint_int:
	movd      mm4, [lpSrc]				; 0 - fetch row

	movd      mm5, [PITCH+lpSrc]		; 1 - fetch row
	punpcklbw mm4, zero					; 0 - unpack row

	pmullw    mm4, 00[lpWt]				; 0 - multiply by weights

	movd      mm6, [PITCH*2+lpSrc]		; 2 - fetch row (only 4 pixels are used)
	punpcklbw mm5, zero					; 1 - unpack row

	pmullw    mm5, 08[lpWt]				; 1 - multiply by weights
	punpcklbw mm6, zero					; 2 - unpack row

	movd      mm7, [PITCH*3+lpSrc]		; 3 - fetch row (only 4 pixels are used)

	pmullw    mm6, 16[lpWt]				; 2 - multiply by weights
	punpcklbw mm7, zero					; 3 - unpack row

	pmullw    mm7, 24[lpWt]				; 3 - multiply by weights

	ret


; ---- half-pel x, integer y: average each pixel with its right neighbor ----
IAAhalf_int:
	movq      mm4, [lpSrc]				; 0 - fetch row

	movq      mm1, mm4					; 0 - copy row
	psrlq     mm4, 8					; 0 - shift row

	movq      mm5, [PITCH+lpSrc]		; 1 - fetch row
	punpcklbw mm4, zero					; 0 - unpack shifted row

	movq      mm6, [PITCH*2+lpSrc]		; 2 - fetch row
	punpcklbw mm1, zero					; 0 - unpack row

	movq      mm2, mm5					; 1 - copy row
	psrlq     mm5, 8					; 1 - shift row

	paddw     mm4, [Round1]				; 0 - add in Round
	punpcklbw mm5, zero					; 1 - unpack shifted row

	paddw     mm4, mm1					; 0 - sum copies of row
	punpcklbw mm2, zero					; 1 - unpack row

	movq      mm3, mm6					; 2 - copy row
	psrlq     mm6, 8					; 2 - shift row

	paddw     mm5, [Round1]				; 1 - add in Round
	punpcklbw mm6, zero					; 2 - unpack shifted row

	paddw     mm5, mm2					; 1 - sum copies of row
	punpcklbw mm3, zero					; 2 - unpack row

	movq      mm7, [PITCH*3+lpSrc]		; 3 - fetch row
	psraw     mm4, 1					; 0 - divide by 2

	pmullw    mm4, 00[lpWt]				; 0 - multiply by weights
	psraw     mm5, 1					; 1 - divide by 2

	movq      mm1, mm7					; 3 - copy row (mm1 is free again)
	psrlq     mm7, 8					; 3 - shift row

	paddw     mm6, [Round1]				; 2 - add in Round
	punpcklbw mm7, zero					; 3 - unpack shifted row

	paddw     mm6, mm3					; 2 - sum copies of row
	punpcklbw mm1, zero					; 3 - unpack row

	paddw     mm7, [Round1]				; 3 - add in Round
	psraw     mm6, 1					; 2 - divide by 2

	pmullw    mm5, 08[lpWt]				; 1 - multiply by weights
	paddw     mm7, mm1					; 3 - sum copies of row

	pmullw    mm6, 16[lpWt]				; 2 - multiply by weights
	psraw     mm7, 1					; 3 - divide by 2

	pmullw    mm7, 24[lpWt]				; 3 - multiply by weights

	ret


; ---- y has a half-pel part: pick the vertical or the two-way variant ----
IAAhalf:
	test      eax, 1					; test mvx's parity bit
	jnz       IAAhalf_half				; jump when it was odd


; ---- integer x, half-pel y: average each row with the row below it ----
IAAint_half:
	movd      mm4, [lpSrc]				; 0 - fetch row

	movd      mm5, [PITCH+lpSrc]		; 1 - fetch row
	punpcklbw mm4, zero					; 0 - unpack row

	movd      mm6, [PITCH*2+lpSrc]		; 2 - fetch row
	punpcklbw mm5, zero					; 1 - unpack row

	paddw     mm4, [Round1]				; 0 - add in Round
	punpcklbw mm6, zero					; 2 - unpack row

	paddw     mm4, mm5					; 0 - sum rows
	paddw     mm5, [Round1]				; 1 - add in Round

	movd      mm7, [PITCH*3+lpSrc]		; 3 - fetch row
	psraw     mm4, 1					; 0 - divide by 2

	pmullw    mm4, 00[lpWt]				; 0 - multiply by weights
	paddw     mm5, mm6					; 1 - sum rows

	movd      mm3, [PITCH*4+lpSrc]		; 4 - fetch row (needed below row 3)
	punpcklbw mm7, zero					; 3 - unpack row

	paddw     mm6, [Round1]				; 2 - add in Round
	psraw     mm5, 1					; 1 - divide by 2

	pmullw    mm5, 08[lpWt]				; 1 - multiply by weights
	punpcklbw mm3, zero					; 4 - unpack row

	paddw     mm6, mm7					; 2 - sum rows
	paddw     mm7, [Round1]				; 3 - add in Round

	paddw     mm7, mm3					; 3 - sum rows
	psraw     mm6, 1					; 2 - divide by 2

	pmullw    mm6, 16[lpWt]				; 2 - multiply by weights
	psraw     mm7, 1					; 3 - divide by 2

	pmullw    mm7, 24[lpWt]				; 3 - multiply by weights

	ret


; ---- half-pel x and y: average the 2x2 neighborhood of each pixel ----
; Each row's horizontal pair sum ("partial sum") is formed once and used
; twice: for its own output row and for the row above it.
IAAhalf_half:
	movq      mm4, [lpSrc]				; 0 - fetch row

	movq      mm5, [PITCH+lpSrc]		; 1 - fetch row
	movq      mm1, mm4					; 0 - copy row

	movq      mm2, mm5					; 1 - copy row
	psrlq     mm4, 8					; 0 - shift row

	movq      mm6, [PITCH*2+lpSrc]		; 2 - fetch row
	punpcklbw mm4, zero					; 0 - unpack shifted row

	movq      mm3, mm6					; 2 - copy row
	punpcklbw mm1, zero					; 0 - unpack row

	paddw     mm4, mm1					; 0 - partial sum both copies of row
	psrlq     mm5, 8					; 1 - shift row

	paddw     mm4, [Round2]				; 0 - add in Round
	punpcklbw mm5, zero					; 1 - unpack shifted row

	movq      mm7, [PITCH*3+lpSrc]		; 3 - fetch row
	punpcklbw mm2, zero					; 1 - unpack row

	paddw     mm5, mm2					; 1 - partial sum both copies of row
	psrlq     mm6, 8					; 2 - shift row

	paddw     mm4, mm5					; 0 - add partial sums
	punpcklbw mm6, zero					; 2 - unpack shifted row

	paddw     mm5, [Round2]				; 1 - add in Round
	punpcklbw mm3, zero					; 2 - unpack row

	paddw     mm6, mm3					; 2 - partial sum both copies of row
	movq      mm1, mm7					; 3 - copy row

	movq      mm2, [PITCH*4+lpSrc]		; 4 - fetch row
	psraw     mm4, 2					; 0 - divide by 4

	paddw     mm5, mm6					; 1 - add partial sums
	psrlq     mm7, 8					; 3 - shift row

	paddw     mm6, [Round2]				; 2 - add in Round
	punpcklbw mm7, zero					; 3 - unpack shifted row

	movq      mm3, mm2					; 4 - copy row
	punpcklbw mm1, zero					; 3 - unpack row

	paddw     mm7, mm1					; 3 - partial sum both copies of row
	psrlq     mm2, 8					; 4 - shift row

	pmullw    mm4, 00[lpWt]				; 0 - multiply by weights
	punpcklbw mm2, zero					; 4 - unpack shifted row

	paddw     mm6, mm7					; 2 - add partial sums
	punpcklbw mm3, zero					; 4 - unpack row

	paddw     mm7, [Round2]				; 3 - add in Round
	psraw     mm5, 2					; 1 - divide by 4

	pmullw    mm5, 08[lpWt]				; 1 - multiply by weights
	paddw     mm2, mm3					; 4 - partial sum both copies of row

	paddw     mm7, mm2					; 3 - add partial sums
	psraw     mm6, 2					; 2 - divide by 4

	pmullw    mm6, 16[lpWt]				; 2 - multiply by weights
	psraw     mm7, 2					; 3 - divide by 4

	pmullw    mm7, 24[lpWt]				; 3 - multiply by weights

	ret
; Close the code segment opened for the two routines above.
MMXCODE1 ENDS

; Column ruler (tens digit above, units digit below) for line-width checks.
;        11111111112222222222333333333344444444445555555555666666666677777777778
;2345678901234567890123456789012345678901234567890123456789012345678901234567890
; End of the assembly module.
END