;--------------------------------------------------------------------------; ; ; INTEL Corporation Proprietary Information ; ; This listing is supplied under the terms of a license ; agreement with INTEL Corporation and may not be copied ; nor disclosed except in accordance with the terms of ; that agreement. ; ; Copyright (c) 1996 Intel Corporation. ; All Rights Reserved. ; ;--------------------------------------------------------------------------; ; ; $Header: S:\h26x\src\dec\d3madvpr.asv 1.6 01 Oct 1996 16:45:38 KLILLEVO $ ; $Log: S:\h26x\src\dec\d3madvpr.asv $ ;// ;// Rev 1.6 01 Oct 1996 16:45:38 KLILLEVO ;// removed unneccessary local variable and added code to verify ;// PITCH is 384 at compile-time ;// ;// Rev 1.5 01 Oct 1996 11:57:52 KLILLEVO ;// pairing done, saved about 5*4 = 20 cycles per block = 11880 cycles ;// per QCIF picture ;// ;// Rev 1.4 27 Sep 1996 17:28:40 KLILLEVO ;// added clipping of extended motion vectors, but pairing is horrible and ;// needs to be improved ;// ;// Rev 1.3 01 Apr 1996 12:35:14 RMCKENZX ;// ;// Added MMXCODE1 and MMXDATA1 segments, moved global data ;// to MMXDATA1 segment. ;// ;// Rev 1.2 07 Mar 1996 18:32:16 RMCKENZX ;// ;// Re-organized and optimized routine. Interpolaters now ;// interpolate & weight, driver accumulates and averages. Interpolaters ;// return results in mm4-mm7. Eliminated include file. ;// ;// Rev 1.0 27 Feb 1996 15:03:42 RMCKENZX ;// Initial revision. 
;
;--------------------------------------------------------------------------;
;
;  File:
;       d3madvpr.asm
;
;  Routines:
;       MMX_AdvancePredict              Driver
;       MMxInterpolateAndAccumulate     Assembly-called interpolate accumulate
;
;--------------------------------------------------------------------------;

.586
.MODEL FLAT

;  make all symbols case sensitive
OPTION CASEMAP:NONE

.xlist
include iammx.inc
.list

MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

;--------------------------------------------------------------------------;
;
;  MMX_AdvancePredict
;
;  Description:
;    This routine performs advanced prediction, including overlapped
;    block motion compensation.  It uses the assembly routine
;    MMxInterpolateAndAccumulate.
;
;    This routine is the assembly equivalent of NewAdvancePredict.
;
;  Inputs:  (dwords pushed onto stack by caller; offsets are relative to
;            ebp after the "push ebp / mov ebp, esp" prologue)
;    fpBlockAction  [ebp+08]  flat pointer to block action stream.
;    iNext          [ebp+12]  flat pointer to offsets for 4 neighboring blocks.
;                               0 = left
;                               1 = right
;                               2 = above
;                               3 = below
;                             Each offset is multiplied by 5 and then scaled
;                             by 4 when it is used as an index, i.e. it is
;                             counted in 20-byte block action entries.
;    pDst           [ebp+16]  destination pointer.
;    pClipX         [ebp+20]  x vector clipping table.
;    pClipY         [ebp+24]  y vector clipping table.
;
;  Outputs:
;    An 8x8 block of bytes is written at pDst, one row every PITCH bytes.
;
;  Register Usage:
;    ebp        block action stream pointer (after the prologue)
;    edi        destination pointer (lDst)
;    esi        clipping table pointer, then reference source pointer
;    ebx        neighbor offset / clipping table pointer, then weight pointer
;    al, ah     clipped x and y motion vector components
;    ecx, edx   scratch
;    mm0        zero (used for unpacking)
;    mm3        rounding constant
;    mm4-mm7    interpolated rows returned by MMxInterpolateAndAccumulate
;
;    esi, edi, ebx and ebp are saved and restored.
;    eax, ecx and edx are not preserved.
;
;  Notes:
;    The esp-relative locals below are only valid after the prologue has
;    aligned esp, reserved FRAMESIZE bytes and pushed the four saved
;    registers.
;    No EMMS is executed by this routine; presumably the caller takes care
;    of that - TODO confirm against the C caller.
;
;--------------------------------------------------------------------------;

;   register storage
;       ebp             esp+00
;       ebx             esp+04
;       edi             esp+08
;       esi             esp+12

;   local variable definitions
;   (esp+20 is not used; lNext lives at esp+32 so that its 16 bytes do not
;    overlap lClipX / lClipY)
lpBlockAction   EQU     esp+16      ; local block action stream pointer
lClipX          EQU     esp+24      ; local copy of pointer to x vector clipping table
lClipY          EQU     esp+28      ; local copy of pointer to y vector clipping table
lNext           EQU     esp+32      ; local offsets (4 DWORDS = 16 bytes)
lAccum          EQU     esp+64      ; accumulator (64 WORDS = 128 bytes)

zero            EQU     mm0
lDst            EQU     edi         ; local destination pointer

;   C input parameters
;   All of them are addressed from ebp, so they stay valid after esp has been
;   realigned (until ebp itself is reloaded with the block action pointer).
fpBlockAction   EQU     ebp+08      ; block action stream pointer
iNext           EQU     ebp+12      ; block action offsets pointer
pDst            EQU     ebp+16      ; destination pointer
pClipX          EQU     ebp+20      ; x vector clipping table
pClipY          EQU     ebp+24      ; y vector clipping table

;   MMX globals
;   the weight tables are each 64 WORDS stored in Quadrant ascending order
WtCt            EQU     gMMX_WeightCenter
WtLR            EQU     gMMX_WeightLeftRight
WtAB            EQU     gMMX_WeightAboveBelow
Round1          EQU     gMMX_Round1
Round2          EQU     gMMX_Round2
Round4          EQU     gMMX_Round4

PITCH           =       384         ; bytes between rows (checked at assembly time)
FRAMESIZE       =       256         ; bytes reserved on the stack for locals

; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****
;
; ANY CHANGES TO THE BLOCK ACTION STRUCTURE
; IN d3dec.h MUST BE ECHOED HERE!!!!
;
; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****

; Offsets into Block Action structure T_BlkAction of length 20
; see the definition in d3dec.h
i8MVx2          =       1           ; I8  = signed byte
i8MVy2          =       2           ; I8  = signed byte
pRefBlock       =       8           ; U32 = unsigned dword

MMXDATA1 SEGMENT
ALIGN 8

; Each weight table holds four 4x4 quadrants of WORDs, one row of a
; quadrant per source line below.  At every pixel position the center,
; left/right and above/below weights add up to 8, which matches the
; rounding constant of 4 and the final shift right by 3 in the driver.

gMMX_WeightCenter LABEL DWORD
        WORD    5, 5, 5, 4          ; Quadrant I
        WORD    5, 5, 5, 5
        WORD    6, 6, 5, 5
        WORD    6, 6, 5, 5
        WORD    4, 5, 5, 5          ; Quadrant II
        WORD    5, 5, 5, 5
        WORD    5, 5, 6, 6
        WORD    5, 5, 6, 6
        WORD    5, 5, 6, 6          ; Quadrant III
        WORD    5, 5, 6, 6
        WORD    5, 5, 5, 5
        WORD    4, 5, 5, 5
        WORD    6, 6, 5, 5          ; Quadrant IV
        WORD    6, 6, 5, 5
        WORD    5, 5, 5, 5
        WORD    5, 5, 5, 4

gMMX_WeightLeftRight LABEL DWORD
        WORD    1, 1, 1, 2          ; Quadrant I
        WORD    1, 1, 2, 2
        WORD    1, 1, 2, 2
        WORD    1, 1, 2, 2
        WORD    2, 1, 1, 1          ; Quadrant II
        WORD    2, 2, 1, 1
        WORD    2, 2, 1, 1
        WORD    2, 2, 1, 1
        WORD    2, 2, 1, 1          ; Quadrant III
        WORD    2, 2, 1, 1
        WORD    2, 2, 1, 1
        WORD    2, 1, 1, 1
        WORD    1, 1, 2, 2          ; Quadrant IV
        WORD    1, 1, 2, 2
        WORD    1, 1, 2, 2
        WORD    1, 1, 1, 2

gMMX_WeightAboveBelow LABEL DWORD
        WORD    2, 2, 2, 2          ; Quadrant I
        WORD    2, 2, 1, 1
        WORD    1, 1, 1, 1
        WORD    1, 1, 1, 1
        WORD    2, 2, 2, 2          ; Quadrant II
        WORD    1, 1, 2, 2
        WORD    1, 1, 1, 1
        WORD    1, 1, 1, 1
        WORD    1, 1, 1, 1          ; Quadrant III
        WORD    1, 1, 1, 1
        WORD    1, 1, 2, 2
        WORD    2, 2, 2, 2
        WORD    1, 1, 1, 1          ; Quadrant IV
        WORD    1, 1, 1, 1
        WORD    2, 2, 1, 1
        WORD    2, 2, 2, 2

; rounding constants, one copy per packed word
gMMX_Round1     DWORD   00010001h, 00010001h
gMMX_Round2     DWORD   00020002h, 00020002h
gMMX_Round4     DWORD   00040004h, 00040004h

MMXDATA1 ENDS

;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
MMXCODE1 SEGMENT

PUBLIC C MMX_AdvancePredict

; The row address arithmetic below multiplies by 3*128, so refuse to
; assemble for any other pitch.
IF PITCH-384
** error: this code assumes PITCH is 384
ENDIF

;--------------------------------------------------------------------------;
;
;  AP_REF_FROM_MV  BlkRef, WtPtr, SrcAdj
;
;  Expands in line (no call) to the motion vector fetch that every section
;  of the driver performs:
;
;    BlkRef   address expression of the block action entry whose vector is
;             used (the current block or one of its neighbors)
;    WtPtr    address expression loaded into ebx (weights of first quadrant)
;    SrcAdj   constant added to the source pointer, 0 for none
;
;  On exit:
;    al   = [lClipX] table entry for index (i8MVx2 + 64)
;    ah   = [lClipY] table entry for index (i8MVy2 + 64)
;    esi  = pRefBlock[ebp] + (al >> 1) + PITCH * (ah >> 1) + SrcAdj
;           (arithmetic shifts, al and ah taken as signed bytes)
;    ebx  = WtPtr
;    ecx, edx are scratch
;
;--------------------------------------------------------------------------;
AP_REF_FROM_MV MACRO BlkRef, WtPtr, SrcAdj
        xor     ecx, ecx
        mov     esi, [lClipY]
        mov     cl, i8MVy2[BlkRef]              ; raw y vector
        xor     edx, edx
        add     cl, 64                          ; bias into the clipping table
        mov     dl, i8MVx2[BlkRef]              ; raw x vector
        add     dl, 64
        mov     ebx, [lClipX]
        mov     ah, [ecx + esi]                 ; ah = clipped y vector
        mov     esi, pRefBlock[ebp]             ; always the current block's reference
        mov     al, [edx + ebx]                 ; al = clipped x vector
        mov     dl, ah
        shl     edx, 24                         ; move y to the top byte ...
        mov     cl, al
        sar     edx, 18                         ; ... edx = 64 * y, sign extended
        xor     cl, 080H                        ; cl = x + 128 as an unsigned byte
        shr     ecx, 1                          ; ecx = (x >> 1) + 64
        and     edx, 0FFFFFF80H                 ; drop half pel bit: 128 * (y >> 1)
        lea     ebx, [WtPtr]
        add     esi, ecx
        lea     edx, [edx + edx*2 - 64]         ; PITCH * (y >> 1), less the 64 bias
    IF (SrcAdj) NE 0
        add     esi, SrcAdj
    ENDIF
        add     esi, edx
ENDM

;--------------------------------------------------------------------------;
;   Start Code
;--------------------------------------------------------------------------;
MMX_AdvancePredict:
        push    ebp
        mov     ebp, esp
        mov     edx, [iNext]                    ; edx -> four neighbor offsets
        and     esp, -32                        ; align stack on cache boundary
        sub     esp, FRAMESIZE
        pxor    zero, zero                      ; zero for unpacking
        push    esi
        push    edi
        push    ebx
        push    ebp                             ; keeps the frame pointer for the epilogue

        ; copy the parameters into locals / registers while ebp is
        ; still the frame pointer
        mov     eax, [pClipX]
        mov     ebx, [pClipY]
        mov     [lClipX], eax
        mov     [lClipY], ebx
        mov     lDst, [pDst]
        mov     eax, 00[edx]
        mov     ebp, [fpBlockAction]            ; ebp = block action pointer from here on

        ; lNext[i] = 5 * iNext[i]; it is scaled by 4 again when used
        mov     ebx, 04[edx]
        lea     eax, [eax+4*eax]
        mov     ecx, 08[edx]
        lea     ebx, [ebx+4*ebx]
        mov     edx, 12[edx]
        lea     ecx, [ecx+4*ecx]
        mov     00[lNext], eax
        lea     edx, [edx+4*edx]
        mov     04[lNext], ebx
        mov     08[lNext], ecx
        mov     12[lNext], edx

;-----------------------------------------------------------------------;
;   Center: the block's own vector, all four quadrants.  Starts the     ;
;   accumulator with weighted value + 4 (rounding).                     ;
;-----------------------------------------------------------------------;
        AP_REF_FROM_MV <ebp>, <WtCt + 32>, 0

        ; Quadrant II -> left half of accumulator rows 0-3
        call    MMxInterpolateAndAccumulate
        movq    mm3, [Round4]
        paddw   mm4, mm3
        add     esi, 4                          ; source: move right 4 pixels
        paddw   mm5, mm3
        sub     ebx, 32                         ; weights: Quadrant I
        movq    [lAccum+00], mm4
        paddw   mm6, mm3
        movq    [lAccum+16], mm5
        paddw   mm7, mm3
        movq    [lAccum+32], mm6
        movq    [lAccum+48], mm7

        ; Quadrant I -> right half of accumulator rows 0-3
        call    MMxInterpolateAndAccumulate
        movq    mm3, [Round4]
        paddw   mm4, mm3
        add     esi, 4*PITCH-4                  ; source: down 4 rows, back left
        paddw   mm5, mm3
        add     ebx, 64                         ; weights: Quadrant III
        movq    [lAccum+08], mm4
        paddw   mm6, mm3
        movq    [lAccum+24], mm5
        paddw   mm7, mm3
        movq    [lAccum+40], mm6
        movq    [lAccum+56], mm7

        ; Quadrant III -> left half of accumulator rows 4-7
        call    MMxInterpolateAndAccumulate
        movq    mm3, [Round4]
        paddw   mm4, mm3
        add     esi, 4                          ; source: move right 4 pixels
        paddw   mm5, mm3
        add     ebx, 32                         ; weights: Quadrant IV
        movq    [lAccum+64], mm4
        paddw   mm6, mm3
        movq    [lAccum+80], mm5
        paddw   mm7, mm3
        movq    [lAccum+96], mm6
        movq    [lAccum+112], mm7

        ; Quadrant IV -> right half of accumulator rows 4-7
        call    MMxInterpolateAndAccumulate
        movq    mm3, [Round4]
        paddw   mm4, mm3
        mov     ebx, 00[lNext]                  ; offset of the left neighbor
        paddw   mm5, mm3
        movq    [lAccum+72], mm4
        paddw   mm6, mm3
        movq    [lAccum+88], mm5
        paddw   mm7, mm3
        movq    [lAccum+104], mm6
        movq    [lAccum+120], mm7

;-----------------------------------------------------------------------;
;   Left: left neighbor's vector, added into Quadrants II and III.      ;
;-----------------------------------------------------------------------;
        AP_REF_FROM_MV <ebp + 4*ebx>, <WtLR + 32>, 0

        ; Quadrant II
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+00]
        paddw   mm5, [lAccum+16]
        paddw   mm6, [lAccum+32]
        paddw   mm7, [lAccum+48]
        movq    [lAccum+00], mm4
        movq    [lAccum+16], mm5
        movq    [lAccum+32], mm6
        movq    [lAccum+48], mm7

        ; Quadrant III
        add     esi, 4*PITCH
        add     ebx, 32
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+64]
        paddw   mm5, [lAccum+80]
        paddw   mm6, [lAccum+96]
        paddw   mm7, [lAccum+112]
        movq    [lAccum+64], mm4
        movq    [lAccum+80], mm5
        movq    [lAccum+96], mm6
        mov     ebx, 04[lNext]                  ; offset of the right neighbor
        movq    [lAccum+112], mm7

;-----------------------------------------------------------------------;
;   Right: right neighbor's vector, added into Quadrants I and IV.      ;
;-----------------------------------------------------------------------;
        AP_REF_FROM_MV <ebp + 4*ebx>, <WtLR>, 4

        ; Quadrant I
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+08]
        paddw   mm5, [lAccum+24]
        paddw   mm6, [lAccum+40]
        paddw   mm7, [lAccum+56]
        movq    [lAccum+08], mm4
        movq    [lAccum+24], mm5
        movq    [lAccum+40], mm6
        movq    [lAccum+56], mm7

        ; Quadrant IV
        add     esi, 4*PITCH
        add     ebx, 96
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+72]
        paddw   mm5, [lAccum+88]
        paddw   mm6, [lAccum+104]
        paddw   mm7, [lAccum+120]
        movq    [lAccum+72], mm4
        movq    [lAccum+88], mm5
        movq    [lAccum+104], mm6
        mov     ebx, 08[lNext]                  ; offset of the neighbor above
        movq    [lAccum+120], mm7

;-----------------------------------------------------------------------;
;   Above: upper neighbor's vector, Quadrants I and II.  The sums are   ;
;   now complete: shift right by 3, pack to bytes, write rows 0-3.      ;
;-----------------------------------------------------------------------;
        AP_REF_FROM_MV <ebp + 4*ebx>, <WtAB>, 4

        ; Quadrant I: finished words go back to the accumulator so that
        ; they can be packed together with Quadrant II
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+08]
        paddw   mm5, [lAccum+24]
        psraw   mm4, 3
        paddw   mm6, [lAccum+40]
        psraw   mm5, 3
        paddw   mm7, [lAccum+56]
        psraw   mm6, 3
        movq    [lAccum+08], mm4
        psraw   mm7, 3
        movq    [lAccum+24], mm5
        movq    [lAccum+40], mm6
        movq    [lAccum+56], mm7

        ; Quadrant II: low four bytes of each row come from this quadrant,
        ; high four bytes from the Quadrant I words saved above
        sub     esi, 4
        add     ebx, 32
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+00]
        paddw   mm5, [lAccum+16]
        paddw   mm6, [lAccum+32]
        psraw   mm4, 3
        paddw   mm7, [lAccum+48]
        psraw   mm5, 3
        packuswb mm4, [lAccum+08]
        packuswb mm5, [lAccum+24]
        movq    [lDst+00], mm4
        psraw   mm6, 3
        movq    [lDst+PITCH], mm5
        psraw   mm7, 3
        packuswb mm6, [lAccum+40]
        packuswb mm7, [lAccum+56]
        movq    [lDst+2*PITCH], mm6
        mov     ebx, 12[lNext]                  ; offset of the neighbor below
        movq    [lDst+3*PITCH], mm7

;-----------------------------------------------------------------------;
;   Below: lower neighbor's vector, Quadrants IV and III.  Finishes     ;
;   and writes rows 4-7.                                                ;
;-----------------------------------------------------------------------;
        AP_REF_FROM_MV <ebp + 4*ebx>, <WtAB + 96>, 4*PITCH+4

        ; Quadrant IV
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+72]
        paddw   mm5, [lAccum+88]
        psraw   mm4, 3
        paddw   mm6, [lAccum+104]
        psraw   mm5, 3
        paddw   mm7, [lAccum+120]
        psraw   mm6, 3
        movq    [lAccum+72], mm4
        psraw   mm7, 3
        movq    [lAccum+88], mm5
        movq    [lAccum+104], mm6
        movq    [lAccum+120], mm7

        ; Quadrant III
        sub     esi, 4
        sub     ebx, 32
        call    MMxInterpolateAndAccumulate
        paddw   mm4, [lAccum+64]
        paddw   mm5, [lAccum+80]
        paddw   mm6, [lAccum+96]
        psraw   mm4, 3
        paddw   mm7, [lAccum+112]
        psraw   mm5, 3
        packuswb mm4, [lAccum+72]
        packuswb mm5, [lAccum+88]
        movq    [lDst+4*PITCH], mm4
        psraw   mm6, 3
        movq    [lDst+5*PITCH], mm5
        psraw   mm7, 3
        packuswb mm6, [lAccum+104]
        packuswb mm7, [lAccum+120]
        movq    [lDst+6*PITCH], mm6
        movq    [lDst+7*PITCH], mm7

        ; epilogue: undo the four pushes, then drop the aligned frame
        pop     ebp                             ; frame pointer saved by the prologue
        pop     ebx
        pop     edi
        pop     esi
        mov     esp, ebp
        pop     ebp
        ret

;--------------------------------------------------------------------------;
;
;  Routine:
;    MMxInterpolateAndAccumulate
;
;  Inputs:
;    esi        flat pointer to Reference Block Source.
;               it is already adjusted by the motion vector.
;    al         x component of motion vector.
;    ah         y component of motion vector.
;    ebx        flat pointer to Weighting values.
;
;  Outputs
;    mm4-mm7    Weighted, interpolated values for rows 0-3.
;               Values are in packed word format.
;
;  Description:
;    This routine performs motion compensation interpolation, weights the
;    results, and returns them in mmx registers 4-7.
;    It works on a single 4x4 Quadrant per call.  It is an assembly
;    callable routine with its parameters in registers.
;
;  Register Usage:
;    This routine modifies no integer registers.
;    mm1-mm7 may be modified; mm0 is read (as zero) but never written.
;
;  Notes:
;    mm0 must hold zero on entry; it is used to unpack bytes to words.
;    Only bit 0 of al and bit 0 of ah (bit 8 of eax) are examined; they
;    select one of four paths:
;       IAAint_int      neither bit set   copy
;       IAAhalf_int     x bit only        (a + b + 1) >> 1, horizontal pair
;       IAAint_half     y bit only        (a + b + 1) >> 1, vertical pair
;       IAAhalf_half    both bits         (a + b + c + d + 2) >> 2
;    Each result row is multiplied by the 4 weight words at lpWt + 8*row.
;    The x-interpolating paths read 8 bytes per source row (5 are used);
;    the y-interpolating paths read 5 source rows.  The integer paths
;    read exactly 4 bytes per row.
;    The instruction order interleaves the rows; the number in front of
;    each comment is the row the instruction works on.
;
;--------------------------------------------------------------------------;

;   asm input parameters
lpSrc           EQU     esi         ; motion compensated source pointer
lpWt            EQU     ebx         ; pointer to matrix of weights 4x4xWORD

MMxInterpolateAndAccumulate:
        test    eax, 100h                       ; test mvy's parity bit
        jnz     IAAhalf                         ; jump when it was odd
        test    eax, 1                          ; test mvx's parity bit
        jnz     IAAhalf_int                     ; jump when it was odd

; ---- x integer, y integer: weight the source pixels directly ----------
; Every row is fetched with movd: only the low 4 bytes are unpacked, so
; an 8-byte fetch would read past the quadrant for no benefit.
IAAint_int:
        movd    mm4, [lpSrc]                    ; 0 - fetch row
        movd    mm5, [PITCH+lpSrc]              ; 1 - fetch row
        punpcklbw mm4, zero                     ; 0 - unpack row
        pmullw  mm4, 00[lpWt]                   ; 0 - multiply by weights
        movd    mm6, [PITCH*2+lpSrc]            ; 2 - fetch row
        punpcklbw mm5, zero                     ; 1 - unpack row
        pmullw  mm5, 08[lpWt]                   ; 1 - multiply by weights
        punpcklbw mm6, zero                     ; 2 - unpack row
        movd    mm7, [PITCH*3+lpSrc]            ; 3 - fetch row
        pmullw  mm6, 16[lpWt]                   ; 2 - multiply by weights
        punpcklbw mm7, zero                     ; 3 - unpack row
        pmullw  mm7, 24[lpWt]                   ; 3 - multiply by weights
        ret

; ---- x half, y integer: average each pixel with its right neighbor ----
; movq is required here: the copy shifted right by one byte needs byte 4.
IAAhalf_int:
        movq    mm4, [lpSrc]                    ; 0 - fetch row
        movq    mm1, mm4                        ; 0 - copy row
        psrlq   mm4, 8                          ; 0 - shift row
        movq    mm5, [PITCH+lpSrc]              ; 1 - fetch row
        punpcklbw mm4, zero                     ; 0 - unpack shifted row
        movq    mm6, [PITCH*2+lpSrc]            ; 2 - fetch row
        punpcklbw mm1, zero                     ; 0 - unpack row
        movq    mm2, mm5                        ; 1 - copy row
        psrlq   mm5, 8                          ; 1 - shift row
        paddw   mm4, [Round1]                   ; 0 - add in Round
        punpcklbw mm5, zero                     ; 1 - unpack shifted row
        paddw   mm4, mm1                        ; 0 - sum copies of row
        punpcklbw mm2, zero                     ; 1 - unpack row
        movq    mm3, mm6                        ; 2 - copy row
        psrlq   mm6, 8                          ; 2 - shift row
        paddw   mm5, [Round1]                   ; 1 - add in Round
        punpcklbw mm6, zero                     ; 2 - unpack shifted row
        paddw   mm5, mm2                        ; 1 - sum copies of row
        punpcklbw mm3, zero                     ; 2 - unpack row
        movq    mm7, [PITCH*3+lpSrc]            ; 3 - fetch row
        psraw   mm4, 1                          ; 0 - divide by 2
        pmullw  mm4, 00[lpWt]                   ; 0 - multiply by weights
        psraw   mm5, 1                          ; 1 - divide by 2
        movq    mm1, mm7                        ; 3 - copy row
        psrlq   mm7, 8                          ; 3 - shift row
        paddw   mm6, [Round1]                   ; 2 - add in Round
        punpcklbw mm7, zero                     ; 3 - unpack shifted row
        paddw   mm6, mm3                        ; 2 - sum copies of row
        punpcklbw mm1, zero                     ; 3 - unpack row
        paddw   mm7, [Round1]                   ; 3 - add in Round
        psraw   mm6, 1                          ; 2 - divide by 2
        pmullw  mm5, 08[lpWt]                   ; 1 - multiply by weights
        paddw   mm7, mm1                        ; 3 - sum copies of row
        pmullw  mm6, 16[lpWt]                   ; 2 - multiply by weights
        psraw   mm7, 1                          ; 3 - divide by 2
        pmullw  mm7, 24[lpWt]                   ; 3 - multiply by weights
        ret

IAAhalf:
        test    eax, 1                          ; test mvx's parity bit
        jnz     IAAhalf_half                    ; jump when it was odd

; ---- x integer, y half: average each row with the row below it --------
IAAint_half:
        movd    mm4, [lpSrc]                    ; 0 - fetch row
        movd    mm5, [PITCH+lpSrc]              ; 1 - fetch row
        punpcklbw mm4, zero                     ; 0 - unpack row
        movd    mm6, [PITCH*2+lpSrc]            ; 2 - fetch row
        punpcklbw mm5, zero                     ; 1 - unpack row
        paddw   mm4, [Round1]                   ; 0 - add in Round
        punpcklbw mm6, zero                     ; 2 - unpack row
        paddw   mm4, mm5                        ; 0 - sum rows
        paddw   mm5, [Round1]                   ; 1 - add in Round
        movd    mm7, [PITCH*3+lpSrc]            ; 3 - fetch row
        psraw   mm4, 1                          ; 0 - divide by 2
        pmullw  mm4, 00[lpWt]                   ; 0 - multiply by weights
        paddw   mm5, mm6                        ; 1 - sum rows
        movd    mm3, [PITCH*4+lpSrc]            ; 4 - fetch row
        punpcklbw mm7, zero                     ; 3 - unpack row
        paddw   mm6, [Round1]                   ; 2 - add in Round
        psraw   mm5, 1                          ; 1 - divide by 2
        pmullw  mm5, 08[lpWt]                   ; 1 - multiply by weights
        punpcklbw mm3, zero                     ; 4 - unpack row
        paddw   mm6, mm7                        ; 2 - sum rows
        paddw   mm7, [Round1]                   ; 3 - add in Round
        paddw   mm7, mm3                        ; 3 - sum rows
        psraw   mm6, 1                          ; 2 - divide by 2
        pmullw  mm6, 16[lpWt]                   ; 2 - multiply by weights
        psraw   mm7, 1                          ; 3 - divide by 2
        pmullw  mm7, 24[lpWt]                   ; 3 - multiply by weights
        ret

; ---- x half, y half: average each 2x2 neighborhood ---------------------
; A "partial sum" is a row added to its own copy shifted by one pixel;
; two vertically adjacent partial sums plus Round2 are then divided by 4.
IAAhalf_half:
        movq    mm4, [lpSrc]                    ; 0 - fetch row
        movq    mm5, [PITCH+lpSrc]              ; 1 - fetch row
        movq    mm1, mm4                        ; 0 - copy row
        movq    mm2, mm5                        ; 1 - copy row
        psrlq   mm4, 8                          ; 0 - shift row
        movq    mm6, [PITCH*2+lpSrc]            ; 2 - fetch row
        punpcklbw mm4, zero                     ; 0 - unpack shifted row
        movq    mm3, mm6                        ; 2 - copy row
        punpcklbw mm1, zero                     ; 0 - unpack row
        paddw   mm4, mm1                        ; 0 - partial sum both copies of row
        psrlq   mm5, 8                          ; 1 - shift row
        paddw   mm4, [Round2]                   ; 0 - add in Round
        punpcklbw mm5, zero                     ; 1 - unpack shifted row
        movq    mm7, [PITCH*3+lpSrc]            ; 3 - fetch row
        punpcklbw mm2, zero                     ; 1 - unpack row
        paddw   mm5, mm2                        ; 1 - partial sum both copies of row
        psrlq   mm6, 8                          ; 2 - shift row
        paddw   mm4, mm5                        ; 0 - add partial sums
        punpcklbw mm6, zero                     ; 2 - unpack shifted row
        paddw   mm5, [Round2]                   ; 1 - add in Round
        punpcklbw mm3, zero                     ; 2 - unpack row
        paddw   mm6, mm3                        ; 2 - partial sum both copies of row
        movq    mm1, mm7                        ; 3 - copy row
        movq    mm2, [PITCH*4+lpSrc]            ; 4 - fetch row
        psraw   mm4, 2                          ; 0 - divide by 4
        paddw   mm5, mm6                        ; 1 - add partial sums
        psrlq   mm7, 8                          ; 3 - shift row
        paddw   mm6, [Round2]                   ; 2 - add in Round
        punpcklbw mm7, zero                     ; 3 - unpack shifted row
        movq    mm3, mm2                        ; 4 - copy row
        punpcklbw mm1, zero                     ; 3 - unpack row
        paddw   mm7, mm1                        ; 3 - partial sum both copies of row
        psrlq   mm2, 8                          ; 4 - shift row
        pmullw  mm4, 00[lpWt]                   ; 0 - multiply by weights
        punpcklbw mm2, zero                     ; 4 - unpack shifted row
        paddw   mm6, mm7                        ; 2 - add partial sums
        punpcklbw mm3, zero                     ; 4 - unpack row
        paddw   mm7, [Round2]                   ; 3 - add in Round
        psraw   mm5, 2                          ; 1 - divide by 4
        pmullw  mm5, 08[lpWt]                   ; 1 - multiply by weights
        paddw   mm2, mm3                        ; 4 - partial sum both copies of row
        paddw   mm7, mm2                        ; 3 - add partial sums
        psraw   mm6, 2                          ; 2 - divide by 4
        pmullw  mm6, 16[lpWt]                   ; 2 - multiply by weights
        psraw   mm7, 2                          ; 3 - divide by 4
        pmullw  mm7, 24[lpWt]                   ; 3 - multiply by weights
        ret

MMXCODE1 ENDS

;        11111111112222222222333333333344444444445555555555666666666677777777778
;2345678901234567890123456789012345678901234567890123456789012345678901234567890
END