;//////////////////////////////////////////////////////////////////////////// ;// ;// INTEL CORPORATION PROPRIETARY INFORMATION ;// ;// This software is supplied under the terms of a license ;// agreement or nondisclosure agreement with Intel Corporation ;// and may not be copied or disclosed except in accordance ;// with the terms of that agreement. ;// ;//////////////////////////////////////////////////////////////////////////// ;// ;// $Header: R:\h26x\h26x\src\enc\e3mbme.asv 1.5 18 Oct 1996 16:57:08 BNICKERS $ ;// ;// $Log: R:\h26x\h26x\src\enc\e3mbme.asv $ ;// ;// Rev 1.5 18 Oct 1996 16:57:08 BNICKERS ;// Fixes for EMV ;// ;// Rev 1.4 12 Sep 1996 10:56:16 BNICKERS ;// Add arguments for thresholds and differentials. ;// ;// Rev 1.3 22 Jul 1996 15:22:48 BNICKERS ;// Reduce code size. Implement H261 spatial filter. ;// ;// Rev 1.2 14 May 1996 12:18:48 BNICKERS ;// Initial debugging of MMx B-Frame ME. ;// ;// Rev 1.1 03 May 1996 14:03:30 BNICKERS ;// ;// Minor bug fixes and integration refinements. ;// ;// Rev 1.0 02 May 1996 12:00:56 BNICKERS ;// Initial revision. ;// ;//////////////////////////////////////////////////////////////////////////// ; ; MMxBFrameMotionEstimation -- This function performs motion estimation for the ; B frame macroblocks identified in the input list. ; This is the MMx version. ; OPTION M510 OPTION CASEMAP:NONE BFRMNONZEROMVDIFFERENTIAL = 400 BFRMEMPTYTHRESHOLD = 256 .xlist include e3inst.inc include memmodel.inc include iammx.inc include exEDTQ.inc include e3mbad.inc .list .CODE EDTQ EXTERN MMxDoForwardDCT:NEAR PUBLIC MMxDoBFrameLumaBlocks PUBLIC MMxDoBFrameChromaBlocks StackOffset TEXTEQU <4> CONST_384 TEXTEQU MMxDoBFrameLumaBlocks: mov eax,QPDiv2 ; Swap these so Quantizer uses right level.
;
; --- MMxDoBFrameLumaBlocks (entry label is at the end of the line above) ---
; Entry: the code exchanges QPDiv2/BQPDiv2, CodeStreamCursor/BCodeStreamCursor
; and Recip2QPToUse/BRecip2QPToUse, and stores INTER1MV in StashBlockType.
; The same three pairs are exchanged back just before the routine's ret.
;
; BFrameSWDLoop_0MV then runs once per luma block descriptor addressed by
; edx.  It looks up the forward and backward motion-vector components in the
; WeightForwardMotion table (the backward entries are read at offset +64),
; records them in the HMVf0Delta/VMVf0Delta/HMVb0Delta/VMVb0Delta fields,
; calls ComputeBFrameSWDForCandRef, and stores mm7 in BlkLvlSWD0Delta.  The
; loop exits once (dl AND 4*SIZEOF T_Blk) is non-zero.
;
; If the P-frame macroblock uses a zero motion vector with block type
; INTER1MV, or the fourth block's 0-MV SWD is not above
; BFrmZeroVectorThreshold, control goes to BelowBFrmZeroThreshold and no
; search is done.
;
; NOTE(review): mm7 is accumulated (paddd) by ComputeBFrameSWDForCandRef and
; is not cleared inside the 0-MV loop; presumably the caller zeroes it --
; confirm.
; NOTE(review): the operand of "CONST_384 TEXTEQU" ahead of the entry label is
; not visible in this copy; angle-bracketed text may have been lost in
; extraction -- confirm against the original file.
;
mov ebx,BQPDiv2 mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov cl,INTER1MV mov BRecip2QPToUse,eax mov StashBlockType,cl BFrameSWDLoop_0MV: mov ecx,[edx].BlkY1.MVs xor ebx,ebx mov bl,[edx].BlkY1.PVMV ; P-frame Vertical MV lea edi,WeightForwardMotion xor eax,eax and ecx,0FFH ; P-frame Horizontal MV mov al,[edi+ebx] ; VMV for past ref. mov bl,[edi+ebx+64] ; VMV for future ref. mov [edx].BlkY1.VMVb0Delta,bl mov bl,[edi+ecx+64] ; HMV for future ref. mov [edx].BlkY1.HMVb0Delta,bl mov bl,[edi+ecx] ; HMV for past ref. mov [edx].BlkY1.VMVf0Delta,al ; Record candidate VMVf. xor ecx,ecx ; Keep pairing happy. mov [edx].BlkY1.HMVf0Delta,bl ; Record candidate HMVf. mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame. call ComputeBFrameSWDForCandRef movdf [edx].BlkY1.BlkLvlSWD0Delta,mm7 ; Stash SWD. add edx,SIZEOF T_Blk lea edi,WeightForwardMotion test dl,4*SIZEOF T_Blk ; Quit when fourth block done. je BFrameSWDLoop_0MV mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs mov cl,[edx-4*SIZEOF T_Blk].BlockType xor cl,INTER1MV or al,ah lea esi,[edx-4*SIZEOF T_Blk] ; Reset MacroBlockActionDescr cursor. or al,cl mov ecx,[edx-SIZEOF T_Blk].BlkY1.BlkLvlSWD0Delta je BelowBFrmZeroThreshold ; Jump if P frm macroblock uses 0 motion vector. xor eax,eax cmp ecx,BFrmZeroVectorThreshold mov CurrSWDState,eax ; Record ME engine state. jle BelowBFrmZeroThreshold mov edx,[esi].BlkY1.BlkLvlSWD0Delta ; Remember 0-MV SWDs. mov ecx,[esi].BlkY2.BlkLvlSWD0Delta mov [esi].BlkY1.BestBlkLvlSWD,edx mov [esi].BlkY2.BestBlkLvlSWD,ecx mov edx,[esi].BlkY3.BlkLvlSWD0Delta mov ecx,[esi].BlkY4.BlkLvlSWD0Delta mov [esi].BlkY3.BestBlkLvlSWD,edx mov [esi].BlkY4.BestBlkLvlSWD,ecx mov [esi].BlkU.BestBlkLvlSWD,ecx ; Avoid unintended early out, below. xor edx,edx ; Set best MV to zero. BFrmSWDLoop: mov ecx,PD BFrmSWDState[eax] ; cl == HMV; ch == VMV offsets to try.
;
; --- BFrmSWDLoop (label at the end of the line above) ---
; ecx holds the next horizontal/vertical delta pair read from
; BFrmSWDState[eax].  The deltas are added to the best MV so far (dl/dh) to
; form CandidateMV.  One of three loops then fills in the per-block candidate
; vectors (CandHMVf, CandVMVf, CandHMVb, CandVMVb), selected by whether the
; resulting horizontal or vertical delta is zero:
;   VMVdAndHMVdAreNonZero_Loop, VMVdIsZero_Loop, HMVdIsZeroLoop.
; A forward component that compares below-or-equal to 040H (unsigned) abandons
; the candidate through MVDeltaOutOfRange, which clears bits 4..6 of esi to
; return the cursor to the first block descriptor and reloads BestMV.
;
mov BestMV,edx ; Record what the best MV so far is. add cl,dl ; Try this horizontal MV delta. je HMVdIsZero mov PB CandidateMV,cl ; Record the candidate HMV delta. add ch,dh ; Try this vertical MV delta. mov PB CandidateMV+1,ch ; Record the candidate VMV delta. je VMVdIsZero VMVdAndHMVdAreNonZero_Loop: mov edx,[esi].BlkY1.MVs xor ebx,ebx mov bl,dl xor eax,eax mov al,dh add esi,SIZEOF T_Blk mov bl,[edi+ebx] ; TRb * HMV / TRd pxor mm7,mm7 ; Initialize SWD accumulator add bl,cl ; HMVf = TRb * HMV / TRd + HMVd mov al,[edi+eax] ; TRb * VMV / TRd cmp bl,040H ; If too far left or right, quick out. jbe MVDeltaOutOfRange mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl add al,ch ; VMVf = TRb * VMV / TRd + VMVd cmp al,040H ; If too far up or down, quick out. jbe MVDeltaOutOfRange mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al sub bl,dl ; -HMVb = -(HMVf - HMV) mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl sub al,dh ; -VMVb = -(VMVf - VMV) test esi,4*SIZEOF T_Blk mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al je VMVdAndHMVdAreNonZero_Loop sub esi,4*SIZEOF T_Blk jmp CandidateMVsGotten VMVdIsZero: VMVdIsZero_Loop: mov edx,[esi].BlkY1.MVs xor eax,eax mov al,dh xor ebx,ebx mov bl,dl add esi,SIZEOF T_Blk mov dh,[edi+eax+64] ; -VMVb = -((TRb - TRd) * VMV) / TRd mov al,[edi+eax] ; TRb * VMV / TRd mov bl,[edi+ebx] ; TRb * HMV / TRd mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al add bl,cl ; HMVf = TRb * HMV / TRd + HMVd mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,dh cmp bl,040H ; If too far left or right, quick out. jbe MVDeltaOutOfRange mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl sub bl,dl ; -HMVb = -(HMVf - HMV) test esi,4*SIZEOF T_Blk mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl je VMVdIsZero_Loop sub esi,4*SIZEOF T_Blk pxor mm7,mm7 ; Initialize SWD accumulator jmp CandidateMVsGotten BFrameEarlyOutForCandidateMV: MVDeltaOutOfRange: and esi,-1-7*SIZEOF T_Blk ; Reset block action descr cursor. mov ebx,CurrSWDState ; Reload ME engine state. xor eax,eax mov edx,BestMV ; Previous best MV is still best.
;
; The rejected-candidate path continues below: the next state number is read
; from byte 2 of the current BFrmSWDState entry (the accepted-candidate path
; further down reads byte 3 instead).
;
; BFrameSWDLoop_Non0MVCandidate calls ComputeBFrameSWDForCandRef for each
; block of the candidate.  After each block the running SWD (ecx, taken from
; mm7) is compared with the BestBlkLvlSWD held in the NEXT block's descriptor;
; if it is greater or equal the candidate is dropped via
; BFrameEarlyOutForCandidateMV.  For the fourth block the "next" descriptor is
; BlkU, whose BestBlkLvlSWD is kept equal to BlkY4's for this purpose.
;
mov al,BFrmSWDState[ebx+2] ; Get next State number. jmp ProceedWithNextCand HMVdIsZero: mov PB CandidateMV,cl ; Record the candidate HMV delta. add ch,dh ; Try this vertical MV delta. mov PB CandidateMV+1,ch ; Record the candidate VMV delta. HMVdIsZeroLoop: mov edx,[esi].BlkY1.MVs xor ebx,ebx mov bl,dl xor eax,eax mov al,dh add esi,SIZEOF T_Blk mov dl,[edi+ebx+64] ; -HMVb = -((TRb - TRd) * HMV) / TRd mov bl,[edi+ebx] ; TRb * HMV / TRd mov al,[edi+eax] ; TRb * VMV / TRd mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl add al,ch ; VMVf = TRb * VMV / TRd + VMVd mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,dl cmp al,040H ; If too far up or down, quick out. jbe MVDeltaOutOfRange mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al sub al,dh ; -VMVb = -(VMVf - VMV) test esi,4*SIZEOF T_Blk mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al je HMVdIsZeroLoop sub esi,4*SIZEOF T_Blk pxor mm7,mm7 ; Initialize SWD accumulator CandidateMVsGotten: BFrameSWDLoop_Non0MVCandidate: xor eax,eax xor ebx,ebx mov al,[esi].BlkY1.CandVMVf mov edi,[esi].BlkY1.BlkOffset ; Address of 0-MV blk within frame. mov bl,[esi].BlkY1.CandHMVf mov edx,esi call ComputeBFrameSWDForCandRef movdf ecx,mm7 mov eax,[edx].BlkY2.BestBlkLvlSWD lea esi,[edx+SIZEOF T_Blk] ; Early out if the first N blocks for cmp ecx,eax ; this cand are worse than the first jge BFrameEarlyOutForCandidateMV ; N+1 blocks for previous best. test esi,4*SIZEOF T_Blk ; Quit when fourth block done. mov [esi-SIZEOF T_Blk].BlkY1.CandBlkLvlSWD,ecx ; Stash SWD. je BFrameSWDLoop_Non0MVCandidate ; This candidate is best so far. mov [esi-4*SIZEOF T_Blk].BlkY4.BestBlkLvlSWD,ecx mov ebx,CurrSWDState ; Reload ME engine state. mov [esi-4*SIZEOF T_Blk].BlkU.BestBlkLvlSWD,ecx sub esi,4*SIZEOF T_Blk xor eax,eax mov edx,CandidateMV ; Candidate was best MV.
;
; Candidate accepted: the lines below copy CandBlkLvlSWD (Y1..Y3) and
; CandBiDiMVs (Y1..Y4) into the corresponding Best* fields.  When the next
; state number is zero the search ends.  The non-zero MV is kept only if
; BlkY4.BestBlkLvlSWD is below BlkY4.BlkLvlSWD0Delta minus
; BFRMNONZEROMVDIFFERENTIAL; otherwise the 0-MV results (BlkLvlSWD0Delta,
; BiDiMVs0Delta) are installed and BHMV/BVMV are cleared.  In both outcomes
; the stored Best SWDs for Y2..Y4 are converted from running totals into
; per-block values (Y4-Y3, Y3-Y2, Y2-Y1).
;
; BFrmLumaBlkLoop (two lines down) then visits the four luma blocks.  A block
; whose Best SWD is below BFRMEMPTYTHRESHOLD skips BFrameDTQ and leaves the
; pattern in bl unchanged; otherwise its SWD is added to BSWDTotal, BFrameDTQ
; is called, and BlkEmptyFlag[ebx] is OR-ed into the pattern.  bl starts at 8
; so that the fourth "shr bl,1" shifts that bit into CF and ends the loop; the
; result is stored in CodedBlocksB.
;
; MMxDoBFrameChromaBlocks begins part-way through that same line.  Its first
; two instructions are commented out because, per the original comment, eax
; and ebx are loaded by the caller.
;
mov ecx,[esi].BlkY3.CandBlkLvlSWD mov [esi].BlkY3.BestBlkLvlSWD,ecx mov ecx,[esi].BlkY2.CandBlkLvlSWD mov [esi].BlkY2.BestBlkLvlSWD,ecx mov ecx,[esi].BlkY1.CandBlkLvlSWD mov [esi].BlkY1.BestBlkLvlSWD,ecx mov ecx,[esi].BlkY4.CandBiDiMVs mov [esi].BlkY4.BestBiDiMVs,ecx mov ecx,[esi].BlkY3.CandBiDiMVs mov [esi].BlkY3.BestBiDiMVs,ecx mov ecx,[esi].BlkY2.CandBiDiMVs mov [esi].BlkY2.BestBiDiMVs,ecx mov ecx,[esi].BlkY1.CandBiDiMVs mov [esi].BlkY1.BestBiDiMVs,ecx mov al,BFrmSWDState[ebx+3] ; Get next State number. ProceedWithNextCand: mov CurrSWDState,eax ; Record ME engine state. test eax,eax lea edi,WeightForwardMotion jne BFrmSWDLoop mov ecx,[esi].BlkY4.BlkLvlSWD0Delta ; 0MV SWD sub ecx,BFRMNONZEROMVDIFFERENTIAL mov ebx,[esi].BlkY4.BestBlkLvlSWD ; Best non-0 MV SWD. cmp ebx,ecx jge NonZeroBFrmVectorNotGoodEnoughGain mov [esi].BlkY1.BHMV,dl mov [esi].BlkY2.BHMV,dl mov [esi].BlkY3.BHMV,dl mov [esi].BlkY4.BHMV,dl mov [esi].BlkY1.BVMV,dh mov [esi].BlkY2.BVMV,dh mov [esi].BlkY3.BVMV,dh mov [esi].BlkY4.BVMV,dh mov eax,[esi].BlkY4.BestBlkLvlSWD mov ebx,[esi].BlkY3.BestBlkLvlSWD sub eax,ebx mov ecx,[esi].BlkY2.BestBlkLvlSWD sub ebx,ecx mov edx,[esi].BlkY1.BestBlkLvlSWD sub ecx,edx mov [esi].BlkY4.BestBlkLvlSWD,eax mov [esi].BlkY3.BestBlkLvlSWD,ebx mov [esi].BlkY2.BestBlkLvlSWD,ecx mov [esi].BlkY1.BestBlkLvlSWD,edx jmp BFrmMVSettled BelowBFrmZeroThreshold: NonZeroBFrmVectorNotGoodEnoughGain: mov ebx,[esi].BlkY4.BlkLvlSWD0Delta mov ecx,[esi].BlkY3.BlkLvlSWD0Delta sub ebx,ecx mov edx,[esi].BlkY2.BlkLvlSWD0Delta sub ecx,edx mov edi,[esi].BlkY1.BlkLvlSWD0Delta sub edx,edi mov [esi].BlkY4.BestBlkLvlSWD,ebx mov [esi].BlkY3.BestBlkLvlSWD,ecx mov [esi].BlkY2.BestBlkLvlSWD,edx mov [esi].BlkY1.BestBlkLvlSWD,edi mov eax,[esi].BlkY1.BiDiMVs0Delta mov [esi].BlkY1.BestBiDiMVs,eax mov eax,[esi].BlkY2.BiDiMVs0Delta mov [esi].BlkY2.BestBiDiMVs,eax mov eax,[esi].BlkY3.BiDiMVs0Delta mov [esi].BlkY3.BestBiDiMVs,eax mov eax,[esi].BlkY4.BiDiMVs0Delta mov [esi].BlkY4.BestBiDiMVs,eax xor eax,eax mov
[esi].BlkY1.BHMV,al mov [esi].BlkY2.BHMV,al mov [esi].BlkY3.BHMV,al mov [esi].BlkY4.BHMV,al mov [esi].BlkY1.BVMV,al mov [esi].BlkY2.BVMV,al mov [esi].BlkY3.BVMV,al mov [esi].BlkY4.BVMV,al BFrmMVSettled: mov edx,esi mov bl,8 ; Init coded block pattern BFrmLumaBlkLoop: mov esi,[edx].BlkY1.BestBlkLvlSWD ; Get SWD for block. xor eax,eax mov BFrmCBP,bl cmp esi,BFRMEMPTYTHRESHOLD ; Below threshold for forcing empty? mov ecx,BSWDTotal jl BFrmLumaBlkEmpty mov eax,[edx].BlkY1.BestBiDiMVs xor ebx,ebx add ecx,esi mov bl,ah mov BSWDTotal,ecx and eax,0FFH call BFrameDTQ mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0. mov al,BFrmCBP BFrmLumaBlkEmpty: or bl,al ; Factor in CBP bit for this block. add edx,SIZEOF T_Blk shr bl,1 ; CF == 1 when sentinel shifted off jnc BFrmLumaBlkLoop mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl sub edx,4*SIZEOF T_Blk mov eax,QPDiv2 ; Restore these for P frame blocks. mov ebx,BQPDiv2 mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov BRecip2QPToUse,eax ret MMxDoBFrameChromaBlocks: ; mov eax,QPDiv2 ; Swap these so Quantizer uses right level. ; mov ebx,BQPDiv2 ; (Loaded in caller.) mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov cl,INTER1MV mov BRecip2QPToUse,eax mov StashBlockType,cl mov eax,[edx].BlkU.BestBiDiMVs xor ebx,ebx mov bl,ah and eax,0FFH add edx,4*SIZEOF T_Blk ; To know we're working on chroma. call BFrameDTQ mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0. mov al,[edx-4*SIZEOF T_Blk].CodedBlocksB or bl,al ; Factor in CBP bit for this block.
;
; --- Remainder of MMxDoBFrameChromaBlocks (entry label is on the previous
;     line, where BFrameDTQ has just been called for BlkU with edx advanced
;     by 4*SIZEOF T_Blk) ---
; The line below stores the updated pattern back to CodedBlocksB, advances
; edx by one more T_Blk, calls BFrameDTQ for BlkV, and ORs
; BlkEmptyFlag[ebx+2] into CodedBlocksB.  It then exchanges QPDiv2/BQPDiv2,
; CodeStreamCursor/BCodeStreamCursor and Recip2QPToUse/BRecip2QPToUse back,
; subtracts 5*SIZEOF T_Blk from edx, and returns.
;
; --- ComputeBFrameSWDForCandRef starts on the same line ---
; Register contract is given in the original header comment (ebp, edi, edx,
; ebx = HMVf, eax = VMVf).  Bit 0 of al selects the vertical half-pel path;
; "shr ebx,1" then puts the horizontal half-pel bit in CF.  eax*3 shifted
; left by 6 gives VMVf*192, which the "IF PITCH-384" guard ties to a PITCH of
; 384.  The -48*PITCH-48 term in the address computations removes the bias
; carried by the motion-vector bytes.
;
mov eax,[edx-4*SIZEOF T_Blk].BlkV.BestBiDiMVs mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl xor ebx,ebx mov bl,ah and eax,0FFH add edx,SIZEOF T_Blk call BFrameDTQ mov bl,BlkEmptyFlag[ebx+2] ; Fetch 32 if block not empty; else 0. mov al,[edx-5*SIZEOF T_Blk].CodedBlocksB or bl,al ; Factor in CBP bit for this block. mov eax,QPDiv2 ; Restore these for P frame blocks. mov [edx-5*SIZEOF T_Blk].CodedBlocksB,bl mov ebx,BQPDiv2 mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov BRecip2QPToUse,eax sub edx,5*SIZEOF T_Blk ret ;=============================================================================== ; ebp -- Pitch ; edi -- Address of (0-MV) block within frame. ; edx -- Block Action Descriptor cursor ; ebx -- HMVf (HMV to apply to past reference) biased by 96. ; eax -- VMVf (VMV to apply to past reference) biased by 96. StackOffset TEXTEQU <8> ComputeBFrameSWDForCandRef: test al,1 mov ecx,PreviousFrameBaseAddress lea eax,[eax+eax*2] ; Start of VMVf*384 jne ME_VMVfAtHalfPelPosition ME_VMVfAtFullPelPosition: IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF shl eax,6 add ecx,edi shr ebx,1 ; CF == 1 iff HMVf is at half pel. jc ME_VMVfAtFull_HMVfAtHalfPelPosition ME_VMVfAtFull_HMVfAtFullPelPosition: lea esi,[ecx+eax-48*PITCH-48] lea ecx,[ebp+ebp*2] add esi,ebx ; Address of past reference block. mov eax,BFrameBaseAddress add edi,eax ; Address of target block. lea ebx,[ebp+ebp*4] movq mm0,[esi+ebp*1] psubw mm0,[edi+ebp*1] ; Get diff for line 1. movq mm1,[esi+ecx] ; Ref MB, upper left block, Line 3. psllw mm0,8 ; Extract diffs for line 1 even pels. psubw mm1,[edi+ecx] ; Diff for line 3. pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
;
; --- ComputeBFrameSWDForCandRef, continued (entry label and register
;     contract are on the previous line) ---
; Three reference-fetch variants appear on the lines below:
;   ME_VMVfAtFull_HMVfAtFullPelPosition - reference pels are used directly.
;   ME_VMVfAtFull_HMVfAtHalfPelPosition - horizontally adjacent pels are
;       summed by pmullw with mm6, which the original comment says holds
;       8 bytes of 1; mm6 is overwritten in this path and rebuilt before
;       the end of it (0 minus 0FFh per byte).
;   ME_VMVfAtHalfPelPosition            - vertically adjacent lines are
;       summed with paddb; esi addresses the even lines and ecx the odd
;       lines, with ecx = esi + 1 when HMVf is also at half pel (the adc).
;       mm6 is used as a mask here and likewise rebuilt to 8 bytes of 1.
; Every variant squares differences with pmaddwd and accumulates them into
; mm0 with saturating paddusw.  The tail folds the two dwords of mm0 together,
; shifts the block total down by 48 bits, and adds it to mm7 with paddd.
;
; NOTE(review): in this collapsed copy the tail instructions of each variant
; (paddusw / punpckldq / psrlq / paddd / ret) sit between bare ';' markers.
; They are read here as live code separated by empty comment lines, since
; otherwise the first variant would fall into the next label -- confirm
; against the original file.
;
movq mm2,[esi+ebx] psllw mm1,8 psubw mm2,[edi+ebx] pmaddwd mm1,mm1 movq mm3,[esi+PITCH*7] psllw mm2,8 psubw mm3,[edi+PITCH*7] pmaddwd mm2,mm2 movq mm4,[esi] ; Ref MB, upper left blk, Line 0. psllw mm3,8 psubw mm4,[edi] ; Diff for line 0. paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3). movq mm1,[esi+ebp*2] pmaddwd mm3,mm3 psubw mm1,[edi+ebp*2] paddusw mm0,mm2 movq mm2,[esi+ebp*4] pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0. psubw mm2,[edi+ebp*4] paddusw mm0,mm3 movq mm3,[esi+ecx*2] pmaddwd mm1,mm1 psubw mm3,[edi+ecx*2] pmaddwd mm2,mm2 paddusw mm0,mm4 pmaddwd mm3,mm3 paddusw mm0,mm1 ; paddusw mm0,mm2 ; paddusw mm0,mm3 ; punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1. ; paddusw mm0,mm1 ; mm0[48:63] is SWD for block. ; psrlq mm0,48 ; SWD for block. ; paddd mm7,mm0 ; mm7 is SWD for all four blocks. ; ret ME_VMVfAtFull_HMVfAtHalfPelPosition: lea esi,[ecx+eax-48*PITCH-48] mov eax,BFrameBaseAddress add esi,ebx ; Address of past reference block. add edi,eax ; Address of target block. lea ecx,[ebp+ebp*2] movq mm0,mm6 ; 8 bytes of 1 pmullw mm0,[esi] ; <(P07+P06)*256+junk ...> movq mm1,mm6 pmullw mm1,[esi+ebp*2] movq mm2,mm6 pmullw mm2,[esi+ebp*4] movq mm3,mm6 movq mm4,[edi] ; psrlw mm0,1 ; <(P07+P06)*256/2+junk ...> pmullw mm3,[esi+ecx*2] psllw mm4,8 ; movq mm5,[edi+ebp*2] psrlw mm1,1 psubw mm0,mm4 ; <(P07+P06)*256/2-C06*256+junk ...> psllw mm5,8 movq mm4,[edi+ebp*4] psrlw mm2,1 psubw mm1,mm5 psllw mm4,8 movq mm5,[edi+ecx*2] psrlw mm3,1 psubw mm2,mm4 pmaddwd mm0,mm0 ; SSD for even pels of line 0. pmaddwd mm1,mm1 psllw mm5,8 psubw mm3,mm5 pmaddwd mm2,mm2 pmaddwd mm3,mm3 movq mm5,mm6 pmullw mm6,[esi+ebp*1+1] ; <(P18+P17)*256+junk ...> movq mm4,mm5 pmullw mm5,[esi+ecx+1] paddusw mm0,mm1 ; Accum SSD for lines 0 and 2.
paddusw mm2,mm3 movq mm1,mm4 pmullw mm4,[esi+PITCH*5+1] paddusw mm0,mm2 pmullw mm1,[esi+PITCH*7+1] psrlw mm6,1 ; <(P18+P17)*256/2+junk ...> psubw mm6,[edi+ebp*1] ; <(P18+P17)*256/2-C17*256+junk ...> psrlw mm5,1 psubw mm5,[edi+ecx] psrlw mm4,1 psubw mm4,[edi+PITCH*5] pmaddwd mm6,mm6 ; SSD for odd pels of line 1. pmaddwd mm5,mm5 psrlw mm1,1 psubw mm1,[edi+PITCH*7] pmaddwd mm4,mm4 pmaddwd mm1,mm1 paddusw mm0,mm6 pxor mm6,mm6 paddusw mm0,mm5 pcmpeqb mm5,mm5 paddusw mm0,mm4 psubb mm6,mm5 ; Restore 8 bytes of 1. paddusw mm0,mm1 punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1. ; paddusw mm0,mm1 ; mm0[48:63] is SWD for block. ; psrlq mm0,48 ; SWD for block. ; paddd mm7,mm0 ; mm7 is SWD for all four blocks. ; ret ME_VMVfAtHalfPelPosition: IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF shl eax,6 lea ecx,[ecx+edi-48*PITCH-48-PITCH/2] add ecx,eax mov eax,BFrameBaseAddress shr ebx,1 ; CF == 1 iff HMVf is at half pel. mov esi,ecx ; esi and ecx same if HMVf at full pel, adc ecx,ebx ; but inc ecx if HMVf is at half pel. add esi,ebx add edi,eax ; Address of target block. lea ebx,[ebp+ebp*2] movq mm0,[esi] ; pcmpeqb mm6,mm6 movq mm1,[ecx+ebp*1] ; or psrlw mm6,8 movq mm2,[esi+ebp*2] ; paddb mm0,mm1 ; or movq mm3,[ecx+ebx] ; or paddb mm1,mm2 ; or movq mm4,[esi+ebp*4] ; paddb mm2,mm3 ; or paddb mm3,mm4 ; or psrlw mm0,1 ; <(P07+P17)/2 junk ...> or (P07+P18)/2 junk ...> pand mm1,mm6 ; or psrlw mm2,1 ; <(P27+P37)/2 junk ...> or (P27+P38)/2 junk ...> movq mm5,[edi+ebp*1] ; pand mm3,mm6 ; or movq mm6,[edi+ebx] ; psllw mm5,8 ; psubw mm0,[edi] ; <(P07+P17)/2-C07 junk ...> or ... psllw mm1,7 ; <(P16+P26)/2 ...> or <(P17+P26)/2 ...> psubw mm2,[edi+ebp*2] ; <(P27+P37)/2-C27 junk ...> or ... psllw mm6,8 ; pmaddwd mm0,mm0 ; SSD of even pels of line 0. psubw mm1,mm5 ; <(P16+P26)/2-C16 junk ...> or ... pmaddwd mm1,mm1 ; SSD of odd pels of line 1. psllw mm3,7 ; <(P36+P46)/2 ...> or <(P37+P46)/2 ...> pmaddwd mm2,mm2 ; SSD of even pels of line 2.
psubw mm3,mm6 ; <(P36+P46)/2-C36 junk ...> or ... pmaddwd mm3,mm3 ; SSD of odd pels of line 3. pcmpeqb mm6,mm6 paddusw mm0,mm1 movq mm1,[ecx+PITCH*5] paddusw mm0,mm2 movq mm2,[esi+ebx*2] paddusw mm0,mm3 movq mm3,[ecx+PITCH*7] paddb mm4,mm1 paddb mm1,mm2 paddb mm2,mm3 paddb mm3,[esi+ebp*8] psrlw mm6,8 pand mm1,mm6 psrlw mm4,1 movq mm5,[edi+PITCH*5] psrlw mm2,1 pand mm3,mm6 psllw mm5,8 movq mm6,[edi+PITCH*7] psllw mm1,7 psubw mm4,[edi+ebp*4] psllw mm3,7 psubw mm2,[edi+ebx*2] psllw mm6,8 pmaddwd mm4,mm4 psubw mm1,mm5 pmaddwd mm2,mm2 psubw mm3,mm6 pmaddwd mm1,mm1 pxor mm6,mm6 pmaddwd mm3,mm3 paddusw mm0,mm4 pcmpeqb mm5,mm5 paddusw mm0,mm1 psubb mm6,mm5 ; Restore 8 bytes of 1. paddusw mm0,mm2 paddusw mm0,mm3 ; punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1. ; paddusw mm0,mm1 ; mm0[48:63] is SWD for block. ; psrlq mm0,48 ; SWD for block. ; paddd mm7,mm0 ; mm7 is SWD for all four blocks. ; ret ;=============================================================================== ; ebp -- Pitch ; edx -- Block Action Descriptor cursor ; ebx -- VMVf (VMV to apply to past reference) biased by 96. ; eax -- HMVf (HMV to apply to past reference) biased by 96. StackOffset TEXTEQU <8> BFrameDTQ: test bl,1 lea ebx,[ebx+ebx*2] ; Start of VMVf*384 mov ecx,PreviousFrameBaseAddress jne Diff_VMVfAtHalfPelPosition Diff_VMVfAtFullPelPosition: IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF shl ebx,6 mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame. shr eax,1 ; CF == 1 iff HMVf is at half pel. jc Diff_VMVfAtFull_HMVfAtHalfPelPosition Diff_VMVfAtFull_HMVfAtFullPelPosition: lea esi,[ecx+ebx-48*PITCH-48] add eax,edi add esi,eax ; Address of past reference block. mov ecx,PITCH/4 ; Pitch for past reference blk, div 4. mov eax,BFrameBaseAddress ; Address of target block. mov PastRefPitchDiv4,ecx add edi,eax ; Address of target block.
;
; --- BFrameDTQ, continued (entry label and register contract are on the
;     previous line: ebx = VMVf, eax = HMVf, edx = block descriptor) ---
; Stage 1 locates the past reference block.  The full-pel/full-pel path on
; the previous line uses the frame data in place and sets PastRefPitchDiv4 to
; PITCH/4.  The three half-pel paths below instead write an interpolated copy
; into PelDiffs -- two lines per pass, at [eax] and [eax+16], with eax
; advancing by 32 -- then point esi at PelDiffs and set PastRefPitchDiv4 to 4.
;
; The interpolation loops end on "test al,32" / "test al,64", so the pass
; count depends on the address of PelDiffs.
; NOTE(review): the visible arithmetic gives four passes (8 lines) only when
; bits 6:5 of the PelDiffs address are 01 -- confirm against the stack frame
; layout that defines PelDiffs.
;
jmp Diff_GetFutureContribToPred Diff_VMVfAtHalfPelPosition: IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF shl ebx,6 mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame. shr eax,1 ; CF == 1 iff HMVf is at half pel. jc Diff_VMVfAtHalf_HMVfAtHalfPelPosition Diff_VMVfAtHalf_HMVfAtFullPelPosition: lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias. add eax,edi add esi,eax ; Address of past reference block. lea eax,PelDiffs-32 pcmpeqb mm6,mm6 pcmpeqb mm7,mm7 ; 8 bytes -1 movq mm2,[esi] ; Line0 paddb mm6,mm6 ; 8 bytes of 0xFE. @@: movq mm1,[esi+ebp*1] ; Line1 movq mm0,mm2 ; Line0 movq mm2,[esi+ebp*2] ; Line2 psubb mm1,mm7 ; Line1+1 paddb mm0,mm1 ; Line0+Line1+1 paddb mm1,mm2 ; Line1+Line2+1 pand mm0,mm6 ; pre-clean pand mm1,mm6 ; pre-clean add eax,32 ; Advance pointer for PelDiffs output. psrlq mm0,1 ; (Line0+Line1+1)/2 lea esi,[esi+ebp*2] ; Advance input ptr 2 lines. psrlq mm1,1 ; (Line1+Line2+1)/2 movq [eax],mm0 ; Store Past Ref for Line0 movq [eax+16],mm1 ; Store Past Ref for Line1 test al,32 ; Iterate twice jne @b test al,64 ; Iterate twice. mov ecx,4 ; Pitch for past reference blk, div 4. mov PastRefPitchDiv4,ecx jne @b mov eax,BFrameBaseAddress lea esi,PelDiffs ; Address of interpolated past ref blk. add edi,eax ; Address of target block. jmp Diff_GetFutureContribToPred Diff_VMVfAtFull_HMVfAtHalfPelPosition: lea esi,[ecx+ebx-48*PITCH-48] ; Begin get pastrefaddr. Del bias. add eax,edi add esi,eax ; Address of past reference block. lea eax,PelDiffs-32 lea ebx,Pel_Rnd xor ecx,ecx @@: movq mm0,[esi+1] ; pcmpeqb mm7,mm7 mov cl,[esi] ; P00 movq mm2,mm0 ; movq mm1,[esi+ebp*1+1] psllq mm2,8 ; paddb mm0,[ebx+ecx*8] ; movq mm3,mm1 mov cl,[esi+ebp*1] psllq mm3,8 paddb mm1,mm3 paddb mm0,mm2 ; paddb mm1,[ebx+ecx*8] paddb mm7,mm7 ; 8 bytes of 0xFE. pand mm0,mm7 ; pre-clean pand mm1,mm7 ; pre-clean add eax,32 ; Advance pointer for PelDiffs output. psrlq mm0,1 ; <(P08+P07+1)/2 ...> lea esi,[esi+ebp*2] ; Advance input ptr 2 lines.
psrlq mm1,1 movq [eax],mm0 ; Store Past Ref for Line0 movq [eax+16],mm1 ; Store Past Ref for Line1 test al,32 ; Iterate twice jne @b test al,64 ; Iterate twice. mov cl,4 ; Pitch for past reference blk, div 4. mov PastRefPitchDiv4,ecx jne @b mov eax,BFrameBaseAddress lea esi,PelDiffs ; Address of interpolated past ref blk. add edi,eax ; Address of target block. jmp Diff_GetFutureContribToPred Diff_VMVfAtHalf_HMVfAtHalfPelPosition: lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias. add eax,edi add esi,eax ; Address of past reference block. lea eax,PelDiffs-32 lea ebx,Pel_Rnd xor ecx,ecx movq mm3,[esi+1] ; 0A: pcmpeqb mm7,mm7 mov cl,[esi] ; 0B: P00 movq mm0,mm3 ; 0C: paddb mm7,mm7 ; 8 bytes of 0xFE. psllq mm0,8 ; 0D: paddb mm3,[ebx+ecx*8] ; 0E: movq mm6,mm7 ; 8 bytes of 0xFE. @@: movq mm1,[esi+ebp*1+1] ; 1A: paddb mm0,mm3 ; 0F: mov cl,[esi+ebp*1] ; 1B: P10 movq mm3,mm1 ; 1C: movq mm2,[esi+ebp*2+1] ; 2A: psllq mm3,8 ; 1D: paddb mm1,[ebx+ecx*8] ; 1E: movq mm4,mm2 ; 2C: mov cl,[esi+ebp*2] ; 2B: P20 paddb mm1,mm3 ; 1F: pandn mm6,mm1 ; 0G: <(P18+P17+1)&1 ...> psllq mm4,8 ; 2D: paddb mm2,[ebx+ecx*8] ; 2E: movq mm5,mm6 ; 1G: <(P18+P17+1)&1 ...> paddb mm2,mm4 ; 2F: pand mm6,mm0 ; 0H: <(P18+P17+1)&(P08+P07+1)&1 ...> pand mm5,mm2 ; 1H: <(P18+P17+1)&(P28+P27+1)&1 ...> pand mm0,mm7 ; 0I: pre-clean for divide pand mm1,mm7 ; 1I: pre-clean for divide psrlq mm0,1 ; 0J: <(P08+P07+1)/2 ...> movq mm3,mm2 ; Save line 2 for next iter's line 0. psrlq mm1,1 ; 1J: <(P18+P17+1)/2 ...> pand mm2,mm7 ; 2I: pre-clean for divide paddb mm0,mm1 ; 0K: <(P08+P07+1)/2+(P18+P17+1)/2 ...> paddb mm6,mm0 ; 0L: <(P08+P07+P18+P17+2)/2 ...> psrlq mm2,1 ; 2J: <(P28+P27+1)/2 ...> paddb mm1,mm2 ; 1K: <(P18+P17+1)/2+(P28+P27+1)/2 ...> pand mm6,mm7 ; 0M: pre-clean for divide paddb mm5,mm1 ; 1L: <(P18+P17+P28+P27+2)/2 ...> psrlq mm6,1 ; 0M: <(P08+P07+P18+P17+2)/4 ...> add eax,32 ; Advance pointer for PelDiffs output.
;
; Stage 2, Diff_GetFutureContribToPred (label on the line below), is reached
; from all four past-reference paths.  It saves edx in
; BlockActionDescrCursor, reads BestHMVb and BestVMVb from the block
; descriptor, forms the future reference address from BFrameToFuture plus
; VMVb*192 (tied to PITCH = 384 by the IF guard), and points ebx at a list of
; weight indices inside Diff_IdxRefWts.  That list is selected through the
; LeftRightBlkPosition and UpDownBlkPosition tables, indexed by the block
; number extracted with "and edx,112".
;
pand mm5,mm7 ; 1M: pre-clean for divide lea esi,[esi+ebp*2] ; Advance input ptr 2 lines. psrlq mm5,1 ; 1N: <(P18+P17+P28+P27+2)/4 ...> movq [eax],mm6 ; 0O: Store Past Ref for Line0 pxor mm0,mm0 ; So that add of mm3 is just like movq. movq [eax+16],mm5 ; 1O: Store Past Ref for Line1 movq mm6,mm7 ; 8 bytes of 0xFE. test al,32 ; Iterate twice jne @b test al,64 ; Iterate twice. mov cl,4 ; Pitch for past reference blk, div 4. jne @b mov eax,BFrameBaseAddress lea esi,PelDiffs ; Address of interpolated past ref blk. add edi,eax ; Address of target block. mov PastRefPitchDiv4,ecx Diff_GetFutureContribToPred: ;=============================================================================== ; ; Registers at entry: ; edi -- Pointer to target block. ; esi -- Pointer to past reference. ; edx -- Block Descriptor within MacroBlockActionDescritptorStream ; ; Subsequent assignments: ; ; ebp -- Pitch for past reference block, div 4. Loop counter in high 2 bits. ; ecx -- Pointer to future reference block ; ebx -- Pointer to list of indices of multipliers to wt past and future refs. ; eax,edx -- Index of multiplier to weight past and future ref. xor ecx,ecx mov eax,edx IF SIZEOF T_Blk-16 **** The magic leaks out if size of block descriptor is not 16. ENDIF mov cl,[edx].BlkY1.BestHMVb ; HMV for future reference block. and edx,112 ; Extract block number (times 16). xor ebx,ebx mov BlockActionDescrCursor,eax mov bl,[eax].BlkY1.BestVMVb ; VMV for future reference block. mov eax,LeftRightBlkPosition[edx] mov ebp,ecx CONST_384 TEXTEQU <384> mov edx,UpDownBlkPosition[edx] mov cl,[eax+ecx*2] ; Get horz part of past/future wt sel. IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF lea eax,[ebx+ebx*2] ; Start of VMVb*384 mov bl,[edx+ebx*2] ; Get vert part of past/future wt sel. shl eax,6 mov edx,BFrameToFuture lea ebx,Diff_IdxRefWts[ecx+ebx] ; Addr of list of wts for refs. test al,64 ; Is VMVb odd? lea eax,[eax+edx] ; Begin to get addr futr ref.
;
; Stage 3: four Diff_VMVb*_HMVb* loops follow, from the line below to the end
; of the file, one per full-pel/half-pel combination of the backward vector.
; Each path lowers esp by 128 first; every pass then raises esp by 32 and
; stores two lines of byte differences (target minus prediction) through
; PelDiffs and PelDiffs+16.  Per byte, the FutureWt_FF_or_00 mask chosen by
; the weight index makes the prediction either (future + past)/2 or the past
; reference alone.  ebp carries PastRefPitchDiv4 in its low bits while its
; top two bits count the four passes; they fall out of the ebp*4 and ebp*8
; address scaling.  Each path ends by setting ebp = 16 and esi = PelDiffs,
; reloading edx from BlockActionDescrCursor, and jumping to MMxDoForwardDCT.
;
; NOTE(review): several "StackOffset TEXTEQU" directives inside the loops and
; the final "CONST_384 TEXTEQU" show no operand in this copy; angle-bracketed
; text may have been lost in extraction -- confirm against the original file.
; NOTE(review): the VMVb-half/HMVb-full loop executes "pcmpeqb mm7,mm7" twice
; near its end with no intervening write to mm7 in the visible code; the
; second looks redundant -- confirm before touching the schedule.
;
jne Diff_VMVbAtHalfPelPosition Diff_VMVbAtFullPelPosition: CONST_384 TEXTEQU <384> shr ebp,1 ; CF == 1 iff HMVb is at half pel. lea esp,[esp-128] StackOffset TEXTEQU <136> lea ecx,[eax+edi-48*PITCH-48] jc Diff_VMVbAtFull_HMVbAtHalfPelPosition Diff_VMVbAtFull_HMVbAtFullPelPosition: CONST_384 TEXTEQU <384> add ecx,ebp ; Address of future reference block. mov ebp,PastRefPitchDiv4 xor eax,eax xor edx,edx @@: StackOffset TEXTEQU mov al,[ebx] ; 0A: Index of weights for line 0. add esp,32 ; Advance Pel Difference cursor mov dl,[ebx+1] ; 1A: Index of weights for line 1. add ebx,2 ; Advance list ptr for ref weights. movq mm0,[ecx] ; 0B: pcmpeqb mm7,mm7 movq mm2,FutureWt_FF_or_00[eax]; 0C: paddb mm7,mm7 ; 8 bytes of 0xFE movq mm3,[esi] ; 0D: pand mm0,mm2 ; 0E: pandn mm2,mm3 ; 0F: paddb mm0,mm3 ; 0G: movq mm1,[ecx+PITCH] ; 1B: paddb mm0,mm2 ; 0H: movq mm2,FutureWt_FF_or_00[edx]; 1C: pand mm0,mm7 ; 0I: pre-clean movq mm3,[esi+ebp*4] ; 1D: pand mm1,mm2 ; 1E: pandn mm2,mm3 ; 1F: paddb mm1,mm3 ; 1G: movq mm3,[edi] ; 0J: psrlq mm0,1 ; 0K: psubb mm3,mm0 ; 0L: paddb mm1,mm2 ; 1H: movq mm4,[edi+PITCH] ; 1J: pand mm1,mm7 ; 1I: pre-clean add edi,PITCH*2 ; Advance Target Blk cursor psrlq mm1,1 ; 1K: StackOffset TEXTEQU <8+96> movq PelDiffs,mm3 ; 0M: Save pel differences for line 0. StackOffset TEXTEQU psubb mm4,mm1 ; 1L: add ecx,PITCH*2 ; Advance Future Ref Blk cursor lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor StackOffset TEXTEQU <8+96> movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1. StackOffset TEXTEQU add ebp,080000000H ; Iterate twice jnc @b test ebp,040000000H ; Iterate twice lea ebp,[ebp+040000000H] je @b StackOffset TEXTEQU <8> mov ebp,16 lea esi,PelDiffs mov edx,BlockActionDescrCursor jmp MMxDoForwardDCT Diff_VMVbAtHalfPelPosition: CONST_384 TEXTEQU <384> shr ebp,1 ; CF == 1 iff HMVb is at half pel.
lea esp,[esp-128] StackOffset TEXTEQU <136> lea ecx,[eax+edi-48*PITCH-48-PITCH/2] jc Diff_VMVbAtHalf_HMVbAtHalfPelPosition Diff_VMVbAtHalf_HMVbAtFullPelPosition: CONST_384 TEXTEQU <384> add ecx,ebp ; Address of future reference block. mov ebp,PastRefPitchDiv4 xor eax,eax xor edx,edx movq mm6,[ecx] ; 0B: pcmpeqb mm7,mm7 ; 8 bytes -1 @@: StackOffset TEXTEQU movq mm1,[ecx+PITCH] ; 1a: movq mm0,mm6 ; 0a: mov al,[ebx] ; 0A: Index of weights for line 0. psubb mm1,mm7 ; b: movq mm6,[ecx+PITCH*2] ; 2a: paddb mm0,mm1 ; 0c: mov dl,[ebx+1] ; 1A: Index of weights for line 1. paddb mm7,mm7 ; 8 bytes of 0xFE paddb mm1,mm6 ; 1c: pand mm0,mm7 ; 0d: pre-clean pand mm1,mm7 ; 1d: pre-clean psrlq mm0,1 ; 0B: <(F07 = f07+f17+1)/2> movq mm2,FutureWt_FF_or_00[eax]; 0C: psrlq mm1,1 ; 1B: <(F17 = f17+f27+1)/2> movq mm3,[esi] ; 0D: pand mm0,mm2 ; 0E: pandn mm2,mm3 ; 0F: paddb mm0,mm3 ; 0G: add ebx,2 ; Advance list ptr for ref weights. paddb mm0,mm2 ; 0H: movq mm2,FutureWt_FF_or_00[edx]; 1C: pand mm0,mm7 ; 0I: pre-clean movq mm3,[esi+ebp*4] ; 1D: pand mm1,mm2 ; 1E: pandn mm2,mm3 ; 1F: paddb mm1,mm3 ; 1G: movq mm3,[edi] ; 0J: psrlq mm0,1 ; 0K: psubb mm3,mm0 ; 0L: add esp,32 ; Advance Pel Difference cursor movq mm4,[edi+PITCH] ; 1J: paddb mm1,mm2 ; 1H: add ecx,PITCH*2 ; Advance Future Ref Blk cursor pand mm1,mm7 ; 1I: pre-clean add edi,PITCH*2 ; Advance Target Blk cursor psrlq mm1,1 ; 1K: StackOffset TEXTEQU <8+96> movq PelDiffs,mm3 ; 0M: Save pel differences for line 0. StackOffset TEXTEQU psubb mm4,mm1 ; 1L: pcmpeqb mm7,mm7 ; 8 bytes -1 lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor StackOffset TEXTEQU <8+96> movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU pcmpeqb mm7,mm7 ; 8 bytes -1 add ebp,080000000H ; Iterate twice jnc @b add ebp,040000000H ; Iterate twice test ebp,ebp jns @b StackOffset TEXTEQU <8> mov ebp,16 lea esi,PelDiffs mov edx,BlockActionDescrCursor jmp MMxDoForwardDCT Diff_VMVbAtFull_HMVbAtHalfPelPosition: StackOffset TEXTEQU <136> CONST_384 TEXTEQU <384> add ecx,ebp ; Address of future reference block. mov ebp,PastRefPitchDiv4 xor eax,eax lea edx,Pel_Rnd @@: StackOffset TEXTEQU movq mm0,[ecx+1] ; 0a: pcmpeqb mm7,mm7 mov al,[ecx] ; 0b: f00 movq mm2,mm0 ; 0c: movq mm1,[ecx+PITCH+1] ; 1a: psllq mm2,8 ; 0d: paddb mm0,[edx+eax*8] ; 0e: movq mm3,mm1 ; 1c: mov al,[ecx+PITCH] ; 1b: f10 psllq mm3,8 ; 1d: paddb mm0,mm2 ; 0f: paddb mm1,mm3 ; 1f: paddb mm1,[edx+eax*8] ; 1e: paddb mm7,mm7 ; 8 bytes of 0xFE. mov al,[ebx] ; 0A: Index of weights for line 0. pand mm0,mm7 ; 0g: pre-clean movq mm3,[esi] ; 0D: psrlq mm0,1 ; 0B: movq mm2,FutureWt_FF_or_00[eax]; 0C: pand mm1,mm7 ; 1g: pre-clean mov al,[ebx+1] ; 1A: Index of weights for line 1. psrlq mm1,1 ; 1B: pand mm0,mm2 ; 0E: pandn mm2,mm3 ; 0F: movq mm4,FutureWt_FF_or_00[eax]; 1C: paddb mm0,mm3 ; 0G: movq mm3,[esi+ebp*4] ; 1D: paddb mm0,mm2 ; 0H: pand mm0,mm7 ; 0I: pre-clean pand mm1,mm4 ; 1E: pandn mm4,mm3 ; 1F: paddb mm1,mm3 ; 1G: movq mm3,[edi] ; 0J: psrlq mm0,1 ; 0K: psubb mm3,mm0 ; 0L: add esp,32 ; Advance Pel Difference cursor add ecx,PITCH*2 ; Advance Future Ref Blk cursor paddb mm1,mm4 ; 1H: movq mm4,[edi+PITCH] ; 1J: pand mm1,mm7 ; 1I: pre-clean add edi,PITCH*2 ; Advance Target Blk cursor psrlq mm1,1 ; 1K: StackOffset TEXTEQU <8+96> movq PelDiffs,mm3 ; 0M: Save pel differences for line 0. StackOffset TEXTEQU psubb mm4,mm1 ; 1L: add ebx,2 ; Advance list ptr for ref weights. lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor StackOffset TEXTEQU <8+96> movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU add ebp,080000000H ; Iterate twice jnc @b add ebp,040000000H ; Iterate twice test ebp,ebp jns @b StackOffset TEXTEQU <8> mov ebp,16 lea esi,PelDiffs mov edx,BlockActionDescrCursor jmp MMxDoForwardDCT Diff_VMVbAtHalf_HMVbAtHalfPelPosition: StackOffset TEXTEQU <136> CONST_384 TEXTEQU <384> add ecx,ebp ; Address of future reference block. mov ebp,PastRefPitchDiv4 xor eax,eax lea edx,Pel_Rnd movq mm4,[ecx+1] ; 0a: pcmpeqb mm7,mm7 mov al,[ecx] ; 0b: f00 movq mm0,mm4 ; 0c: paddb mm7,mm7 ; 8 bytes of 0xFE. psllq mm0,8 ; 0d: paddb mm4,[edx+eax*8] ; 0e: movq mm6,mm7 ; 8 bytes of 0xFE. @@: StackOffset TEXTEQU movq mm1,[ecx+PITCH+1] ; 1a: paddb mm0,mm4 ; 0f: mov al,[ecx+PITCH] ; 1b: f10 movq mm3,mm1 ; 1c: movq mm2,[ecx+PITCH*2+1] ; 2a: psllq mm3,8 ; 1d: paddb mm1,[edx+eax*8] ; 1e: movq mm4,mm2 ; 2c: mov al,[ecx+PITCH*2] ; 2b: f20 paddb mm1,mm3 ; 1f: pandn mm6,mm1 ; 0g: <(f18+f17+1)&1 ...> psllq mm4,8 ; 2d: paddb mm2,[edx+eax*8] ; 2e: movq mm5,mm6 ; 1g: <(f18+f17+1)&1 ...> paddb mm2,mm4 ; 2f: pand mm6,mm0 ; 0h: <(f18+f17+1)&(f08+f07+1)&1 ...> pand mm5,mm2 ; 1h: <(f18+f17+1)&(f28+f27+1)&1 ...> pand mm0,mm7 ; 0i: pre-clean for divide pand mm1,mm7 ; 1i: pre-clean for divide psrlq mm0,1 ; 0j: <(f08+f07+1)/2 ...> movq mm4,mm2 ; Save line 2 for next iter's line 0. psrlq mm1,1 ; 1j: <(f18+f17+1)/2 ...> pand mm2,mm7 ; 2i: pre-clean for divide paddb mm0,mm1 ; 0k: <(f08+f07+1)/2+(f18+f17+1)/2 ...> paddb mm0,mm6 ; 0l: <(f08+f07+f18+f17+2)/2 ...> psrlq mm2,1 ; 2j: <(f28+f27+1)/2 ...> paddb mm1,mm2 ; 1k: <(f18+f17+1)/2+(f28+f27+1)/2 ...> pand mm0,mm7 ; 0m: pre-clean for divide mov al,[ebx] ; 0A: Index of weights for line 0. paddb mm1,mm5 ; 1l: <(f18+f17+f28+f27+2)/2 ...> movq mm3,[esi] ; 0D: pand mm1,mm7 ; 1m: pre-clean for divide movq mm2,FutureWt_FF_or_00[eax]; 0C: psrlq mm0,1 ; 0B: mov al,[ebx+1] ; 1A: Index of weights for line 1.
psrlq mm1,1 ; 1B: pand mm0,mm2 ; 0E: pandn mm2,mm3 ; 0F: movq mm5,FutureWt_FF_or_00[eax]; 1C: paddb mm0,mm3 ; 0G: movq mm3,[esi+ebp*4] ; 1D: paddb mm0,mm2 ; 0H: pand mm0,mm7 ; 0I: pre-clean pand mm1,mm5 ; 1E: pandn mm5,mm3 ; 1F: paddb mm1,mm3 ; 1G: movq mm3,[edi] ; 0J: psrlq mm0,1 ; 0K: psubb mm3,mm0 ; 0L: add esp,32 ; Advance Pel Difference cursor paddb mm1,mm5 ; 1H: add ecx,PITCH*2 ; Advance Future Ref Blk cursor movq mm5,[edi+PITCH] ; 1J: pand mm1,mm7 ; 1I: pre-clean add edi,PITCH*2 ; Advance Target Blk cursor psrlq mm1,1 ; 1K: StackOffset TEXTEQU <8+96> movq PelDiffs,mm3 ; 0M: Save pel differences for line 0. StackOffset TEXTEQU psubb mm5,mm1 ; 1L: add ebx,2 ; Advance list ptr for ref weights. lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor StackOffset TEXTEQU <8+96> movq PelDiffs+16,mm5 ; 1M: Save pel differences for line 1. StackOffset TEXTEQU pxor mm0,mm0 ; So that add of mm4 is just like movq. add ebp,080000000H ; Iterate twice movq mm6,mm7 ; 8 bytes of 0xFE. jnc @b add ebp,040000000H ; Iterate twice test ebp,ebp jns @b StackOffset TEXTEQU <8> mov ebp,16 lea esi,PelDiffs mov edx,BlockActionDescrCursor jmp MMxDoForwardDCT CONST_384 TEXTEQU END