|
|
;//////////////////////////////////////////////////////////////////////////// ;// ;// INTEL CORPORATION PROPRIETARY INFORMATION ;// ;// This software is supplied under the terms of a license ;// agreement or nondisclosure agreement with Intel Corporation ;// and may not be copied or disclosed except in accordance ;// with the terms of that agreement. ;// ;//////////////////////////////////////////////////////////////////////////// ;// ;// $Header: R:\h26x\h26x\src\enc\e3mbme.asv 1.5 18 Oct 1996 16:57:08 BNICKERS $ ;// ;// $Log: R:\h26x\h26x\src\enc\e3mbme.asv $ ;// ;// Rev 1.5 18 Oct 1996 16:57:08 BNICKERS ;// Fixes for EMV ;// ;// Rev 1.4 12 Sep 1996 10:56:16 BNICKERS ;// Add arguments for thresholds and differentials. ;// ;// Rev 1.3 22 Jul 1996 15:22:48 BNICKERS ;// Reduce code size. Implement H261 spatial filter. ;// ;// Rev 1.2 14 May 1996 12:18:48 BNICKERS ;// Initial debugging of MMx B-Frame ME. ;// ;// Rev 1.1 03 May 1996 14:03:30 BNICKERS ;// ;// Minor bug fixes and integration refinements. ;// ;// Rev 1.0 02 May 1996 12:00:56 BNICKERS ;// Initial revision. ;// ;//////////////////////////////////////////////////////////////////////////// ; ; MMxBFrameMotionEstimation -- This function performs motion estimation for the ; B frame macroblocks identified in the input list. ; This is the MMx version. ;
; Assembler options.  NOTE(review): per the MASM documentation, M510 selects
; MASM 5.10 compatibility and CASEMAP:NONE keeps identifiers case sensitive.
OPTION M510 OPTION CASEMAP:NONE
; Tuning constants consumed by MMxDoBFrameLumaBlocks:
;   BFRMNONZEROMVDIFFERENTIAL -- subtracted from the macroblock's 0-delta SWD
;                                before that is compared with the best
;                                non-zero-delta SWD; the non-zero delta is kept
;                                only when its SWD is below the reduced value.
;   BFRMEMPTYTHRESHOLD        -- a luma block whose SWD is below this value is
;                                not sent through BFrameDTQ and gets no coded
;                                block pattern bit.
BFRMNONZEROMVDIFFERENTIAL = 400 BFRMEMPTYTHRESHOLD = 256
; Project include files (listing is suppressed while they are read).  The
; structure fields, frame variables, tables and the PITCH constant used below
; are not defined in the visible part of this file -- presumably they come
; from these includes.
.xlist include e3inst.inc include memmodel.inc include iammx.inc include exEDTQ.inc include e3mbad.inc .list
; Everything below is assembled into the EDTQ code segment.
.CODE EDTQ
; Forward DCT routine that BFrameDTQ tail-jumps to; defined elsewhere.
EXTERN MMxDoForwardDCT:NEAR
; Entry points exported from this file.
PUBLIC MMxDoBFrameLumaBlocks PUBLIC MMxDoBFrameChromaBlocks
; Text macros.  StackOffset is redefined at the head of each routine and around
; every adjustment of esp further down; presumably the stack-variable macros in
; the include files expand it -- TODO confirm.  CONST_384 starts out as <ebp>
; and is redefined as <384> later in the file.
StackOffset TEXTEQU <4> CONST_384 TEXTEQU <ebp>
;===============================================================================
; MMxDoBFrameLumaBlocks -- pick the B-frame motion vector delta for one
; macroblock, then code each of its four luma blocks.
;
; Registers at entry (as used by the code below):
;   edx -- cursor to the macroblock's first block descriptor (BlkY1).  The
;          four-block loops stop when bit 4*SIZEOF T_Blk of the cursor becomes
;          set, so the first descriptor must have that bit and the block-index
;          bits below it clear -- presumably guaranteed by descriptor
;          alignment; confirm against the allocator.
;   ebp -- pitch, passed through to ComputeBFrameSWDForCandRef.
;   mm6 -- read as "8 bytes of 1" by ComputeBFrameSWDForCandRef's horizontal
;          half-pel path; presumably set up by the caller.
;   mm7 -- NOTE(review): the 0-delta loop accumulates SWD into mm7 without
;          clearing it here; looks like the caller must enter with mm7 == 0.
;
; On return:
;   edx -- back at the macroblock's first block descriptor.
;   CodedBlocksB holds the 4-bit luma coded block pattern; each luma block has
;   BHMV/BVMV, BestBiDiMVs and a per-block (not cumulative) BestBlkLvlSWD;
;   BSWDTotal has been increased by the SWD of every block that was coded.
;   The P-frame/B-frame quantizer variables are swapped back.
;===============================================================================
MMxDoBFrameLumaBlocks:
; Exchange the P-frame and B-frame copies of QPDiv2, CodeStreamCursor and
; Recip2QPToUse so the shared quantizer path sees the B-frame values (undone
; before ret), and stash INTER1MV as the block type.
mov eax,QPDiv2 ; Swap these so Quantizer uses right level. mov ebx,BQPDiv2 mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov cl,INTER1MV mov BRecip2QPToUse,eax mov StashBlockType,cl
; ---- Pass 1: SWD with zero MV delta, one iteration per luma block. ----------
; The P-frame MV bytes index WeightForwardMotion: offset +0 gives the component
; to apply to the past reference, offset +64 the one for the future reference.
; Both are recorded in the block's *0Delta fields.  mm7 keeps accumulating, so
; BlkLvlSWD0Delta of block N is the running total for blocks 1..N.
BFrameSWDLoop_0MV:
mov ecx,[edx].BlkY1.MVs xor ebx,ebx mov bl,[edx].BlkY1.PVMV ; P-frame Vertical MV lea edi,WeightForwardMotion xor eax,eax and ecx,0FFH ; P-frame Horizontal MV mov al,[edi+ebx] ; VMV for past ref. mov bl,[edi+ebx+64] ; VMV for future ref. mov [edx].BlkY1.VMVb0Delta,bl mov bl,[edi+ecx+64] ; HMV for future ref. mov [edx].BlkY1.HMVb0Delta,bl mov bl,[edi+ecx] ; HMV for past ref. mov [edx].BlkY1.VMVf0Delta,al ; Record candidate VMVf. xor ecx,ecx ; Keep pairing happy. mov [edx].BlkY1.HMVf0Delta,bl ; Record candidate HMVf. mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
call ComputeBFrameSWDForCandRef
movdf [edx].BlkY1.BlkLvlSWD0Delta,mm7 ; Stash SWD. add edx,SIZEOF T_Blk lea edi,WeightForwardMotion test dl,4*SIZEOF T_Blk ; Quit when fourth block done. je BFrameSWDLoop_0MV
; ---- Decide whether to search non-zero deltas at all. -----------------------
; The search is skipped (zero delta kept) when the OR of the two P-frame MV
; bytes and (BlockType xor INTER1MV) is zero, or when the macroblock's 0-delta
; SWD (running total stored with block 4) is <= BFrmZeroVectorThreshold.
; esi is set back to the macroblock's first descriptor; search state starts at 0.
mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs mov cl,[edx-4*SIZEOF T_Blk].BlockType xor cl,INTER1MV or al,ah lea esi,[edx-4*SIZEOF T_Blk] ; Reset MacroBlockActionDescr cursor. or al,cl mov ecx,[edx-SIZEOF T_Blk].BlkY1.BlkLvlSWD0Delta je BelowBFrmZeroThreshold ; Jump if P frm macroblock uses 0 motion vector. xor eax,eax cmp ecx,BFrmZeroVectorThreshold mov CurrSWDState,eax ; Record ME engine state. jle BelowBFrmZeroThreshold
; Seed the "best so far" running totals with the 0-delta results.  The BlkU
; slot gets block 4's total because the early-out test below always compares
; against the *next* block's best total.
mov edx,[esi].BlkY1.BlkLvlSWD0Delta ; Remember 0-MV SWDs. mov ecx,[esi].BlkY2.BlkLvlSWD0Delta mov [esi].BlkY1.BestBlkLvlSWD,edx mov [esi].BlkY2.BestBlkLvlSWD,ecx mov edx,[esi].BlkY3.BlkLvlSWD0Delta mov ecx,[esi].BlkY4.BlkLvlSWD0Delta mov [esi].BlkY3.BestBlkLvlSWD,edx mov [esi].BlkY4.BestBlkLvlSWD,ecx mov [esi].BlkU.BestBlkLvlSWD,ecx ; Avoid unintended early out, below. xor edx,edx ; Set best MV to zero.
; ---- Pass 2: table-driven search over MV deltas. ----------------------------
; eax is the byte offset of the current BFrmSWDState entry.  As used here an
; entry holds: byte 0 = horizontal step, byte 1 = vertical step (both added to
; the best delta so far, dl/dh), byte 2 = next state if this candidate is
; rejected, byte 3 = next state if it is accepted.  State 0 ends the search.
; Three loop variants build the per-block candidate vectors, depending on
; whether the resulting horizontal and/or vertical delta is zero.
BFrmSWDLoop:
mov ecx,PD BFrmSWDState[eax] ; cl == HMV; ch == VMV offsets to try. mov BestMV,edx ; Record what the best MV so far is. add cl,dl ; Try this horizontal MV delta. je HMVdIsZero
mov PB CandidateMV,cl ; Record the candidate HMV delta. add ch,dh ; Try this vertical MV delta. mov PB CandidateMV+1,ch ; Record the candidate VMV delta. je VMVdIsZero
; Both deltas non-zero: forward components are table value + delta; the
; backward components are derived by subtracting the P-frame MV component.
VMVdAndHMVdAreNonZero_Loop:
mov edx,[esi].BlkY1.MVs xor ebx,ebx mov bl,dl xor eax,eax mov al,dh add esi,SIZEOF T_Blk mov bl,[edi+ebx] ; TRb * HMV / TRd pxor mm7,mm7 ; Initialize SWD accumulator add bl,cl ; HMVf = TRb * HMV / TRd + HMVd mov al,[edi+eax] ; TRb * VMV / TRd cmp bl,040H ; If too far left or right, quick out. jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl add al,ch ; VMVf = TRb * VMV / TRd + VMVd cmp al,040H ; If too far up or down, quick out. jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al sub bl,dl ; -HMVb = -(HMVf - HMV) mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl sub al,dh ; -VMVb = -(VMVf - VMV) test esi,4*SIZEOF T_Blk mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al je VMVdAndHMVdAreNonZero_Loop
sub esi,4*SIZEOF T_Blk jmp CandidateMVsGotten
; Vertical delta is zero: both vertical components come straight from the
; table (+0 forward, +64 backward); only the horizontal pair is computed.
VMVdIsZero: VMVdIsZero_Loop:
mov edx,[esi].BlkY1.MVs xor eax,eax mov al,dh xor ebx,ebx mov bl,dl add esi,SIZEOF T_Blk mov dh,[edi+eax+64] ; -VMVb = -((TRb - TRd) * VMV) / TRd mov al,[edi+eax] ; TRb * VMV / TRd mov bl,[edi+ebx] ; TRb * HMV / TRd mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al add bl,cl ; HMVf = TRb * HMV / TRd + HMVd mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,dh cmp bl,040H ; If too far left or right, quick out. jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl sub bl,dl ; -HMVb = -(HMVf - HMV) test esi,4*SIZEOF T_Blk mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl je VMVdIsZero_Loop
sub esi,4*SIZEOF T_Blk pxor mm7,mm7 ; Initialize SWD accumulator jmp CandidateMVsGotten
; Candidate rejected (out of range, or SWD early-out): realign esi to the
; macroblock's first descriptor by clearing the block-index bits, keep the
; previous best delta, and take the "rejected" next state (entry byte 2).
BFrameEarlyOutForCandidateMV: MVDeltaOutOfRange:
and esi,-1-7*SIZEOF T_Blk ; Reset block action descr cursor. mov ebx,CurrSWDState ; Reload ME engine state. xor eax,eax mov edx,BestMV ; Previous best MV is still best. mov al,BFrmSWDState[ebx+2] ; Get next State number. jmp ProceedWithNextCand
; Horizontal delta is zero: mirror image of the VMVdIsZero case.
HMVdIsZero:
mov PB CandidateMV,cl ; Record the candidate HMV delta. add ch,dh ; Try this vertical MV delta. mov PB CandidateMV+1,ch ; Record the candidate VMV delta.
HMVdIsZeroLoop:
mov edx,[esi].BlkY1.MVs xor ebx,ebx mov bl,dl xor eax,eax mov al,dh add esi,SIZEOF T_Blk mov dl,[edi+ebx+64] ; -HMVb = -((TRb - TRd) * HMV) / TRd mov bl,[edi+ebx] ; TRb * HMV / TRd mov al,[edi+eax] ; TRb * VMV / TRd mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl add al,ch ; VMVf = TRb * VMV / TRd + VMVd mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,dl cmp al,040H ; If too far up or down, quick out. jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al sub al,dh ; -VMVb = -(VMVf - VMV) test esi,4*SIZEOF T_Blk mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al je HMVdIsZeroLoop
sub esi,4*SIZEOF T_Blk pxor mm7,mm7 ; Initialize SWD accumulator
; Evaluate the candidate one block at a time.  After block N the running total
; in mm7 is compared with the best vector's stored total for block N+1 (read
; through the BlkY2 field of the current cursor); if the candidate is already
; no better, it is abandoned without evaluating the remaining blocks.
CandidateMVsGotten: BFrameSWDLoop_Non0MVCandidate:
xor eax,eax xor ebx,ebx mov al,[esi].BlkY1.CandVMVf mov edi,[esi].BlkY1.BlkOffset ; Address of 0-MV blk within frame. mov bl,[esi].BlkY1.CandHMVf mov edx,esi
call ComputeBFrameSWDForCandRef
movdf ecx,mm7 mov eax,[edx].BlkY2.BestBlkLvlSWD lea esi,[edx+SIZEOF T_Blk] ; Early out if the first N blocks for cmp ecx,eax ; this cand are worse than the first jge BFrameEarlyOutForCandidateMV ; N+1 blocks for previous best.
test esi,4*SIZEOF T_Blk ; Quit when fourth block done. mov [esi-SIZEOF T_Blk].BlkY1.CandBlkLvlSWD,ecx ; Stash SWD. je BFrameSWDLoop_Non0MVCandidate
; This candidate is best so far.
; Promote the candidate: copy its running totals and bidirectional MVs into
; the Best* fields, make its delta the new best (edx), and take the "accepted"
; next state (entry byte 3).
mov [esi-4*SIZEOF T_Blk].BlkY4.BestBlkLvlSWD,ecx mov ebx,CurrSWDState ; Reload ME engine state. mov [esi-4*SIZEOF T_Blk].BlkU.BestBlkLvlSWD,ecx sub esi,4*SIZEOF T_Blk xor eax,eax mov edx,CandidateMV ; Candidate was best MV. mov ecx,[esi].BlkY3.CandBlkLvlSWD mov [esi].BlkY3.BestBlkLvlSWD,ecx mov ecx,[esi].BlkY2.CandBlkLvlSWD mov [esi].BlkY2.BestBlkLvlSWD,ecx mov ecx,[esi].BlkY1.CandBlkLvlSWD mov [esi].BlkY1.BestBlkLvlSWD,ecx mov ecx,[esi].BlkY4.CandBiDiMVs mov [esi].BlkY4.BestBiDiMVs,ecx mov ecx,[esi].BlkY3.CandBiDiMVs mov [esi].BlkY3.BestBiDiMVs,ecx mov ecx,[esi].BlkY2.CandBiDiMVs mov [esi].BlkY2.BestBiDiMVs,ecx mov ecx,[esi].BlkY1.CandBiDiMVs mov [esi].BlkY1.BestBiDiMVs,ecx mov al,BFrmSWDState[ebx+3] ; Get next State number.
; Record the next state; a non-zero state means another candidate to try.
ProceedWithNextCand:
mov CurrSWDState,eax ; Record ME engine state. test eax,eax lea edi,WeightForwardMotion jne BFrmSWDLoop
; ---- Search finished: is the best non-zero delta worth it? ------------------
; It is kept only if its macroblock SWD is below the 0-delta SWD minus
; BFRMNONZEROMVDIFFERENTIAL.
mov ecx,[esi].BlkY4.BlkLvlSWD0Delta ; 0MV SWD sub ecx,BFRMNONZEROMVDIFFERENTIAL mov ebx,[esi].BlkY4.BestBlkLvlSWD ; Best non-0 MV SWD. cmp ebx,ecx jge NonZeroBFrmVectorNotGoodEnoughGain
; Keep the non-zero delta: store it (dl = horizontal, dh = vertical) in every
; luma block and turn the running totals into per-block SWDs by differencing
; neighbouring totals.
mov [esi].BlkY1.BHMV,dl mov [esi].BlkY2.BHMV,dl mov [esi].BlkY3.BHMV,dl mov [esi].BlkY4.BHMV,dl mov [esi].BlkY1.BVMV,dh mov [esi].BlkY2.BVMV,dh mov [esi].BlkY3.BVMV,dh mov [esi].BlkY4.BVMV,dh mov eax,[esi].BlkY4.BestBlkLvlSWD mov ebx,[esi].BlkY3.BestBlkLvlSWD sub eax,ebx mov ecx,[esi].BlkY2.BestBlkLvlSWD sub ebx,ecx mov edx,[esi].BlkY1.BestBlkLvlSWD sub ecx,edx mov [esi].BlkY4.BestBlkLvlSWD,eax mov [esi].BlkY3.BestBlkLvlSWD,ebx mov [esi].BlkY2.BestBlkLvlSWD,ecx mov [esi].BlkY1.BestBlkLvlSWD,edx jmp BFrmMVSettled
; Keep the zero delta: per-block SWDs are differenced from the 0-delta running
; totals, the 0-delta bidirectional MVs become the best ones, and BHMV/BVMV
; are cleared in all four luma blocks.
BelowBFrmZeroThreshold: NonZeroBFrmVectorNotGoodEnoughGain:
mov ebx,[esi].BlkY4.BlkLvlSWD0Delta mov ecx,[esi].BlkY3.BlkLvlSWD0Delta sub ebx,ecx mov edx,[esi].BlkY2.BlkLvlSWD0Delta sub ecx,edx mov edi,[esi].BlkY1.BlkLvlSWD0Delta sub edx,edi mov [esi].BlkY4.BestBlkLvlSWD,ebx mov [esi].BlkY3.BestBlkLvlSWD,ecx mov [esi].BlkY2.BestBlkLvlSWD,edx mov [esi].BlkY1.BestBlkLvlSWD,edi mov eax,[esi].BlkY1.BiDiMVs0Delta mov [esi].BlkY1.BestBiDiMVs,eax mov eax,[esi].BlkY2.BiDiMVs0Delta mov [esi].BlkY2.BestBiDiMVs,eax mov eax,[esi].BlkY3.BiDiMVs0Delta mov [esi].BlkY3.BestBiDiMVs,eax mov eax,[esi].BlkY4.BiDiMVs0Delta mov [esi].BlkY4.BestBiDiMVs,eax xor eax,eax mov [esi].BlkY1.BHMV,al mov [esi].BlkY2.BHMV,al mov [esi].BlkY3.BHMV,al mov [esi].BlkY4.BHMV,al mov [esi].BlkY1.BVMV,al mov [esi].BlkY2.BVMV,al mov [esi].BlkY3.BVMV,al mov [esi].BlkY4.BVMV,al
; ---- Code the four luma blocks and build the coded block pattern. -----------
; bl starts as 8: a sentinel bit that is shifted right once per block, so the
; loop ends when it drops into CF after the fourth shift.  A coded block ORs in
; the value fetched from BlkEmptyFlag (16 if not empty, per the comment below)
; before the shift, which lands that block's bit at bit 3.
BFrmMVSettled:
mov edx,esi mov bl,8 ; Init coded block pattern
BFrmLumaBlkLoop:
mov esi,[edx].BlkY1.BestBlkLvlSWD ; Get SWD for block. xor eax,eax mov BFrmCBP,bl cmp esi,BFRMEMPTYTHRESHOLD ; Below threshold for forcing empty? mov ecx,BSWDTotal jl BFrmLumaBlkEmpty
; Block is to be coded: add its SWD to BSWDTotal and hand the forward MV to
; BFrameDTQ (eax = low byte of BestBiDiMVs, ebx = second byte).
mov eax,[edx].BlkY1.BestBiDiMVs xor ebx,ebx add ecx,esi mov bl,ah mov BSWDTotal,ecx and eax,0FFH
call BFrameDTQ
mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0. mov al,BFrmCBP
; On the forced-empty path eax is 0 and bl still holds the pattern so far.
BFrmLumaBlkEmpty:
or bl,al ; Factor in CBP bit for this block. add edx,SIZEOF T_Blk shr bl,1 ; CF == 1 when sentinel shifted off jnc BFrmLumaBlkLoop
; Store the luma pattern, step edx back to the macroblock's first descriptor,
; and swap the quantizer variables back to their P-frame values.
mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl sub edx,4*SIZEOF T_Blk mov eax,QPDiv2 ; Restore these for P frame blocks. mov ebx,BQPDiv2 mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov BRecip2QPToUse,eax ret
;===============================================================================
; MMxDoBFrameChromaBlocks -- code the U and V blocks of one B-frame macroblock
; and add their bits to the macroblock's CodedBlocksB.
;
; Registers at entry (as used by the code below):
;   edx      -- cursor to the macroblock's first block descriptor.
;   eax, ebx -- must already hold QPDiv2 and BQPDiv2; the two loads are
;               commented out here with the note "(Loaded in caller.)".
;
; On return edx is back at the macroblock's first block descriptor and the
; P-frame/B-frame quantizer variables are swapped back.
;===============================================================================
MMxDoBFrameChromaBlocks:
; Swap in the B-frame quantizer variables, stash INTER1MV as the block type,
; fetch the U block's forward MV (eax = low byte of BestBiDiMVs, ebx = second
; byte) and advance edx by four descriptors so it addresses the U block.
; mov eax,QPDiv2 ; Swap these so Quantizer uses right level. ; mov ebx,BQPDiv2 ; (Loaded in caller.) mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov cl,INTER1MV mov BRecip2QPToUse,eax mov StashBlockType,cl mov eax,[edx].BlkU.BestBiDiMVs xor ebx,ebx mov bl,ah and eax,0FFH add edx,4*SIZEOF T_Blk ; To know we're working on chroma.
call BFrameDTQ
; OR the U block's flag into CodedBlocksB, then set up the same call for the V
; block (edx advanced one more descriptor).
mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0. mov al,[edx-4*SIZEOF T_Blk].CodedBlocksB or bl,al ; Factor in CBP bit for this block. mov eax,[edx-4*SIZEOF T_Blk].BlkV.BestBiDiMVs mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl xor ebx,ebx mov bl,ah and eax,0FFH add edx,SIZEOF T_Blk
call BFrameDTQ
; OR the V block's flag (read two bytes further into BlkEmptyFlag) into
; CodedBlocksB, swap the quantizer variables back, and rewind edx by the five
; descriptors it was advanced.
mov bl,BlkEmptyFlag[ebx+2] ; Fetch 32 if block not empty; else 0. mov al,[edx-5*SIZEOF T_Blk].CodedBlocksB or bl,al ; Factor in CBP bit for this block. mov eax,QPDiv2 ; Restore these for P frame blocks. mov [edx-5*SIZEOF T_Blk].CodedBlocksB,bl mov ebx,BQPDiv2 mov QPDiv2,ebx mov BQPDiv2,eax mov eax,CodeStreamCursor mov ebx,BCodeStreamCursor mov CodeStreamCursor,ebx mov BCodeStreamCursor,eax mov eax,Recip2QPToUse mov ebx,BRecip2QPToUse mov Recip2QPToUse,ebx mov BRecip2QPToUse,eax sub edx,5*SIZEOF T_Blk ret
;===============================================================================
; ComputeBFrameSWDForCandRef -- measure how well one block of the past
; reference frame, displaced by a candidate forward motion vector, matches the
; B-frame target block, and add the result to the accumulator in mm7.
;
; ebp -- Pitch ; edi -- Address of (0-MV) block within frame. ; edx -- Block Action Descriptor cursor ; ebx -- HMVf (HMV to apply to past reference) biased by 96. ; eax -- VMVf (VMV to apply to past reference) biased by 96.
;
; The low bit of each vector component selects half-pel interpolation in that
; direction.  The reference address is formed from PreviousFrameBaseAddress,
; edi, (VMVf/2)*PITCH and HMVf/2, less 48*PITCH+48 to remove the bias of 96
; half pels; the target address is BFrameBaseAddress + edi.
;
; Per the in-line comments, only part of the block is sampled: even pels on
; odd lines and odd pels on even lines.
;
; eax, ebx, ecx, esi, edi and mm0-mm5 are overwritten.  edx and ebp are not
; written.  mm6 is read as "8 bytes of 1" on the horizontal half-pel path and
; is rebuilt to that value on both half-pel paths before returning.
StackOffset TEXTEQU <8> ComputeBFrameSWDForCandRef:
; ZF from the test survives the mov and lea; eax becomes 3*VMVf as the first
; step of scaling the vertical component to a line offset.
test al,1 mov ecx,PreviousFrameBaseAddress lea eax,[eax+eax*2] ; Start of VMVf*384 jne ME_VMVfAtHalfPelPosition
ME_VMVfAtFullPelPosition:
; Assembly-time guard: the 3*VMVf << 6 scaling below is only (VMVf/2)*PITCH
; when PITCH is 384, so any other PITCH assembles the invalid "****" line.
IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF
shl eax,6 add ecx,edi shr ebx,1 ; CF == 1 iff HMVf is at half pel. jc ME_VMVfAtFull_HMVfAtHalfPelPosition
; ---- Full pel vertically and horizontally: reference used as is. ------------
ME_VMVfAtFull_HMVfAtFullPelPosition:
lea esi,[ecx+eax-48*PITCH-48] lea ecx,[ebp+ebp*2] add esi,ebx ; Address of past reference block. mov eax,BFrameBaseAddress add edi,eax ; Address of target block. lea ebx,[ebp+ebp*4] movq mm0,[esi+ebp*1] psubw mm0,[edi+ebp*1] ; Get diff for line 1. movq mm1,[esi+ecx] ; Ref MB, upper left block, Line 3. psllw mm0,8 ; Extract diffs for line 1 even pels. psubw mm1,[edi+ecx] ; Diff for line 3. pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1. movq mm2,[esi+ebx] psllw mm1,8 psubw mm2,[edi+ebx] pmaddwd mm1,mm1 movq mm3,[esi+PITCH*7] psllw mm2,8 psubw mm3,[edi+PITCH*7] pmaddwd mm2,mm2 movq mm4,[esi] ; Ref MB, upper left blk, Line 0. psllw mm3,8 psubw mm4,[edi] ; Diff for line 0. paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3). movq mm1,[esi+ebp*2] pmaddwd mm3,mm3 psubw mm1,[edi+ebp*2] paddusw mm0,mm2 movq mm2,[esi+ebp*4] pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0. psubw mm2,[edi+ebp*4] paddusw mm0,mm3 movq mm3,[esi+ecx*2] pmaddwd mm1,mm1 psubw mm3,[edi+ecx*2] pmaddwd mm2,mm2 paddusw mm0,mm4 pmaddwd mm3,mm3 paddusw mm0,mm1 ; paddusw mm0,mm2 ; paddusw mm0,mm3 ; punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1. ; paddusw mm0,mm1 ; mm0[48:63] is SWD for block. ; psrlq mm0,48 ; SWD for block. ; paddd mm7,mm0 ; mm7 is SWD for all four blocks. ; ret
; ---- Full pel vertically, half pel horizontally. ----------------------------
; Multiplying each 16-bit pel pair by mm6 (every byte == 1) puts the sum of the
; pair's two pels in the high byte of the word, which is then halved.
ME_VMVfAtFull_HMVfAtHalfPelPosition:
lea esi,[ecx+eax-48*PITCH-48] mov eax,BFrameBaseAddress add esi,ebx ; Address of past reference block. add edi,eax ; Address of target block. lea ecx,[ebp+ebp*2] movq mm0,mm6 ; 8 bytes of 1 pmullw mm0,[esi] ; <(P07+P06)*256+junk ...> movq mm1,mm6 pmullw mm1,[esi+ebp*2] movq mm2,mm6 pmullw mm2,[esi+ebp*4] movq mm3,mm6 movq mm4,[edi] ; <C07 C06 C05 C04 C03 C02 C01 C00> psrlw mm0,1 ; <(P07+P06)*256/2+junk ...> pmullw mm3,[esi+ecx*2] psllw mm4,8 ; <C06*256 C04*256 C02*256 C00*256> movq mm5,[edi+ebp*2] psrlw mm1,1 psubw mm0,mm4 ; <(P07+P06)*256/2-C06*256+junk ...> psllw mm5,8 movq mm4,[edi+ebp*4] psrlw mm2,1 psubw mm1,mm5 psllw mm4,8 movq mm5,[edi+ecx*2] psrlw mm3,1 psubw mm2,mm4 pmaddwd mm0,mm0 ; SSD for even pels of line 0. pmaddwd mm1,mm1 psllw mm5,8 psubw mm3,mm5 pmaddwd mm2,mm2 pmaddwd mm3,mm3 movq mm5,mm6 pmullw mm6,[esi+ebp*1+1] ; <(P18+P17)*256+junk ...> movq mm4,mm5 pmullw mm5,[esi+ecx+1] paddusw mm0,mm1 ; Accum SSD for lines 0 and 2. paddusw mm2,mm3 movq mm1,mm4 pmullw mm4,[esi+PITCH*5+1] paddusw mm0,mm2 pmullw mm1,[esi+PITCH*7+1] psrlw mm6,1 ; <(P18+P17)*256/2+junk ...> psubw mm6,[edi+ebp*1] ; <(P18+P17)*256/2-C17*256+junk ...> psrlw mm5,1 psubw mm5,[edi+ecx] psrlw mm4,1 psubw mm4,[edi+PITCH*5] pmaddwd mm6,mm6 ; SSD for odd pels of line 1. pmaddwd mm5,mm5 psrlw mm1,1 psubw mm1,[edi+PITCH*7] pmaddwd mm4,mm4 pmaddwd mm1,mm1 paddusw mm0,mm6 pxor mm6,mm6 paddusw mm0,mm5 pcmpeqb mm5,mm5 paddusw mm0,mm4 psubb mm6,mm5 ; Restore 8 bytes of 1. paddusw mm0,mm1 punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1. ; paddusw mm0,mm1 ; mm0[48:63] is SWD for block. ; psrlq mm0,48 ; SWD for block. ; paddd mm7,mm0 ; mm7 is SWD for all four blocks. ; ret
; ---- Half pel vertically (horizontally full or half pel). -------------------
; An odd VMVf leaves an extra PITCH/2 in the scaled offset, which the lea
; removes.  esi addresses the even lines; ecx addresses the odd lines and is
; one byte further on when HMVf is also at half pel (the adc folds in CF).
ME_VMVfAtHalfPelPosition:
IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF shl eax,6 lea ecx,[ecx+edi-48*PITCH-48-PITCH/2] add ecx,eax mov eax,BFrameBaseAddress shr ebx,1 ; CF == 1 iff HMVf is at half pel. mov esi,ecx ; esi and ecx same if HMVf at full pel, adc ecx,ebx ; but inc ecx if HMVf is at half pel. add esi,ebx add edi,eax ; Address of target block. lea ebx,[ebp+ebp*2]
; Lines 0-3: adjacent reference lines are summed bytewise and halved; mm6 is
; temporarily a 0x00FF-per-word mask and is reloaded with target pels later.
movq mm0,[esi] ; <P07 P06 ...> pcmpeqb mm6,mm6 movq mm1,[ecx+ebp*1] ; <P17 P16 ...> or <P18 P17 ...> psrlw mm6,8 movq mm2,[esi+ebp*2] ; <P27 P26 ...> paddb mm0,mm1 ; <P07+P17 junk ...> or <P07+P18 junk ...> movq mm3,[ecx+ebx] ; <P37 P36 ...> or <P38 P37 ...> paddb mm1,mm2 ; <junk P16+P26 ...> or <junk P17+P26 ...> movq mm4,[esi+ebp*4] ; <P47 P46 ...> paddb mm2,mm3 ; <P27+P37 junk ...> or <P27+P38 junk ...> paddb mm3,mm4 ; <junk P36+P46 ...> or <junk P37+P46 ...> psrlw mm0,1 ; <(P07+P17)/2 junk ...> or (P07+P18)/2 junk ...> pand mm1,mm6 ; <P16+P26 ...> or <P17+P26 ...> psrlw mm2,1 ; <(P27+P37)/2 junk ...> or (P27+P38)/2 junk ...> movq mm5,[edi+ebp*1] ; <C17 C16 C15 C14 C13 C12 C11 C10> pand mm3,mm6 ; <P36+P46 ...> or <P37+P46 ...> movq mm6,[edi+ebx] ; <C37 C36 C35 C34 C33 C32 C31 C30> psllw mm5,8 ; <C16 0 C14 0 C12 0 C10 0> psubw mm0,[edi] ; <(P07+P17)/2-C07 junk ...> or ... psllw mm1,7 ; <(P16+P26)/2 ...> or <(P17+P26)/2 ...> psubw mm2,[edi+ebp*2] ; <(P27+P37)/2-C27 junk ...> or ... psllw mm6,8 ; <C36 0 C34 0 C32 0 C30 0> pmaddwd mm0,mm0 ; SSD of even pels of line 0. psubw mm1,mm5 ; <(P16+P26)/2-C16 junk ...> or ... pmaddwd mm1,mm1 ; SSD of odd pels of line 1. psllw mm3,7 ; <(P36+P46)/2 ...> or <(P37+P46)/2 ...> pmaddwd mm2,mm2 ; SSD of even pels of line 2. psubw mm3,mm6 ; <(P36+P46)/2-C36 junk ...> or ... pmaddwd mm3,mm3 ; SSD of odd pels of line 3. pcmpeqb mm6,mm6 paddusw mm0,mm1
; Lines 4-7: same scheme; mm4 still holds reference line 4 from above and the
; line at offset 8*pitch supplies the lower neighbour of line 7.
movq mm1,[ecx+PITCH*5] paddusw mm0,mm2 movq mm2,[esi+ebx*2] paddusw mm0,mm3 movq mm3,[ecx+PITCH*7] paddb mm4,mm1 paddb mm1,mm2 paddb mm2,mm3 paddb mm3,[esi+ebp*8] psrlw mm6,8 pand mm1,mm6 psrlw mm4,1 movq mm5,[edi+PITCH*5] psrlw mm2,1 pand mm3,mm6 psllw mm5,8 movq mm6,[edi+PITCH*7] psllw mm1,7 psubw mm4,[edi+ebp*4] psllw mm3,7 psubw mm2,[edi+ebx*2] psllw mm6,8 pmaddwd mm4,mm4 psubw mm1,mm5 pmaddwd mm2,mm2 psubw mm3,mm6 pmaddwd mm1,mm1 pxor mm6,mm6 pmaddwd mm3,mm3 paddusw mm0,mm4 pcmpeqb mm5,mm5 paddusw mm0,mm1 psubb mm6,mm5 ; Restore 8 bytes of 1. paddusw mm0,mm2 paddusw mm0,mm3 ; punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1. ; paddusw mm0,mm1 ; mm0[48:63] is SWD for block. ; psrlq mm0,48 ; SWD for block. ; paddd mm7,mm0 ; mm7 is SWD for all four blocks. ; ret
;===============================================================================
; ebp -- Pitch ; edx -- Block Action Descriptor cursor ; ebx -- VMVf (VMV to apply to past reference) biased by 96. ; eax -- HMVf (HMV to apply to past reference) biased by 96.
StackOffset TEXTEQU <8>
BFrameDTQ:
test bl,1 lea ebx,[ebx+ebx*2] ; Start of VMVf*384 mov ecx,PreviousFrameBaseAddress jne Diff_VMVfAtHalfPelPosition
Diff_VMVfAtFullPelPosition:
IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF shl ebx,6 mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame. shr eax,1 ; CF == 1 iff HMVf is at half pel. jc Diff_VMVfAtFull_HMVfAtHalfPelPosition
Diff_VMVfAtFull_HMVfAtFullPelPosition:
lea esi,[ecx+ebx-48*PITCH-48] add eax,edi add esi,eax ; Address of past reference block. mov ecx,PITCH/4 ; Pitch for past reference blk, div 4. mov eax,BFrameBaseAddress ; Address of target block. mov PastRefPitchDiv4,ecx add edi,eax ; Address of target block. jmp Diff_GetFutureContribToPred
Diff_VMVfAtHalfPelPosition:
IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF shl ebx,6 mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame. shr eax,1 ; CF == 1 iff HMVf is at half pel. jc Diff_VMVfAtHalf_HMVfAtHalfPelPosition
Diff_VMVfAtHalf_HMVfAtFullPelPosition:
lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias. add eax,edi add esi,eax ; Address of past reference block. lea eax,PelDiffs-32 pcmpeqb mm6,mm6 pcmpeqb mm7,mm7 ; 8 bytes -1 movq mm2,[esi] ; Line0 paddb mm6,mm6 ; 8 bytes of 0xFE.
@@:
movq mm1,[esi+ebp*1] ; Line1 movq mm0,mm2 ; Line0 movq mm2,[esi+ebp*2] ; Line2 psubb mm1,mm7 ; Line1+1 paddb mm0,mm1 ; Line0+Line1+1 paddb mm1,mm2 ; Line1+Line2+1 pand mm0,mm6 ; pre-clean pand mm1,mm6 ; pre-clean add eax,32 ; Advance pointer for PelDiffs output. psrlq mm0,1 ; (Line0+Line1+1)/2 lea esi,[esi+ebp*2] ; Advance input ptr 2 lines. psrlq mm1,1 ; (Line1+Line2+1)/2 movq [eax],mm0 ; Store Past Ref for Line0 movq [eax+16],mm1 ; Store Past Ref for Line1 test al,32 ; Iterate twice jne @b
test al,64 ; Iterate twice. mov ecx,4 ; Pitch for past reference blk, div 4. mov PastRefPitchDiv4,ecx jne @b
mov eax,BFrameBaseAddress lea esi,PelDiffs ; Address of interpolated past ref blk. add edi,eax ; Address of target block. jmp Diff_GetFutureContribToPred Diff_VMVfAtFull_HMVfAtHalfPelPosition:
lea esi,[ecx+ebx-48*PITCH-48] ; Begin get pastrefaddr. Del bias. add eax,edi add esi,eax ; Address of past reference block. lea eax,PelDiffs-32 lea ebx,Pel_Rnd xor ecx,ecx
@@:
movq mm0,[esi+1] ; <P08 P07 P06 P05 P04 P03 P02 P01> pcmpeqb mm7,mm7 mov cl,[esi] ; P00 movq mm2,mm0 ; <P08 P07 P06 P05 P04 P03 P02 P01> movq mm1,[esi+ebp*1+1] psllq mm2,8 ; <P07 P06 P05 P04 P03 P02 P01 0> paddb mm0,[ebx+ecx*8] ; <P08+1 P07+1 ... P01+P00+1> movq mm3,mm1 mov cl,[esi+ebp*1] psllq mm3,8 paddb mm1,mm3 paddb mm0,mm2 ; <P08+P07+1 P07+P06+1 ... P01+P00+1> paddb mm1,[ebx+ecx*8] paddb mm7,mm7 ; 8 bytes of 0xFE. pand mm0,mm7 ; pre-clean pand mm1,mm7 ; pre-clean add eax,32 ; Advance pointer for PelDiffs output. psrlq mm0,1 ; <(P08+P07+1)/2 ...> lea esi,[esi+ebp*2] ; Advance input ptr 2 lines. psrlq mm1,1 movq [eax],mm0 ; Store Past Ref for Line0 movq [eax+16],mm1 ; Store Past Ref for Line1 test al,32 ; Iterate twice jne @b
test al,64 ; Iterate twice. mov cl,4 ; Pitch for past reference blk, div 4. mov PastRefPitchDiv4,ecx jne @b
mov eax,BFrameBaseAddress lea esi,PelDiffs ; Address of interpolated past ref blk. add edi,eax ; Address of target block. jmp Diff_GetFutureContribToPred
Diff_VMVfAtHalf_HMVfAtHalfPelPosition:
lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias. add eax,edi add esi,eax ; Address of past reference block. lea eax,PelDiffs-32 lea ebx,Pel_Rnd xor ecx,ecx
movq mm3,[esi+1] ; 0A: <P08 P07 P06 P05 P04 P03 P02 P01> pcmpeqb mm7,mm7 mov cl,[esi] ; 0B: P00 movq mm0,mm3 ; 0C: <P08 P07 P06 P05 P04 P03 P02 P01> paddb mm7,mm7 ; 8 bytes of 0xFE. psllq mm0,8 ; 0D: <P07 P06 P05 P04 P03 P02 P01 0> paddb mm3,[ebx+ecx*8] ; 0E: <P08+1 P07+1 ... P01+P00+1> movq mm6,mm7 ; 8 bytes of 0xFE.
@@:
movq mm1,[esi+ebp*1+1] ; 1A: <P18 P17 P16 P15 P14 P13 P12 P11> paddb mm0,mm3 ; 0F: <P08+P07+1 ... P01+P00+1> mov cl,[esi+ebp*1] ; 1B: P10 movq mm3,mm1 ; 1C: <P18 P17 P16 P15 P14 P13 P12 P11> movq mm2,[esi+ebp*2+1] ; 2A: <P28 P27 P26 P25 P24 P23 P22 P21> psllq mm3,8 ; 1D: <P17 P16 P15 P14 P13 P12 P11 0> paddb mm1,[ebx+ecx*8] ; 1E: <P18+1 P17+1 ... P11+P10+1> movq mm4,mm2 ; 2C: <P28 P27 P26 P25 P24 P23 P22 P21> mov cl,[esi+ebp*2] ; 2B: P20 paddb mm1,mm3 ; 1F: <P18+P17+1 ... P11+P10+1> pandn mm6,mm1 ; 0G: <(P18+P17+1)&1 ...> psllq mm4,8 ; 2D: <P27 P26 P25 P24 P23 P22 P21 0> paddb mm2,[ebx+ecx*8] ; 2E: <P28+1 P27+1 ... P21+P20+1> movq mm5,mm6 ; 1G: <(P18+P17+1)&1 ...> paddb mm2,mm4 ; 2F: <P28+P27+1 ... P21+P20+1> pand mm6,mm0 ; 0H: <(P18+P17+1)&(P08+P07+1)&1 ...> pand mm5,mm2 ; 1H: <(P18+P17+1)&(P28+P27+1)&1 ...> pand mm0,mm7 ; 0I: pre-clean for divide pand mm1,mm7 ; 1I: pre-clean for divide psrlq mm0,1 ; 0J: <(P08+P07+1)/2 ...> movq mm3,mm2 ; Save line 2 for next iter's line 0. psrlq mm1,1 ; 1J: <(P18+P17+1)/2 ...> pand mm2,mm7 ; 2I: pre-clean for divide paddb mm0,mm1 ; 0K: <(P08+P07+1)/2+(P18+P17+1)/2 ...> paddb mm6,mm0 ; 0L: <(P08+P07+P18+P17+2)/2 ...> psrlq mm2,1 ; 2J: <(P28+P27+1)/2 ...> paddb mm1,mm2 ; 1K: <(P18+P17+1)/2+(P28+P27+1)/2 ...> pand mm6,mm7 ; 0M: pre-clean for divide paddb mm5,mm1 ; 1L: <(P18+P17+P28+P27+2)/2 ...> psrlq mm6,1 ; 0M: <(P08+P07+P18+P17+2)/4 ...> add eax,32 ; Advance pointer for PelDiffs output. pand mm5,mm7 ; 1M: pre-clean for divide lea esi,[esi+ebp*2] ; Advance input ptr 2 lines. psrlq mm5,1 ; 1N: <(P18+P17+P28+P27+2)/4 ...> movq [eax],mm6 ; 0O: Store Past Ref for Line0 pxor mm0,mm0 ; So that add of mm3 is just like movq. movq [eax+16],mm5 ; 1O: Store Past Ref for Line1 movq mm6,mm7 ; 8 bytes of 0xFE. test al,32 ; Iterate twice jne @b
test al,64 ; Iterate twice. mov cl,4 ; Pitch for past reference blk, div 4. jne @b
mov eax,BFrameBaseAddress lea esi,PelDiffs ; Address of interpolated past ref blk. add edi,eax ; Address of target block. mov PastRefPitchDiv4,ecx
Diff_GetFutureContribToPred:
;=============================================================================== ; ; Registers at entry: ; edi -- Pointer to target block. ; esi -- Pointer to past reference. ; edx -- Block Descriptor within MacroBlockActionDescriptorStream ; ; Subsequent assignments: ; ; ebp -- Pitch for past reference block, div 4. Loop counter in high 2 bits. ; ecx -- Pointer to future reference block ; ebx -- Pointer to list of indices of multipliers to wt past and future refs. ; eax,edx -- Index of multiplier to weight past and future ref.
xor ecx,ecx mov eax,edx IF SIZEOF T_Blk-16 **** The magic leaks out if size of block descriptor is not 16. ENDIF mov cl,[edx].BlkY1.BestHMVb ; HMV for future reference block. and edx,112 ; Extract block number (times 16). xor ebx,ebx mov BlockActionDescrCursor,eax mov bl,[eax].BlkY1.BestVMVb ; VMV for future reference block. mov eax,LeftRightBlkPosition[edx] mov ebp,ecx CONST_384 TEXTEQU <384> mov edx,UpDownBlkPosition[edx] mov cl,[eax+ecx*2] ; Get horz part of past/future wt sel. IF PITCH-384 **** The magic leaks out if PITCH != 384 ENDIF lea eax,[ebx+ebx*2] ; Start of VMVb*384 mov bl,[edx+ebx*2] ; Get vert part of past/future wt sel. shl eax,6 mov edx,BFrameToFuture lea ebx,Diff_IdxRefWts[ecx+ebx] ; Addr of list of wts for refs. test al,64 ; Is VMVb odd? lea eax,[eax+edx] ; Begin to get addr futr ref. jne Diff_VMVbAtHalfPelPosition
Diff_VMVbAtFullPelPosition:
CONST_384 TEXTEQU <384>
shr ebp,1 ; CF == 1 iff HMVf is at half pel. lea esp,[esp-128] StackOffset TEXTEQU <136> lea ecx,[eax+edi-48*PITCH-48] jc Diff_VMVbAtFull_HMVbAtHalfPelPosition
Diff_VMVbAtFull_HMVbAtFullPelPosition:
CONST_384 TEXTEQU <384>
add ecx,ebp ; Address of future reference block. mov ebp,PastRefPitchDiv4 xor eax,eax xor edx,edx
@@:
StackOffset TEXTEQU <undefined>
mov al,[ebx] ; 0A: Index of weights for line 0. add esp,32 ; Advance Pel Difference cursor mov dl,[ebx+1] ; 1A: Index of weights for line 1. add ebx,2 ; Advance list ptr for ref weights. movq mm0,[ecx] ; 0B: <F07 F06 F05 F04 F03 F02 F01 F00> pcmpeqb mm7,mm7 movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...> paddb mm7,mm7 ; 8 bytes of 0xFE movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00> pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...> pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...> paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...> movq mm1,[ecx+PITCH] ; 1B: <F17 F16 F15 F14 F13 F12 F11 F10> paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...> movq mm2,FutureWt_FF_or_00[edx]; 1C: <In?FF:00 ...> pand mm0,mm7 ; 0I: pre-clean movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10> pand mm1,mm2 ; 1E: <In?F17:00 In?F16:00 ...> pandn mm2,mm3 ; 1F: <In?00:P17 In?00:P16 ...> paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...> movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00> psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...> psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...> paddb mm1,mm2 ; 1H: <In?F17+P17:2P17 ...> movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10> pand mm1,mm7 ; 1I: pre-clean add edi,PITCH*2 ; Advance Target Blk cursor psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...> StackOffset TEXTEQU <8+96> movq PelDiffs,mm3 ; 0M: Save pel differences for line 0. StackOffset TEXTEQU <undefined> psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...> add ecx,PITCH*2 ; Advance Future Ref Blk cursor lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor StackOffset TEXTEQU <8+96> movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1. StackOffset TEXTEQU <undefined> add ebp,080000000H ; Iterate twice jnc @b
test ebp,040000000H ; Iterate twice lea ebp,[ebp+040000000H] je @b
StackOffset TEXTEQU <8>
mov ebp,16 lea esi,PelDiffs mov edx,BlockActionDescrCursor jmp MMxDoForwardDCT
Diff_VMVbAtHalfPelPosition:
CONST_384 TEXTEQU <384>
shr ebp,1 ; CF == 1 iff HMVf is at half pel. lea esp,[esp-128] StackOffset TEXTEQU <136> lea ecx,[eax+edi-48*PITCH-48-PITCH/2] jc Diff_VMVbAtHalf_HMVbAtHalfPelPosition
Diff_VMVbAtHalf_HMVbAtFullPelPosition:
CONST_384 TEXTEQU <384>
add ecx,ebp ; Address of future reference block. mov ebp,PastRefPitchDiv4 xor eax,eax xor edx,edx movq mm6,[ecx] ; 0B: <F07 F06 F05 F04 F03 F02 F01 F00> pcmpeqb mm7,mm7 ; 8 bytes -1
@@:
StackOffset TEXTEQU <undefined>
movq mm1,[ecx+PITCH] ; 1a: <f17 f16 f15 f14 f13 f12 f11 f10> movq mm0,mm6 ; 0a: <f07 f06 f05 f04 f03 f02 f01 f00> mov al,[ebx] ; 0A: Index of weights for line 0. psubb mm1,mm7 ; b: <f17+1 ...> movq mm6,[ecx+PITCH*2] ; 2a: <f27 f26 f25 f24 f23 f22 f21 f20> paddb mm0,mm1 ; 0c: <f07+f17+1..> mov dl,[ebx+1] ; 1A: Index of weights for line 1. paddb mm7,mm7 ; 8 bytes of 0xFE paddb mm1,mm6 ; 1c: <f17+f27+1..> pand mm0,mm7 ; 0d: pre-clean pand mm1,mm7 ; 1d: pre-clean psrlq mm0,1 ; 0B: <(F07 = f07+f17+1)/2> movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...> psrlq mm1,1 ; 1B: <(F17 = f17+f27+1)/2> movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00> pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...> pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...> paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...> add ebx,2 ; Advance list ptr for ref weights. paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...> movq mm2,FutureWt_FF_or_00[edx]; 1C: <In?FF:00 ...> pand mm0,mm7 ; 0I: pre-clean movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10> pand mm1,mm2 ; 1E: <In?F17:00 In?F16:00 ...> pandn mm2,mm3 ; 1F: <In?00:P17 In?00:P16 ...> paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...> movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00> psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...> psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...> add esp,32 ; Advance Pel Difference cursor movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10> paddb mm1,mm2 ; 1H: <In?F17+P17:2P17 ...> add ecx,PITCH*2 ; Advance Future Ref Blk cursor pand mm1,mm7 ; 1I: pre-clean add edi,PITCH*2 ; Advance Target Blk cursor psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...> StackOffset TEXTEQU <8+96> movq PelDiffs,mm3 ; 0M: Save pel differences for line 0. StackOffset TEXTEQU <undefined> psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...> pcmpeqb mm7,mm7 ; 8 bytes -1 lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor StackOffset TEXTEQU <8+96> movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1. 
StackOffset TEXTEQU <undefined> pcmpeqb mm7,mm7 ; 8 bytes -1 add ebp,080000000H ; Iterate twice jnc @b
add ebp,040000000H ; Iterate twice test ebp,ebp jns @b
StackOffset TEXTEQU <8>
mov ebp,16 lea esi,PelDiffs mov edx,BlockActionDescrCursor jmp MMxDoForwardDCT
;----------------------------------------------------------------------------
; Diff_VMVbAtFull_HMVbAtHalfPelPosition
;
; Builds the pel differences for one block when the future-reference motion
; vector is at a full-pel position vertically and a half-pel position
; horizontally, then jumps to MMxDoForwardDCT.
;
; One statement per line is required here:  MASM treats everything after a
; ';' as comment text, so instructions must never share a line with a
; preceding comment.
;
; Registers on entry (as used by the code below):
;   ecx + ebp  Address of the future reference block.
;   ebx        List pointer for the reference weights (one byte per line).
;   esi        Past reference block cursor.
;   edi        Target block cursor.
;   esp        Pel difference cursor (PelDiffs is addressed through it).
;
; Registers inside the loop:
;   ebp        PastRefPitchDiv4 in the low bits.  The two top bits double as
;              the pass counter; they are shifted out by the *4 / *8 scaling
;              in the addressing modes, so the pitch arithmetic is unaffected.
;              NOTE(review): this assumes PastRefPitchDiv4 has its two top
;              bits clear -- confirm against the caller.
;   edx        Address of Pel_Rnd, indexed by a pel value scaled by 8.
;   eax        Zero-extended byte index (cleared once, then only al is loaded).
;   mm7        Rebuilt each pass:  all ones, then doubled to 8 bytes of 0xFE.
;              ANDing with 0xFE before "psrlq x,1" clears bit 0 of every byte
;              so the 64-bit shift cannot carry a bit into the neighbouring
;              byte.
;
; Each pass handles two lines; with the counter trick below the loop body
; runs four times, advancing esp by 32 per pass (128 in total, matching the
; StackOffset change from 136 to 8).
;
; On exit:  ebp = 16, esi = address of PelDiffs, edx = BlockActionDescrCursor,
; and control transfers to MMxDoForwardDCT.
;----------------------------------------------------------------------------
Diff_VMVbAtFull_HMVbAtHalfPelPosition:

StackOffset TEXTEQU <136>
CONST_384   TEXTEQU <384>

  add    ecx,ebp                 ; Address of future reference block.
  mov    ebp,PastRefPitchDiv4
  xor    eax,eax                 ; Upper bits stay zero; only al is loaded below.
  lea    edx,Pel_Rnd

@@:

; StackOffset is set to <undefined> while esp is moving and is given a real
; value only around the PelDiffs stores.
; NOTE(review): presumably this makes any other stack-relative reference in
; the loop fail to assemble -- confirm against the include files.
StackOffset TEXTEQU <undefined>

  movq   mm0,[ecx+1]             ; 0a: <f08 f07 f06 f05 f04 f03 f02 f01>
  pcmpeqb mm7,mm7                ; 8 bytes of 0xFF.
  mov    al,[ecx]                ; 0b: f00
  movq   mm2,mm0                 ; 0c: <f08 f07 f06 f05 f04 f03 f02 f01>
  movq   mm1,[ecx+PITCH+1]       ; 1a: <f18 f17 f16 f15 f14 f13 f12 f11>
  psllq  mm2,8                   ; 0d: <f07 f06 f05 f04 f03 f02 f01   0>
  paddb  mm0,[edx+eax*8]         ; 0e: <f08+1 f07+1 ... f01+f00+1>
  movq   mm3,mm1                 ; 1c: <f18 f17 f16 f15 f14 f13 f12 f11>
  mov    al,[ecx+PITCH]          ; 1b: f10
  psllq  mm3,8                   ; 1d: <f17 f16 f15 f14 f13 f12 f11   0>
  paddb  mm0,mm2                 ; 0f: <f08+f07+1 f07+f06+1 ... f01+f00+1>
  paddb  mm1,mm3                 ; 1f: <f18+f17 f17+f16 ... f11 >
  paddb  mm1,[edx+eax*8]         ; 1e: <f18+f17+1 f17+f16+1 ... f11+f10+1>
  paddb  mm7,mm7                 ; 8 bytes of 0xFE.
  mov    al,[ebx]                ; 0A: Index of weights for line 0.
  pand   mm0,mm7                 ; 0g: pre-clean
  movq   mm3,[esi]               ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
  psrlq  mm0,1                   ; 0B: <F07 = (f08+f07+1)/2 ...>
  movq   mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
  pand   mm1,mm7                 ; 1g: pre-clean
  mov    al,[ebx+1]              ; 1A: Index of weights for line 1.
  psrlq  mm1,1                   ; 1B: <F17 = (f18+f17+1)/2 ...>
  pand   mm0,mm2                 ; 0E: <In?F07:00 In?F06:00 ...>
  pandn  mm2,mm3                 ; 0F: <In?00:P07 In?00:P06 ...>
  movq   mm4,FutureWt_FF_or_00[eax]; 1C: <In?FF:00 ...>
  paddb  mm0,mm3                 ; 0G: <In?F07+P07:P07 ...>
  movq   mm3,[esi+ebp*4]         ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>
  paddb  mm0,mm2                 ; 0H: <In?F07+P07:2P07 ...>
  pand   mm0,mm7                 ; 0I: pre-clean
  pand   mm1,mm4                 ; 1E: <In?F17:00 In?F16:00 ...>
  pandn  mm4,mm3                 ; 1F: <In?00:P17 In?00:P16 ...>
  paddb  mm1,mm3                 ; 1G: <In?F17+P17:P17 ...>
  movq   mm3,[edi]               ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
  psrlq  mm0,1                   ; 0K: <In?(F07+P07)/2:P07 ...>
  psubb  mm3,mm0                 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
  add    esp,32                  ; Advance Pel Difference cursor
  add    ecx,PITCH*2             ; Advance Future Ref Blk cursor
  paddb  mm1,mm4                 ; 1H: <In?F17+P17:2P17 ...>
  movq   mm4,[edi+PITCH]         ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
  pand   mm1,mm7                 ; 1I: pre-clean
  add    edi,PITCH*2             ; Advance Target Blk cursor
  psrlq  mm1,1                   ; 1K: <In?(F17+P17)/2:P17 ...>
StackOffset TEXTEQU <8+96>
  movq   PelDiffs,mm3            ; 0M: Save pel differences for line 0.
StackOffset TEXTEQU <undefined>
  psubb  mm4,mm1                 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
  add    ebx,2                   ; Advance list ptr for ref weights.
  lea    esi,[esi+ebp*8]         ; Advance Past Ref Blk cursor
StackOffset TEXTEQU <8+96>
  movq   PelDiffs+16,mm4         ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU <undefined>

; Pass counter kept in the top two bits of ebp.  Adding 0x80000000 toggles
; bit 31:  no carry the first time (loop again), carry the second time (fall
; through).  Adding 0x40000000 then bumps bit 30; the sign bit becomes set
; only after the second fall-through, which ends the loop.
  add    ebp,080000000H          ; Iterate twice
  jnc    @b

  add    ebp,040000000H          ; Iterate twice
  test   ebp,ebp
  jns    @b

StackOffset TEXTEQU <8>

  mov    ebp,16
  lea    esi,PelDiffs            ; esi = address of the saved pel differences.
  mov    edx,BlockActionDescrCursor
  jmp    MMxDoForwardDCT
;----------------------------------------------------------------------------
; Diff_VMVbAtHalf_HMVbAtHalfPelPosition
;
; Builds the pel differences for one block when the future-reference motion
; vector is at a half-pel position both vertically and horizontally, then
; jumps to MMxDoForwardDCT.
;
; One statement per line is required here:  MASM treats everything after a
; ';' as comment text, so instructions must never share a line with a
; preceding comment.
;
; Registers on entry (as used by the code below):
;   ecx + ebp  Address of the future reference block.
;   ebx        List pointer for the reference weights (one byte per line).
;   esi        Past reference block cursor.
;   edi        Target block cursor.
;   esp        Pel difference cursor (PelDiffs is addressed through it).
;
; Registers inside the loop:
;   ebp        PastRefPitchDiv4 in the low bits.  The two top bits double as
;              the pass counter; they are shifted out by the *4 / *8 scaling
;              in the addressing modes, so the pitch arithmetic is unaffected.
;              NOTE(review): this assumes PastRefPitchDiv4 has its two top
;              bits clear -- confirm against the caller.
;   edx        Address of Pel_Rnd, indexed by a pel value scaled by 8.
;   eax        Zero-extended byte index (cleared once, then only al is loaded).
;   mm7        8 bytes of 0xFE for the whole loop.  ANDing with it before
;              "psrlq x,1" clears bit 0 of every byte so the 64-bit shift
;              cannot carry a bit into the neighbouring byte.
;   mm6        Reloaded with 0xFE at the end of every pass; "pandn mm6,x"
;              then extracts bit 0 of each byte of x.
;   mm0 / mm4  Carry the horizontal sum of a line from one pass to the next:
;              line 2 of a pass is line 0 of the following pass, so it is
;              saved in mm4 and mm0 is zeroed, making the "paddb mm0,mm4" at
;              the top of the loop act as a plain copy.
;
; Each pass handles two lines; with the counter trick below the loop body
; runs four times, advancing esp by 32 per pass (128 in total, matching the
; StackOffset change from 136 to 8).
;
; On exit:  ebp = 16, esi = address of PelDiffs, edx = BlockActionDescrCursor,
; and control transfers to MMxDoForwardDCT.
;----------------------------------------------------------------------------
Diff_VMVbAtHalf_HMVbAtHalfPelPosition:

StackOffset TEXTEQU <136>
CONST_384   TEXTEQU <384>

  add    ecx,ebp                 ; Address of future reference block.
  mov    ebp,PastRefPitchDiv4
  xor    eax,eax                 ; Upper bits stay zero; only al is loaded below.
  lea    edx,Pel_Rnd
  movq   mm4,[ecx+1]             ; 0a: <f08 f07 f06 f05 f04 f03 f02 f01>
  pcmpeqb mm7,mm7                ; 8 bytes of 0xFF.
  mov    al,[ecx]                ; 0b: f00
  movq   mm0,mm4                 ; 0c: <f08 f07 f06 f05 f04 f03 f02 f01>
  paddb  mm7,mm7                 ; 8 bytes of 0xFE.
  psllq  mm0,8                   ; 0d: <f07 f06 f05 f04 f03 f02 f01   0>
  paddb  mm4,[edx+eax*8]         ; 0e: <f08+1 f07+1 ... f01+f00+1>
  movq   mm6,mm7                 ; 8 bytes of 0xFE.

@@:

; StackOffset is set to <undefined> while esp is moving and is given a real
; value only around the PelDiffs stores.
; NOTE(review): presumably this makes any other stack-relative reference in
; the loop fail to assemble -- confirm against the include files.
StackOffset TEXTEQU <undefined>

  movq   mm1,[ecx+PITCH+1]       ; 1a: <f18 f17 f16 f15 f14 f13 f12 f11>
  paddb  mm0,mm4                 ; 0f: <f08+f07+1 ... f01+f00+1>
  mov    al,[ecx+PITCH]          ; 1b: f10
  movq   mm3,mm1                 ; 1c: <f18 f17 f16 f15 f14 f13 f12 f11>
  movq   mm2,[ecx+PITCH*2+1]     ; 2a: <f28 f27 f26 f25 f24 f23 f22 f21>
  psllq  mm3,8                   ; 1d: <f17 f16 f15 f14 f13 f12 f11   0>
  paddb  mm1,[edx+eax*8]         ; 1e: <f18+1 f17+1 ... f11+f10+1>
  movq   mm4,mm2                 ; 2c: <f28 f27 f26 f25 f24 f23 f22 f21>
  mov    al,[ecx+PITCH*2]        ; 2b: f20
  paddb  mm1,mm3                 ; 1f: <f18+f17+1 ... f11+f10+1>
  pandn  mm6,mm1                 ; 0g: <(f18+f17+1)&1 ...>
  psllq  mm4,8                   ; 2d: <f27 f26 f25 f24 f23 f22 f21   0>
  paddb  mm2,[edx+eax*8]         ; 2e: <f28+1 f27+1 ... f21+f20+1>
  movq   mm5,mm6                 ; 1g: <(f18+f17+1)&1 ...>
  paddb  mm2,mm4                 ; 2f: <f28+f27+1 ... f21+f20+1>
  pand   mm6,mm0                 ; 0h: <(f18+f17+1)&(f08+f07+1)&1 ...>
  pand   mm5,mm2                 ; 1h: <(f18+f17+1)&(f28+f27+1)&1 ...>
  pand   mm0,mm7                 ; 0i: pre-clean for divide
  pand   mm1,mm7                 ; 1i: pre-clean for divide
  psrlq  mm0,1                   ; 0j: <(f08+f07+1)/2 ...>
  movq   mm4,mm2                 ; Save line 2 for next iter's line 0.
  psrlq  mm1,1                   ; 1j: <(f18+f17+1)/2 ...>
  pand   mm2,mm7                 ; 2i: pre-clean for divide
  paddb  mm0,mm1                 ; 0k: <(f08+f07+1)/2+(f18+f17+1)/2 ...>
  paddb  mm0,mm6                 ; 0l: <(f08+f07+f18+f17+2)/2 ...>
  psrlq  mm2,1                   ; 2j: <(f28+f27+1)/2 ...>
  paddb  mm1,mm2                 ; 1k: <(f18+f17+1)/2+(f28+f27+1)/2 ...>
  pand   mm0,mm7                 ; 0m: pre-clean for divide
  mov    al,[ebx]                ; 0A: Index of weights for line 0.
  paddb  mm1,mm5                 ; 1l: <(f18+f17+f28+f27+2)/2 ...>
  movq   mm3,[esi]               ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
  pand   mm1,mm7                 ; 1m: pre-clean for divide
  movq   mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
  psrlq  mm0,1                   ; 0B: <F07 = (f08+f07+f18+f17+2)/4 ...>
  mov    al,[ebx+1]              ; 1A: Index of weights for line 1.
  psrlq  mm1,1                   ; 1B: <F17 = (f18+f17+f28+f27+2)/4 ...>
  pand   mm0,mm2                 ; 0E: <In?F07:00 In?F06:00 ...>
  pandn  mm2,mm3                 ; 0F: <In?00:P07 In?00:P06 ...>
  movq   mm5,FutureWt_FF_or_00[eax]; 1C: <In?FF:00 ...>
  paddb  mm0,mm3                 ; 0G: <In?F07+P07:P07 ...>
  movq   mm3,[esi+ebp*4]         ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>
  paddb  mm0,mm2                 ; 0H: <In?F07+P07:2P07 ...>
  pand   mm0,mm7                 ; 0I: pre-clean
  pand   mm1,mm5                 ; 1E: <In?F17:00 In?F16:00 ...>
  pandn  mm5,mm3                 ; 1F: <In?00:P17 In?00:P16 ...>
  paddb  mm1,mm3                 ; 1G: <In?F17+P17:P17 ...>
  movq   mm3,[edi]               ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
  psrlq  mm0,1                   ; 0K: <In?(F07+P07)/2:P07 ...>
  psubb  mm3,mm0                 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
  add    esp,32                  ; Advance Pel Difference cursor
  paddb  mm1,mm5                 ; 1H: <In?F17+P17:2P17 ...>
  add    ecx,PITCH*2             ; Advance Future Ref Blk cursor
  movq   mm5,[edi+PITCH]         ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
  pand   mm1,mm7                 ; 1I: pre-clean
  add    edi,PITCH*2             ; Advance Target Blk cursor
  psrlq  mm1,1                   ; 1K: <In?(F17+P17)/2:P17 ...>
StackOffset TEXTEQU <8+96>
  movq   PelDiffs,mm3            ; 0M: Save pel differences for line 0.
StackOffset TEXTEQU <undefined>
  psubb  mm5,mm1                 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
  add    ebx,2                   ; Advance list ptr for ref weights.
  lea    esi,[esi+ebp*8]         ; Advance Past Ref Blk cursor
StackOffset TEXTEQU <8+96>
  movq   PelDiffs+16,mm5         ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU <undefined>
  pxor   mm0,mm0                 ; So that add of mm4 is just like movq.

; Pass counter kept in the top two bits of ebp.  Adding 0x80000000 toggles
; bit 31:  no carry the first time (loop again), carry the second time (fall
; through).  Adding 0x40000000 then bumps bit 30; the sign bit becomes set
; only after the second fall-through, which ends the loop.
; The movq between the add and the jnc does not modify the flags.
  add    ebp,080000000H          ; Iterate twice
  movq   mm6,mm7                 ; 8 bytes of 0xFE.
  jnc    @b

  add    ebp,040000000H          ; Iterate twice
  test   ebp,ebp
  jns    @b

StackOffset TEXTEQU <8>

  mov    ebp,16
  lea    esi,PelDiffs            ; esi = address of the saved pel differences.
  mov    edx,BlockActionDescrCursor
  jmp    MMxDoForwardDCT
; Put CONST_384 back to expanding to ebp (the setting made at the top of this
; file) now that the blocks above, which redefine it as the literal 384, are done.
CONST_384 TEXTEQU <ebp>
; End of the assembly source.
END
|