You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1879 lines
68 KiB
1879 lines
68 KiB
;/* *************************************************************************
|
|
;** INTEL Corporation Proprietary Information
|
|
;**
|
|
;** This listing is supplied under the terms of a license
|
|
;** agreement with INTEL Corporation and may not be copied
|
|
;** nor disclosed except in accordance with the terms of
|
|
;** that agreement.
|
|
;**
|
|
;** Copyright (c) 1995 Intel Corporation.
|
|
;** All Rights Reserved.
|
|
;**
|
|
;** *************************************************************************
|
|
;*/
|
|
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;//
|
|
;// $Header: R:\h26x\h26x\src\enc\e35bme.asv 1.10 29 May 1996 15:37:38 BNICKERS $
|
|
;// $Log: R:\h26x\h26x\src\enc\e35bme.asv $
|
|
;//
|
|
;// Rev 1.10 29 May 1996 15:37:38 BNICKERS
|
|
;// Acceleration of IA version of ME.
|
|
;//
|
|
;// Rev 1.9 14 May 1996 12:18:18 BNICKERS
|
|
;// Initial debugging of MMx B-Frame ME.
|
|
;//
|
|
;// Rev 1.8 09 Jan 1996 16:14:46 BNICKERS
|
|
;// Avoid generating delta MV's that make B frame MV's out of range.
|
|
;//
|
|
;// Rev 1.7 27 Dec 1995 15:32:34 RMCKENZX
|
|
;// Added copyright notice
|
|
;//
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;
|
|
; BFrameMotionEstimation -- This function performs B frame motion estimation for the macroblocks identified in the
|
|
; input list. This is only applicable for H263. This version is tuned for best performance
|
|
; on the Pentium Microprocessor.
|
|
;
|
|
; This function works correctly only if Unrestricted Motion Vectors is enabled. It is not
|
|
; possible to select full pel resolution only; half pel resolution is always selected.
|
|
;
|
|
; Input Arguments:
|
|
;
|
|
; MBlockActionStream
|
|
;
|
|
; The list of macroblocks for which we need to perform motion estimation.
|
|
;
|
|
; Upon input, the following fields must be defined:
|
|
;
|
|
; CodedBlocks -- Bit 6 must be set for the last macroblock to be processed.
|
|
;
|
|
; BlkOffset -- must be defined for each of the blocks in the macroblocks.
|
|
;
|
|
; TargetFrameBaseAddress -- Address of upper left viewable pel in the target Y plane.
|
|
;
|
|
; PreviousFrameBaseAddress -- Address of upper left viewable pel in the Y plane of the previous P frame. Whether this
|
|
; is the reconstructed previous frame, or the original, is up to the caller to decide.
|
|
;
|
|
; FutureFrameBaseAddress -- Address of upper left viewable pel in the Y plane of the future P frame. Whether this
|
|
; is the reconstructed previous frame, or the original, is up to the caller to decide.
|
|
;
|
|
; WeightForwardMotion -- Array of 64 signed chars, each element I equal to ((TRb * (I-32)) / TRd). (See H263 spec.)
|
|
;
|
|
; WeightBackwardMotion -- Array of 64 signed chars, each element I equal to ((TRb - TRd) * (I-32) / TRd). (See spec.)
|
|
;
|
|
; ZeroVectorThreshold -- If the SWDB for a macroblock is less than this threshold, we do not bother searching for a
|
|
; better motion vector. Compute as follows, where D is the average tolerable pel difference
|
|
; to satisfy this threshold. (Initial recommendation: D=2 ==> ZVT=384)
|
|
; ZVT = (128 * ((int)((D**1.6)+.5)))
|
|
;
|
|
; NonZeroDifferential -- After searching for the best motion vector (or individual block motion vectors, if enabled),
|
|
; if the macroblock's SWDB is not better than it was for the zero vector -- not better by at
|
|
; least this amount -- then we revert to the zero vector. We are comparing two macroblock
|
|
; SWDs, both calculated as follows: (Initial recommendation: NZD=128)
|
|
; For each of 128 match points, where D is its Abs Diff, accumulate ((int)(M**1.6)+.5)))
|
|
;
|
|
; EmptyThreshold -- If the SWD for a block is less than this, the block is forced empty. Compute as follows, where D
|
|
; is the average tolerable pel diff to satisfy threshold. (Initial recommendation: D=3 ==> ET=96)
|
|
; ET = (32 * ((int)((D**1.6)+.5)))
|
|
;
|
|
; Output Arguments
|
|
;
|
|
; MBlockActionStream
|
|
;
|
|
; These fields are defined as follows upon return:
|
|
;
|
|
; BHMV and BVMV -- The horizontal and vertical motion vectors, in units of a half pel. These values are intended
|
|
; for coding in the macroblock layer.
|
|
;
|
|
; If Horizontal MV indicates a half pel position, the prediction for the upper left pel of the block
|
|
; is the average of the pel at PastRef and the one at PastRef+1.
|
|
;
|
|
; If Vertical MV indicates a half pel position, the prediction for the upper left pel of the block
|
|
; is the average of the pel at PastRef and the one at PastRef+PITCH.
|
|
;
|
|
; If both MVs indicate half pel positions, the prediction for the upper left pel of the block is the
|
|
; average of the pels at PastRef, PastRef+1, PastRef+PITCH, and PastRef+PITCH+1.
|
|
;
|
|
; BestHMVf, BestVMVf, BestHMVb, BestVMVb -- Motion vector components, as described in H263 spec. They are biased
|
|
; by 060H. Only defined for luma blocks. Caller must define for
|
|
; chroma blocks.
|
|
;
|
|
; CandidateHMVf, CandidateVMVf, CandidateHMVb, CandidateVMVb -- Scratch space for this function.
|
|
;
|
|
; CodedBlocksB -- Bits 4 and 5 are turned on, indicating that the U and V blocks should be processed. (If the
|
|
; FDCT function finds them to quantize to empty, it will mark them as empty.)
|
|
;
|
|
; Bits 0 thru 3 are cleared for each of blocks 1 thru 4 that BFrameMotionEstimation forces empty;
|
|
; they are set otherwise.
|
|
;
|
|
; Bits 6 and 7 are left unchanged.
|
|
;
|
|
; SWDB -- Set to the sum of the SWDBs for the four luma blocks in the macroblock. The SWD for any block that is
|
|
; forced empty, is NOT included in the sum.
|
|
;
|
|
; InterSWDTotal -- The sum of the block SWDBs for all Intercoded macroblocks. None of the blocks forced empty are
|
|
; included in this.
|
|
;
|
|
; InterSWDBlocks -- The number of blocks that make up the InterSWDTotal.
|
|
;
|
|
;
|
|
; Other assumptions:
|
|
;
|
|
; For performance reasons, it is assumed that the current and previous frame are 32-byte aligned, and the pitch is a
|
|
; constant 384. Moreover, the current and previous frames must be out of phase by 2K bytes, i.e. must be an odd
|
|
; multiple of 2K bytes apart. This will assure best utilization of the on-chip cache.
|
|
;
|
|
; Many of the techniques described in MotionEstimation are used here. It is wise to study that module before trying
|
|
;to understand this one.
|
|
;
|
|
; Data structures used for bi-directional motion search:
|
|
;
|
|
; Target Macroblock:
|
|
;
|
|
; The target macroblock is copied to the stack frame so that esp can be used as an induction variable for the block:
|
|
;
|
|
; esp+ 0 AAAABBBB AAAABBBB AAAABBBB AAAABBBB
|
|
; esp+ 64 CCCCDDDDEEEEFFFFCCCCDDDDEEEEFFFFCCCCDDDDEEEEFFFFCCCCDDDDEEEEFFFF
|
|
; esp+ 128 LLLLDDDDLLLLLLLLLLLLLLLLLLLLLLLL
|
|
; esp+ 160 +------++------++------++------++------++------++------++------+
|
|
; esp+ 224 | Blk1 || Blk1 || Blk2 || Blk2 || Blk3 || Blk3 || Blk4 || Blk4 |
|
|
; esp+ 288 |Ln 0-3||Ln 4-7||Ln 0-3||Ln 4-7||Ln 0-3||Ln 4-7||Ln 0-3||Ln 4-7|
|
|
; esp+ 352 +------++------++------++------++------++------++------++------+
|
|
; esp+ 416
|
|
;
|
|
; AAAA is the address of the Future Reference Block to be used.
|
|
; BBBB is the address of the Past Reference Block to be used.
|
|
; AAAA, BBBB, and the next 8 bytes are overwritten by the offset to apply when interpolating future ref block or
|
|
; past ref block.
|
|
; CCCC is the accumulated SWD for the current candidate motion vector.
|
|
; DDDD is the accumulated SWD for the best motion vector so far.
|
|
; One extra DDDD occupies esp+132.
|
|
; EEEE is the address at which to transfer control after calculating SWD.
|
|
; FFFF is the accumulated SWD for the zero motion vector.
|
|
; LLLL is space for local variables.
|
|
;
|
|
; Future reference:
|
|
;
|
|
; For each macroblock, the corresponding macroblock from the future frame is copied into the following reference
|
|
; area, wherein all the X's are bytes initialized to 255. When the projection of the B-frame's future motion
|
|
; vector component falls on a byte valued at 255, we know that it is outside the future macroblock, and thus this
|
|
; is a pel that is only predicted from the past reference.
|
|
;
|
|
; esp+ 704 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+ 744 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+ 784 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+ 824 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+ 864 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+ 904 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX > Rarely used
|
|
; esp+ 944 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+ 984 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+1024 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+1064 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+1104 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+1144 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1184 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1224 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1264 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1304 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1344 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1384 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1424 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1464 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1504 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1544 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+1584 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
|
|
; esp+1624 XXXXXXXXXXXXXXXXXXXXXXXX| || |
|
|
; esp+1664 XXXXXXXXXXXXXXXXXXXXXXXX|Future||Future|
|
|
; esp+1704 XXXXXXXXXXXXXXXXXXXXXXXX| Ref || Ref |
|
|
; esp+1744 XXXXXXXXXXXXXXXXXXXXXXXX| Blk || Blk |
|
|
; esp+1784 XXXXXXXXXXXXXXXXXXXXXXXX| 1 || 2 |
|
|
; esp+1824 XXXXXXXXXXXXXXXXXXXXXXXX| || |
|
|
; esp+1864 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
|
|
; esp+1904 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
|
|
; esp+1944 XXXXXXXXXXXXXXXXXXXXXXXX| || |
|
|
; esp+1984 XXXXXXXXXXXXXXXXXXXXXXXX|Future||Future|
|
|
; esp+2024 XXXXXXXXXXXXXXXXXXXXXXXX| Ref || Ref |
|
|
; esp+2064 XXXXXXXXXXXXXXXXXXXXXXXX| Blk || Blk |
|
|
; esp+2104 XXXXXXXXXXXXXXXXXXXXXXXX| 3 || 4 |
|
|
; esp+2144 XXXXXXXXXXXXXXXXXXXXXXXX| || |
|
|
; esp+2184 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
|
|
; esp+2224 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2264 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2304 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2344 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2384 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2424 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2464 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2504 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2544 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2584 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2624 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
|
|
; esp+2664 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+2704 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+2744 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+2784 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+2824 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
|
|
; esp+2864 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX > Rarely used
|
|
; esp+2904 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+2944 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+2984 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+3024 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+3064 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
|
|
; esp+3104 XXXXXXXXXXXXXXXXXXXXXXXX________ /
|
|
; esp+3136
|
|
;
|
|
; Past Reference:
|
|
;
|
|
; The past reference search area is taken directly from the past frame. It is not necessary to copy any portion
|
|
; of the past frame to a scratch area.
|
|
;
|
|
;
|
|
; Memory layout of the target macroblock, the future reference macroblock, and the full range for the reference area
|
|
; (as restricted to +/- 7 in vertical, and +/- 7 (expandable to +/- 15) in horizontal, is as shown here. Each box
|
|
; represents a cache line (32 bytes), increasing incrementally from left to right, and then to the next row (like
|
|
; reading a book). The 128 boxes taken as a whole represent 4Kbytes. The boxes are populated as follows:
|
|
;
|
|
; R -- Data from the past reference area. Each box contains 23 of the pels belonging to a line of the reference
|
|
; area. The remaining 7 pels of the line is either in the box to the left (for reference areas used to provide
|
|
; predictions for target macroblocks that begin at an address 0-mod-32), or to the right (for target MBs that begin
|
|
; at an address 16-mod-32). There are 30 R's corresponding to the 30-line limit on the vertical distance we might
|
|
; search. The lowercase r's correspond to the lines above and below zero-vertical-motion.
|
|
;
|
|
; F -- Data from the future reference area. Eacg box contains a full line (16 pels) for each of two adjacent
|
|
; macroblocks. There are 16 F's corresponding to the 16 lines of the macroblocks.
|
|
;
|
|
; T -- Data from the target macroblock. Each box contains a full line (16 pels) for each of two adjacent
|
|
; macroblocks. There are 16 C's corresponding to the 16 lines of the macroblocks.
|
|
;
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | T | | R | | F | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | T | | R | | F | | R | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | T | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | T | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | T | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | F | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | F | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | r | | F | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | F | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | F | | R | | F | | r | |
|
|
; +---+---+---+---+---+---+---+---+---+---+---+---+
|
|
; | T | | R | | F | | R | |
|
|
; +---+---+---+---+---+---+---+---+
|
|
;
|
|
; Thus, in a logical sense, the above data fits into one of the 4K data cache pages, leaving the other for all other
|
|
; data. Care has been taken to assure that the tables and the stack space needed by this function fit nicely into
|
|
; the other data cache page. Only the MBlockActionStream remains to conflict with the above data structures. That
|
|
; is both unavoidable, and of minimal consequence.
|
|
|
|
OPTION PROLOGUE:None
|
|
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
|
|
OPTION M510
|
|
|
|
include e3inst.inc
|
|
include e3mbad.inc
|
|
|
|
.xlist
|
|
include memmodel.inc
|
|
.list
|
|
.DATA
|
|
|
|
LocalStorage LABEL DWORD ; Local storage goes on the stack at addresses whose lower 12 bits match this address.
|
|
|
|
DB 544 DUP (?) ; Low 12 bits match those of heavily used part of stack frame.
|
|
|
|
SWDState LABEL BYTE ; State engine rules for finding best motion vector.
|
|
|
|
; 1st number: Horizontal Motion displacement to try, in half pel increments.
|
|
; 2nd number: Vertical Motion displacement to try, in half pel increments.
|
|
; 3rd number: Next state to enter if this motion is better than previous best.
|
|
; 4th number: Next state to enter if previous best is still best.
|
|
|
|
DB -2, 0, 8, 4 ; 0 -- ( 0, 0) Try (-2, 0)
|
|
DB 2, 0, 12, 12 ; 4 -- ( 0, 0) Try ( 2, 0)
|
|
DB 4, 0, 12, 12 ; 8 -- (-2, 0) Try ( 2, 0)
|
|
DB 0, -2, 20, 16 ; 12 -- ( N, 0) Try ( N,-2) (N = {-2,0,2})
|
|
DB 0, 2, 24, 24 ; 16 -- ( N, 0) Try ( N, 2)
|
|
DB 0, 4, 24, 24 ; 20 -- ( N,-2) Try ( N, 2)
|
|
|
|
DB -1, 0, 32, 28 ; 24
|
|
DB 1, 0, 36, 36 ; 28
|
|
DB 2, 0, 36, 36 ; 32
|
|
DB 0, -1, 44, 40 ; 36
|
|
DB 0, 1, 0, 0 ; 40
|
|
DB 0, 2, 0, 0 ; 44
|
|
|
|
DB 48 DUP (?) ; Additional space for more states, if needed.
|
|
|
|
DB 64 DUP (?) ; Low 12 bits match those of heavily used part of stack frame.
|
|
|
|
InterpFutureRef LABEL BYTE ; Map FPEL+FPEL to its average. If one FPEL is out
|
|
; of range (255), map FPEL+FPEL to 255.
|
|
CNT = 0
|
|
REPEAT 127
|
|
DB CNT,CNT
|
|
CNT = CNT + 1
|
|
ENDM
|
|
DB 127
|
|
DB 257 DUP (255)
|
|
|
|
DB 1472 DUP (?) ; Low 12 bits match those of heavily used part of stack frame.
|
|
|
|
Interp2PastAndFutureRef LABEL BYTE ; Map PPEL+PPEL+FPELavg*2 to 2*average. If
|
|
; FPELavg out of range, map PPEL+PPEL to
|
|
; -(PPEL+PPEL).
|
|
CNT = 0
|
|
REPEAT 255
|
|
DB CNT,CNT
|
|
CNT = CNT - 1
|
|
ENDM
|
|
InterpPastAndFutureRef LABEL BYTE ; Map PPEL+FPELavg to 2*average. If
|
|
; FPELavg out of range, map PPEL to
|
|
; 2(PPEL).
|
|
CNT = 0
|
|
REPEAT 255
|
|
DB CNT
|
|
CNT = CNT + 1
|
|
ENDM
|
|
CNT = 0
|
|
REPEAT 128
|
|
DB CNT
|
|
CNT = CNT + 2
|
|
ENDM
|
|
DB ?,?,?
|
|
|
|
DB 255
|
|
WeightedDiff LABEL BYTE ; Label placed here because negative pel value is
|
|
; not sign extended, so we need to subtract 256.
|
|
DB 191 DUP (255)
|
|
DB 255,250,243,237,231,225,219,213,207,201,195,189,184,178,172,167
|
|
DB 162,156,151,146,141,135,130,126,121,116,111,107,102, 97, 93, 89
|
|
DB 84, 80, 76, 72, 68, 64, 61, 57, 53, 50, 46, 43, 40, 37, 34, 31
|
|
DB 28, 25, 22, 20, 18, 15, 13, 11, 9, 7, 6, 4, 3, 2, 1, 0
|
|
DB 0
|
|
DB 0, 1, 2, 3, 4, 6, 7, 9, 11, 13, 15, 18, 20, 22, 25, 28
|
|
DB 31, 34, 37, 40, 43, 46, 50, 53, 57, 61, 64, 68, 72, 76, 80, 84
|
|
DB 89, 93, 97,102,107,111,116,121,126,130,135,141,146,151,156,162
|
|
DB 167,172,178,184,189,195,201,207,213,219,225,231,237,243,250,255
|
|
DB 191 DUP (255)
|
|
|
|
.CODE
|
|
|
|
ASSUME cs : FLAT
|
|
ASSUME ds : FLAT
|
|
ASSUME es : FLAT
|
|
ASSUME fs : FLAT
|
|
ASSUME gs : FLAT
|
|
ASSUME ss : FLAT
|
|
|
|
BFRAMEMOTIONESTIMATION proc C AMBAS: DWORD,
|
|
ATFBA: DWORD,
|
|
APFBA: DWORD,
|
|
AFFBA: DWORD,
|
|
AWFM: DWORD,
|
|
AWBM: DWORD,
|
|
AZVT: DWORD,
|
|
ANZMVD:DWORD,
|
|
AEBT: DWORD,
|
|
ASWDT: DWORD,
|
|
ASWDB: DWORD
|
|
|
|
RegisterStorageSize = 16
|
|
|
|
; Arguments:
|
|
|
|
MBlockActionStream_arg = RegisterStorageSize + 4
|
|
TargetFrameBaseAddress_arg = RegisterStorageSize + 8
|
|
PreviousFrameBaseAddress_arg = RegisterStorageSize + 12
|
|
FutureFrameBaseAddress_arg = RegisterStorageSize + 16
|
|
WeightForwardMotion_arg = RegisterStorageSize + 20
|
|
WeightBackwardMotion_arg = RegisterStorageSize + 24
|
|
ZeroVectorThreshold_arg = RegisterStorageSize + 28
|
|
NonZeroMVDifferential_arg = RegisterStorageSize + 32
|
|
EmptyBlockThreshold_arg = RegisterStorageSize + 36
|
|
InterSWDTotal_arg = RegisterStorageSize + 40
|
|
InterSWDBlocks_arg = RegisterStorageSize + 44
|
|
EndOfArgList = RegisterStorageSize + 48
|
|
|
|
; Locals (on local stack frame)
|
|
|
|
; 0 thru 415 are Target MV scratch structure, described above, with room for
|
|
; 7 DWORDs of local variables.
|
|
|
|
Block1 EQU [esp+ 0]
|
|
Block2 EQU [esp+ 16]
|
|
Block3 EQU [esp+ 32]
|
|
Block4 EQU [esp+ 48]
|
|
BlockN EQU [esp+ 64]
|
|
BlockNM1 EQU [esp+ 48]
|
|
|
|
TargetBlock EQU 160
|
|
FutureRefBlockAddr EQU 0
|
|
PastRefBlockAddr EQU 4
|
|
FutureRefInterpOffset EQU FutureRefBlockAddr
|
|
CandidateSWDAccum EQU 64
|
|
BestSWDAccum EQU 68
|
|
SWD0MVAccum EQU 72
|
|
TransferCase EQU 76
|
|
TPITCH EQU 64
|
|
|
|
; 416 thru 479 and 640 thru 703 for weighting motion vectors.
|
|
|
|
WeightForwardMotion EQU [esp+ 416] ; 32 bytes at 416 for positive MV; 32
|
|
; bytes at 640 for negative MV.
|
|
WeightBackwardMotion EQU [esp+ 448] ; 32 bytes at 448 for positive MV; 32
|
|
; bytes at 672 for negative MV.
|
|
|
|
; 480 thru 543 are stack storage for more local variables.
|
|
; 128:131, 136:159, and 480:543 are available for local variables.
|
|
|
|
TargetFrameBaseAddress EQU [esp+ 128]
|
|
PreviousFrameBaseAddress EQU [esp+ 136]
|
|
FutureFrameBaseAddress EQU [esp+ 140]
|
|
MBlockActionStream EQU [esp+ 144]
|
|
ZeroVectorThreshold EQU [esp+ 148]
|
|
NonZeroMVDifferential EQU [esp+ 152]
|
|
EmptyBlockThreshold EQU [esp+ 156]
|
|
InterSWDTotal EQU [esp+ 480]
|
|
InterSWDBlocks EQU [esp+ 484]
|
|
StashESP EQU [esp+ 488]
|
|
PastMBAddr EQU [esp+ 492]
|
|
CurrSWDState EQU [esp+ 496]
|
|
CandidateMV EQU [esp+ 500]
|
|
BestMV EQU [esp+ 504]
|
|
BlkY1_0deltaBiDiMVs EQU [esp+ 508]
|
|
BlkY2_0deltaBiDiMVs EQU [esp+ 512]
|
|
BlkY3_0deltaBiDiMVs EQU [esp+ 516]
|
|
BlkY4_0deltaBiDiMVs EQU [esp+ 520]
|
|
FirstTransferCase EQU [esp+ 524]
|
|
|
|
; 544: 639 is for static data, namely the state engine rules.
|
|
; 640 thru 703, as stated above is for weighting motion vectors.
|
|
|
|
; 704 thru 1215 hit static data structure to interpolate 2 future pels.
|
|
|
|
; Future Reference Area also starts at 704 on stack, but collision at 704
|
|
; thru 1215 will occur very infrequently. Future Reference Area continues
|
|
; thru 3135. 2688 thru 3135 collide with the static data structure to
|
|
; interpolate between past and future pels, but that portion of the Future
|
|
; Reference Area is rarely accessed. 3136 thru 3583 continue that static
|
|
; structure. 3584 thru 4095 have the static structure to look up the
|
|
; weighted difference for a target pel and it's prediction.
|
|
|
|
FutureRefArea EQU [esp+ 704]
|
|
FutureBlock EQU [esp+1608]
|
|
FPITCH EQU 40
|
|
|
|
|
|
push esi
|
|
push edi
|
|
push ebp
|
|
push ebx
|
|
|
|
; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
|
|
|
|
mov esi,esp
|
|
sub esp,000001000H
|
|
mov ebx, [esp]
|
|
sub esp,000001000H
|
|
and esp,0FFFFF000H
|
|
mov ebx,OFFSET LocalStorage+63
|
|
and ebx,000000FC0H
|
|
mov edx,PD [esi+MBlockActionStream_arg]
|
|
or esp,ebx
|
|
mov eax,PD [esi+TargetFrameBaseAddress_arg]
|
|
mov TargetFrameBaseAddress,eax
|
|
mov eax,PD [esi+PreviousFrameBaseAddress_arg]
|
|
mov PreviousFrameBaseAddress,eax
|
|
mov eax,PD [esi+FutureFrameBaseAddress_arg]
|
|
mov FutureFrameBaseAddress,eax
|
|
mov eax,PD [esi+EmptyBlockThreshold_arg]
|
|
mov EmptyBlockThreshold,eax
|
|
mov eax,PD [esi+ZeroVectorThreshold_arg]
|
|
mov ZeroVectorThreshold,eax
|
|
mov eax,PD [esi+NonZeroMVDifferential_arg]
|
|
mov NonZeroMVDifferential,eax
|
|
mov ebx,3116
|
|
@@:
|
|
mov [esp+ebx],0FFFFFFFFH
|
|
sub ebx,4
|
|
cmp ebx,688
|
|
jae @b
|
|
xor ebx,ebx
|
|
mov StashESP,esi
|
|
mov edi,[esi+WeightForwardMotion_arg]
|
|
mov esi,[esi+WeightBackwardMotion_arg]
|
|
mov InterSWDBlocks,ebx
|
|
mov InterSWDTotal,ebx
|
|
mov eax,[edi]
|
|
mov ebx,[edi+4]
|
|
mov ecx,03F3F3F3FH
|
|
mov ebp,060606060H
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+224,eax
|
|
mov WeightForwardMotion+228,ebx
|
|
mov eax,[edi+8]
|
|
mov ebx,[edi+12]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+232,eax
|
|
mov WeightForwardMotion+236,ebx
|
|
mov eax,[edi+16]
|
|
mov ebx,[edi+20]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+240,eax
|
|
mov WeightForwardMotion+244,ebx
|
|
mov eax,[edi+24]
|
|
mov ebx,[edi+28]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+248,eax
|
|
mov WeightForwardMotion+252,ebx
|
|
mov eax,[edi+32]
|
|
mov ebx,[edi+36]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+0,eax
|
|
mov WeightForwardMotion+4,ebx
|
|
mov eax,[edi+40]
|
|
mov ebx,[edi+44]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+8,eax
|
|
mov WeightForwardMotion+12,ebx
|
|
mov eax,[edi+48]
|
|
mov ebx,[edi+52]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+16,eax
|
|
mov WeightForwardMotion+20,ebx
|
|
mov eax,[edi+56]
|
|
mov ebx,[edi+60]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightForwardMotion+24,eax
|
|
mov WeightForwardMotion+28,ebx
|
|
mov eax,[esi]
|
|
mov ebx,[esi+4]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+224,eax
|
|
mov WeightBackwardMotion+228,ebx
|
|
mov eax,[esi+8]
|
|
mov ebx,[esi+12]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+232,eax
|
|
mov WeightBackwardMotion+236,ebx
|
|
mov eax,[esi+16]
|
|
mov ebx,[esi+20]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+240,eax
|
|
mov WeightBackwardMotion+244,ebx
|
|
mov eax,[esi+24]
|
|
mov ebx,[esi+28]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+248,eax
|
|
mov WeightBackwardMotion+252,ebx
|
|
mov eax,[esi+32]
|
|
mov ebx,[esi+36]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+0,eax
|
|
mov WeightBackwardMotion+4,ebx
|
|
mov eax,[esi+40]
|
|
mov ebx,[esi+44]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+8,eax
|
|
mov WeightBackwardMotion+12,ebx
|
|
mov eax,[esi+48]
|
|
mov ebx,[esi+52]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+16,eax
|
|
mov WeightBackwardMotion+20,ebx
|
|
mov eax,[esi+56]
|
|
mov ebx,[esi+60]
|
|
and eax,ecx
|
|
and ebx,ecx
|
|
xor eax,ebp
|
|
xor ebx,ebp
|
|
mov WeightBackwardMotion+24,eax
|
|
mov WeightBackwardMotion+28,ebx
|
|
jmp FirstMacroBlock
|
|
|
|
ALIGN 16
|
|
|
|
NextMacroBlock:
|
|
|
|
mov bl,[edx].CodedBlocks
|
|
add edx,SIZEOF T_MacroBlockActionDescr
|
|
and ebx,000000040H ; Check for end-of-stream
|
|
jne Done
|
|
|
|
FirstMacroBlock:
|
|
|
|
mov esi,[edx].BlkY1.BlkOffset ; Get address of next macroblock to do.
|
|
mov edi,TargetFrameBaseAddress
|
|
mov eax,FutureFrameBaseAddress
|
|
mov ebp,PreviousFrameBaseAddress
|
|
lea edi,[esi+edi+PITCH*3]
|
|
mov MBlockActionStream,edx ; Stash list ptr.
|
|
add ebp,esi
|
|
lea esi,[esi+eax+PITCH*15]
|
|
mov PastMBAddr,ebp ; Stash addr of past MB w/ zero motion.
|
|
mov ecx,FPITCH*15
|
|
mov ebp,PITCH
|
|
xor eax,eax
|
|
|
|
|
|
@@: ; Copy future reference to scratch area
|
|
; that is surrounded by "255" so we can
|
|
; handle access to this surrounding area
|
|
; as the future ref falls out of the MB.
|
|
mov eax,[esi]
|
|
mov ebx,[esi+4]
|
|
mov FutureBlock[ecx],eax
|
|
mov FutureBlock[ecx+4],ebx
|
|
mov eax,[esi+8]
|
|
mov ebx,[esi+12]
|
|
mov FutureBlock[ecx+8],eax
|
|
mov FutureBlock[ecx+12],ebx
|
|
sub esi,ebp
|
|
sub ecx,FPITCH
|
|
lea edx,Block1.TargetBlock
|
|
jge @b
|
|
|
|
sar ecx,31
|
|
lea ebx,Block1.TargetBlock+TPITCH*3
|
|
|
|
@@: ; Copy target macroblock to scratch area
|
|
; so that we can pick up the target points
|
|
; from a static offset added to esp.
|
|
mov eax,[edi]
|
|
mov esi,[edi+8]
|
|
add eax,eax
|
|
add esi,esi
|
|
xor eax,ecx
|
|
xor esi,ecx
|
|
mov [ebx],eax
|
|
mov [ebx+16],esi
|
|
mov eax,[edi+ebp*4]
|
|
mov esi,[edi+ebp*4+8]
|
|
add eax,eax
|
|
add esi,esi
|
|
xor eax,ecx
|
|
xor esi,ecx
|
|
mov [ebx+8],eax
|
|
mov [ebx+24],esi
|
|
mov eax,[edi+ebp*8]
|
|
mov esi,[edi+ebp*8+8]
|
|
add eax,eax
|
|
add esi,esi
|
|
xor eax,ecx
|
|
xor esi,ecx
|
|
mov [ebx+32],eax
|
|
mov [ebx+48],esi
|
|
mov eax,[edi+PITCH*12]
|
|
mov esi,[edi+PITCH*12+8]
|
|
add eax,eax
|
|
add esi,esi
|
|
xor eax,ecx
|
|
xor esi,ecx
|
|
mov [ebx+40],eax
|
|
mov [ebx+56],esi
|
|
sub edi,ebp
|
|
sub ebx,TPITCH
|
|
cmp ebx,edx
|
|
jge @b
|
|
|
|
mov eax,16
|
|
lea edi,[edi+ebp*4+4]
|
|
test edi,4
|
|
lea ebx,Block1.TargetBlock+TPITCH*3+4
|
|
jne @b
|
|
|
|
mov edx,MBlockActionStream
|
|
xor ebx,ebx
|
|
mov Block4.TransferCase,eax ; After block 4, transfer to done 0-MV.
|
|
xor ecx,ecx
|
|
mov bl,[edx].BlkY4.PVMV
|
|
mov esi,PastMBAddr
|
|
mov al,[edx].BlkY4.PHMV
|
|
xor ebp,ebp
|
|
mov bl,WeightForwardMotion[ebx]
|
|
mov [edx].BlkY4.BestVMVf,bl
|
|
sar ebx,1 ; CF == 1 if past vert is at half pel.
|
|
mov cl,WeightForwardMotion[eax]
|
|
adc ebp,ebp ; ebp == 1 if past vert is at half pel.
|
|
mov [edx].BlkY4.BestHMVf,cl
|
|
sar ecx,1 ; CF == 1 if past horz is at half pel.
|
|
IF PITCH-384
|
|
**** The magic leaks out if PITCH != 384
|
|
ENDIF
|
|
lea edi,[ebx+ebx*2] ; Multiply vertical component by PITCH.
|
|
adc ebp,ebp ; ebp odd if past horz is at half pel.
|
|
mov bl,[edx].BlkY4.PVMV
|
|
shl edi,7
|
|
lea esi,[esi+ecx-48-48*PITCH+PITCH*8+8]; Add horz full pel disp to ref addr.
|
|
add esi,edi ; Add vert full pel disp to past ref addr.
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
mov [edx].BlkY4.BestVMVb,bl
|
|
mov Block4.PastRefBlockAddr,esi ; Stash address of ref block from past.
|
|
sar ebx,1 ; CF == 1 if future vert is at half pel.
|
|
mov al,WeightBackwardMotion[eax]
|
|
adc ebp,ebp ; ebp odd if future vert is at half pel.
|
|
mov [edx].BlkY4.BestHMVb,al
|
|
sar eax,1 ; CF == 1 if future horz is at half pel.
|
|
IF FPITCH-40
|
|
**** The magic leaks out if FPITCH != 40
|
|
ENDIF
|
|
lea edi,[ebx+ebx*4] ; Multiply vertical component by FPITCH.
|
|
adc ebp,ebp ; ebp odd if future horz is at half pel.
|
|
lea esi,FutureBlock+80-48-48*FPITCH+FPITCH*8+8
|
|
lea edi,[eax+edi*8] ; Linearized MV for future ref.
|
|
mov Block3.TransferCase,ebp ; Stash case to do after block 3.
|
|
add esi,edi
|
|
mov bl,[edx].BlkY3.PVMV
|
|
mov Block4.FutureRefBlockAddr,esi ; Stash address of ref block from future.
|
|
mov al,[edx].BlkY3.PHMV
|
|
mov esi,PastMBAddr
|
|
mov bl,WeightForwardMotion[ebx]
|
|
mov [edx].BlkY3.BestVMVf,bl
|
|
xor ebp,ebp
|
|
sar ebx,1
|
|
mov cl,WeightForwardMotion[eax]
|
|
adc ebp,ebp
|
|
mov [edx].BlkY3.BestHMVf,cl
|
|
sar ecx,1
|
|
lea edi,[ebx+ebx*2]
|
|
adc ebp,ebp
|
|
mov bl,[edx].BlkY3.PVMV
|
|
shl edi,7
|
|
lea esi,[esi+ecx-48-48*PITCH+PITCH*8]
|
|
add esi,edi
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
mov [edx].BlkY3.BestVMVb,bl
|
|
mov Block3.PastRefBlockAddr,esi
|
|
sar ebx,1
|
|
mov al,WeightBackwardMotion[eax]
|
|
adc ebp,ebp
|
|
mov [edx].BlkY3.BestHMVb,al
|
|
sar eax,1
|
|
lea edi,[ebx+ebx*4]
|
|
adc ebp,ebp
|
|
lea esi,FutureBlock+80-48-48*FPITCH+FPITCH*8
|
|
lea edi,[eax+edi*8]
|
|
mov Block2.TransferCase,ebp
|
|
add esi,edi
|
|
mov bl,[edx].BlkY2.PVMV
|
|
mov Block3.FutureRefBlockAddr,esi
|
|
mov al,[edx].BlkY2.PHMV
|
|
mov esi,PastMBAddr
|
|
mov bl,WeightForwardMotion[ebx]
|
|
mov [edx].BlkY2.BestVMVf,bl
|
|
xor ebp,ebp
|
|
sar ebx,1
|
|
mov cl,WeightForwardMotion[eax]
|
|
adc ebp,ebp
|
|
mov [edx].BlkY2.BestHMVf,cl
|
|
sar ecx,1
|
|
lea edi,[ebx+ebx*2]
|
|
adc ebp,ebp
|
|
mov bl,[edx].BlkY2.PVMV
|
|
shl edi,7
|
|
lea esi,[esi+ecx-48-48*PITCH+8]
|
|
add esi,edi
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
mov [edx].BlkY2.BestVMVb,bl
|
|
mov Block2.PastRefBlockAddr,esi
|
|
sar ebx,1
|
|
mov al,WeightBackwardMotion[eax]
|
|
adc ebp,ebp
|
|
mov [edx].BlkY2.BestHMVb,al
|
|
sar eax,1
|
|
lea edi,[ebx+ebx*4]
|
|
adc ebp,ebp
|
|
lea esi,FutureBlock+80-48-48*FPITCH+8
|
|
lea edi,[eax+edi*8]
|
|
mov Block1.TransferCase,ebp
|
|
add esi,edi
|
|
mov bl,[edx].BlkY1.PVMV
|
|
mov Block2.FutureRefBlockAddr,esi
|
|
mov al,[edx].BlkY1.PHMV
|
|
mov esi,PastMBAddr
|
|
mov bl,WeightForwardMotion[ebx]
|
|
mov [edx].BlkY1.BestVMVf,bl
|
|
xor ebp,ebp
|
|
sar ebx,1
|
|
mov cl,WeightForwardMotion[eax]
|
|
adc ebp,ebp
|
|
mov [edx].BlkY1.BestHMVf,cl
|
|
sar ecx,1
|
|
lea edi,[ebx+ebx*2]
|
|
adc ebp,ebp
|
|
mov bl,[edx].BlkY1.PVMV
|
|
shl edi,7
|
|
lea esi,[esi+ecx-48-48*PITCH]
|
|
add esi,edi
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
mov [edx].BlkY1.BestVMVb,bl
|
|
mov Block1.PastRefBlockAddr,esi
|
|
sar ebx,1
|
|
mov al,WeightBackwardMotion[eax]
|
|
adc ebp,ebp
|
|
mov [edx].BlkY1.BestHMVb,al
|
|
sar eax,1
|
|
lea ecx,[ebx+ebx*4]
|
|
adc ebp,ebp
|
|
lea edi,FutureBlock+80-48-48*FPITCH
|
|
lea ecx,[eax+ecx*8]
|
|
mov eax,ebp
|
|
add edi,ecx
|
|
mov ebp,00BADBEEFH
|
|
mov Block1.BestSWDAccum,ebp
|
|
mov Block2.BestSWDAccum,ebp
|
|
mov Block3.BestSWDAccum,ebp
|
|
mov Block4.BestSWDAccum,ebp
|
|
mov BlockN.BestSWDAccum,ebp
|
|
xor ebp,ebp
|
|
sub esp,64
|
|
|
|
jmp PD JumpTable[eax*4]
|
|
|
|
ZeroVectorSWDDone:
|
|
|
|
mov eax,ZeroVectorThreshold
|
|
mov ebx,Block2.CandidateSWDAccum
|
|
cmp eax,ebp
|
|
mov edi,Block1.CandidateSWDAccum
|
|
mov ecx,Block3.CandidateSWDAccum
|
|
mov Block1.BestSWDAccum,edi
|
|
mov Block2.BestSWDAccum,ebx
|
|
mov Block3.BestSWDAccum,ecx
|
|
mov Block4.BestSWDAccum,ebp
|
|
mov eax,0 ; Set best MV to zero.
|
|
mov esi,MBlockActionStream
|
|
jge BelowZeroThreshold
|
|
|
|
mov Block1.SWD0MVAccum,edi
|
|
mov Block2.SWD0MVAccum,ebx
|
|
mov Block3.SWD0MVAccum,ecx
|
|
mov Block4.SWD0MVAccum,ebp
|
|
mov ebx,[esi].BlkY1.BestBiDiMVs
|
|
mov ecx,[esi].BlkY2.BestBiDiMVs
|
|
mov BlkY1_0deltaBiDiMVs,ebx
|
|
mov BlkY2_0deltaBiDiMVs,ecx
|
|
mov ebx,[esi].BlkY3.BestBiDiMVs
|
|
mov ecx,[esi].BlkY4.BestBiDiMVs
|
|
mov BlkY3_0deltaBiDiMVs,ebx
|
|
mov BlkY4_0deltaBiDiMVs,ecx
|
|
mov ecx,17
|
|
xor ebx,ebx ; First ME engine state is zero.
|
|
mov Block4.TransferCase,ecx ; After block 4, transfer to done non0-MV.
|
|
xor ecx,ecx
|
|
mov BlockN.BestSWDAccum,ebp
|
|
|
|
SWDLoop:
|
|
|
|
mov CurrSWDState,ebx ; Record ME engine state.
|
|
mov edx,PD SWDState[ebx] ; dl == HMV; dh == VMV offsets to try.
|
|
mov bl,[esi].BlkY4.PVMV
|
|
add dl,al ; Try this horizontal MV delta.
|
|
add dh,ah ; Try this vertical MV delta.
|
|
mov cl,[esi].BlkY4.PHMV
|
|
mov BestMV,eax ; Record what the best MV so far is.
|
|
mov CandidateMV,edx ; Record the candidate MV delta.
|
|
mov bl,WeightForwardMotion[ebx] ; TRb * VMV / TRd
|
|
xor ebp,ebp
|
|
add bl,dh ; VMVf = TRb * VMV / TRd + VMVd
|
|
mov cl,WeightForwardMotion[ecx] ; TRb * HMV / TRd
|
|
cmp bl,040H ; If too far up or down, take quick out.
|
|
jbe MVDeltaOutOfRange
|
|
|
|
mov [esi].BlkY4.CandidateVMVf,bl
|
|
add cl,dl ; HMVf = TRb * HMV / TRd + HMVd
|
|
cmp cl,040H ; If too far left or right, quick out.
|
|
jbe MVDeltaOutOfRange
|
|
|
|
sar ebx,1 ; CF == 1 if past vert is at half pel.
|
|
mov [esi].BlkY4.CandidateHMVf,cl
|
|
adc ebp,ebp ; ebp == 1 if past vert is at half pel.
|
|
mov eax,PastMBAddr
|
|
sar ecx,1 ; CF == 1 if past horz is at half pel.
|
|
IF PITCH-384
|
|
**** The magic leaks out if PITCH != 384
|
|
ENDIF
|
|
lea edi,[ebx+ebx*2] ; Multiply vertical component by PITCH.
|
|
adc ebp,ebp ; ebp odd if past horz is at half pel.
|
|
mov bl,[esi].BlkY3.PVMV
|
|
shl edi,7
|
|
mov Block3.TransferCase,ebp ; Stash case to do after block 3.
|
|
lea ebp,[eax+ecx-48-48*PITCH+PITCH*8+8] ;Add horz full pel disp to ref addr.
|
|
mov cl,[esi].BlkY3.PHMV
|
|
add edi,ebp ; Add vert full pel disp to past ref addr.
|
|
mov bl,WeightForwardMotion[ebx]
|
|
mov Block4.PastRefBlockAddr,edi ; Stash address of ref block from past.
|
|
xor ebp,ebp
|
|
add bl,dh
|
|
mov cl,WeightForwardMotion[ecx]
|
|
cmp bl,040H
|
|
jbe MVDeltaOutOfRange
|
|
|
|
mov [esi].BlkY3.CandidateVMVf,bl
|
|
add cl,dl
|
|
cmp cl,040H
|
|
jbe MVDeltaOutOfRange
|
|
|
|
sar ebx,1
|
|
mov [esi].BlkY3.CandidateHMVf,cl
|
|
adc ebp,ebp
|
|
sub eax,48+48*PITCH
|
|
sar ecx,1
|
|
lea edi,[ebx+ebx*2]
|
|
adc ebp,ebp
|
|
mov bl,[esi].BlkY2.PVMV
|
|
shl edi,7
|
|
mov Block2.TransferCase,ebp
|
|
lea ebp,[eax+ecx+PITCH*8]
|
|
mov cl,[esi].BlkY2.PHMV
|
|
add edi,ebp
|
|
mov bl,WeightForwardMotion[ebx]
|
|
mov Block3.PastRefBlockAddr,edi
|
|
xor ebp,ebp
|
|
add bl,dh
|
|
mov cl,WeightForwardMotion[ecx]
|
|
cmp bl,040H
|
|
jbe MVDeltaOutOfRange
|
|
|
|
mov [esi].BlkY2.CandidateVMVf,bl
|
|
add cl,dl
|
|
cmp cl,040H
|
|
jbe MVDeltaOutOfRange
|
|
|
|
sar ebx,1
|
|
mov [esi].BlkY2.CandidateHMVf,cl
|
|
adc ebp,ebp
|
|
sar ecx,1
|
|
lea edi,[ebx+ebx*2]
|
|
adc ebp,ebp
|
|
mov bl,[esi].BlkY1.PVMV
|
|
shl edi,7
|
|
mov Block1.TransferCase,ebp
|
|
lea ebp,[eax+ecx+8]
|
|
mov cl,[esi].BlkY1.PHMV
|
|
add edi,ebp
|
|
mov bl,WeightForwardMotion[ebx]
|
|
mov Block2.PastRefBlockAddr,edi
|
|
xor ebp,ebp
|
|
add bl,dh
|
|
mov cl,WeightForwardMotion[ecx]
|
|
cmp bl,040H
|
|
jbe MVDeltaOutOfRange
|
|
|
|
mov [esi].BlkY1.CandidateVMVf,bl
|
|
add cl,dl
|
|
cmp cl,040H
|
|
jbe MVDeltaOutOfRange
|
|
|
|
sar ebx,1
|
|
mov [esi].BlkY1.CandidateHMVf,cl
|
|
adc ebp,ebp
|
|
sar ecx,1
|
|
lea edi,[ebx+ebx*2]
|
|
adc ebp,ebp
|
|
add eax,ecx
|
|
shl edi,7
|
|
mov FirstTransferCase,ebp
|
|
add edi,eax
|
|
test dh,dh ; Is vertical component MV delta zero?
|
|
mov Block1.PastRefBlockAddr,edi
|
|
je VMVdIsZero
|
|
|
|
lea edi,FutureBlock+80-48-48*FPITCH
|
|
xor eax,eax
|
|
mov bl,[esi].BlkY4.PVMV
|
|
mov al,[esi].BlkY4.CandidateVMVf
|
|
mov ebp,Block3.TransferCase ; Reload transfer case (computed goto idx)
|
|
sub al,bl ; -VMVb = -(VMVf - VMV)
|
|
mov [esi].BlkY4.CandidateVMVb,al
|
|
mov cl,[esi].BlkY3.PVMV
|
|
sar eax,1 ; CF == 1 if future vert is at half pel.
|
|
mov bl,[esi].BlkY3.CandidateVMVf
|
|
adc ebp,ebp ; ebp odd if future vert is at half pel.
|
|
IF FPITCH-40
|
|
**** The magic leaks out if FPITCH != 40
|
|
ENDIF
|
|
mov Block3.TransferCase,ebp ; Stash case to do after block 3.
|
|
lea eax,[eax+eax*4] ; Multiply vertical component by FPITCH.
|
|
mov ebp,Block2.TransferCase
|
|
sub bl,cl
|
|
lea eax,[edi+eax*8+FPITCH*8+8] ; Addr of ref blk w/ vert MV.
|
|
mov [esi].BlkY3.CandidateVMVb,bl
|
|
sar ebx,1
|
|
mov Block4.FutureRefBlockAddr,eax ; Stash address of ref block from future.
|
|
adc ebp,ebp
|
|
mov cl,[esi].BlkY2.PVMV
|
|
mov Block2.TransferCase,ebp
|
|
lea eax,[ebx+ebx*4]
|
|
mov bl,[esi].BlkY2.CandidateVMVf
|
|
mov ebp,Block1.TransferCase
|
|
lea eax,[edi+eax*8+FPITCH*8]
|
|
sub bl,cl
|
|
mov Block3.FutureRefBlockAddr,eax
|
|
mov [esi].BlkY2.CandidateVMVb,bl
|
|
sar ebx,1
|
|
mov dh,[esi].BlkY1.PVMV
|
|
adc ebp,ebp
|
|
mov cl,[esi].BlkY1.CandidateVMVf
|
|
mov Block1.TransferCase,ebp
|
|
sub cl,dh
|
|
mov [esi].BlkY1.CandidateVMVb,cl
|
|
lea eax,[ebx+ebx*4]
|
|
sar ecx,1
|
|
mov ebp,FirstTransferCase
|
|
adc ebp,ebp
|
|
lea eax,[edi+eax*8+8]
|
|
mov Block2.FutureRefBlockAddr,eax
|
|
lea eax,[ecx+ecx*4]
|
|
mov FirstTransferCase,ebp
|
|
test dl,dl ; Is horizontal component MV delta zero?
|
|
lea edi,[edi+eax*8]
|
|
mov eax,0
|
|
mov Block1.FutureRefBlockAddr,edi
|
|
je HMVdIsZero
|
|
|
|
HMVdIsNonZero:
|
|
|
|
mov cl,[esi].BlkY4.CandidateHMVf
|
|
mov bl,[esi].BlkY4.PHMV
|
|
mov ebp,Block3.TransferCase
|
|
sub cl,bl ; -HMVb = -(HMVf - HMV)
|
|
mov [esi].BlkY4.CandidateHMVb,cl
|
|
mov edi,Block4.FutureRefBlockAddr ; Load addr of ref blk to factor in horz.
|
|
sar ecx,1 ; CF == 1 if future horz is at half pel.
|
|
mov bl,[esi].BlkY3.PHMV
|
|
adc ebp,ebp ; ebp odd if future horz is at half pel.
|
|
add edi,ecx ; Factor in HMVb.
|
|
mov Block3.TransferCase,ebp ; Stash case to do after block 3.
|
|
mov cl,[esi].BlkY3.CandidateHMVf
|
|
sub cl,bl
|
|
mov Block4.FutureRefBlockAddr,edi ; Stash address of ref block from future.
|
|
mov ebp,Block2.TransferCase
|
|
mov [esi].BlkY3.CandidateHMVb,cl
|
|
sar ecx,1
|
|
mov edi,Block3.FutureRefBlockAddr
|
|
adc ebp,ebp
|
|
add edi,ecx
|
|
mov Block2.TransferCase,ebp
|
|
mov Block3.FutureRefBlockAddr,edi
|
|
mov cl,[esi].BlkY2.CandidateHMVf
|
|
mov bl,[esi].BlkY2.PHMV
|
|
mov ebp,Block1.TransferCase
|
|
sub cl,bl
|
|
mov [esi].BlkY2.CandidateHMVb,cl
|
|
mov edi,Block2.FutureRefBlockAddr
|
|
sar ecx,1
|
|
mov bl,[esi].BlkY1.PHMV
|
|
adc ebp,ebp
|
|
add edi,ecx
|
|
mov Block1.TransferCase,ebp
|
|
mov cl,[esi].BlkY1.CandidateHMVf
|
|
sub cl,bl
|
|
mov Block2.FutureRefBlockAddr,edi
|
|
mov eax,FirstTransferCase
|
|
mov [esi].BlkY1.CandidateHMVb,cl
|
|
sar ecx,1
|
|
mov edi,Block1.FutureRefBlockAddr
|
|
adc eax,eax
|
|
add edi,ecx
|
|
mov esi,Block1.PastRefBlockAddr
|
|
sub esp,64
|
|
xor ebp,ebp
|
|
|
|
jmp PD JumpTable[eax*4]
|
|
|
|
|
|
VMVdIsZero:
|
|
|
|
mov bl,[esi].BlkY4.PVMV
|
|
mov cl,[esi].BlkY3.PVMV
|
|
mov ebp,Block3.TransferCase
|
|
mov dh,PB Block2.TransferCase
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
lea edi,FutureBlock+80-48-48*FPITCH
|
|
mov [esi].BlkY4.CandidateVMVb,bl
|
|
mov cl,WeightBackwardMotion[ecx]
|
|
sar ebx,1 ; CF == 1 if future vert is at half pel.
|
|
mov [esi].BlkY3.CandidateVMVb,cl
|
|
adc ebp,ebp ; ebp odd if future vert is at half pel.
|
|
sar ecx,1
|
|
lea eax,[ebx+ebx*4] ; Multiply vertical component by FPITCH.
|
|
adc dh,dh
|
|
mov Block3.TransferCase,ebp ; Stash case to do after block 3.
|
|
lea ebp,[edi+eax*8+FPITCH*8+8] ; Addr of ref blk w/ vert MV factored in.
|
|
lea eax,[ecx+ecx*4]
|
|
mov PB Block2.TransferCase,dh
|
|
mov Block4.FutureRefBlockAddr,ebp ; Stash address of ref block from future.
|
|
lea ebp,[edi+eax*8+FPITCH*8]
|
|
mov bl,[esi].BlkY2.PVMV
|
|
mov Block3.FutureRefBlockAddr,ebp
|
|
mov cl,[esi].BlkY1.PVMV
|
|
mov ebp,Block1.TransferCase
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
mov dh,PB FirstTransferCase
|
|
mov [esi].BlkY2.CandidateVMVb,bl
|
|
sar ebx,1
|
|
mov cl,WeightBackwardMotion[ecx]
|
|
adc ebp,ebp
|
|
mov [esi].BlkY1.CandidateVMVb,cl
|
|
sar ecx,1
|
|
lea eax,[ebx+ebx*4]
|
|
adc dh,dh
|
|
mov Block1.TransferCase,ebp
|
|
lea ebp,[edi+eax*8+8]
|
|
lea eax,[ecx+ecx*4]
|
|
mov PB FirstTransferCase,dh
|
|
mov Block2.FutureRefBlockAddr,ebp
|
|
lea ebp,[edi+eax*8]
|
|
test dl,dl
|
|
mov Block1.FutureRefBlockAddr,ebp
|
|
jne HMVdIsNonZero
|
|
|
|
HMVdIsZero:
|
|
|
|
mov bl,[esi].BlkY4.PHMV
|
|
mov cl,[esi].BlkY3.PHMV
|
|
mov ebp,Block3.TransferCase
|
|
mov edx,Block2.TransferCase
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
mov eax,Block4.FutureRefBlockAddr
|
|
mov [esi].BlkY4.CandidateHMVb,bl
|
|
mov cl,WeightBackwardMotion[ecx]
|
|
sar ebx,1 ; CF == 1 if future horz is at half pel.
|
|
mov [esi].BlkY3.CandidateHMVb,cl
|
|
adc ebp,ebp ; ebp odd if future horz is at half pel.
|
|
add eax,ebx ; Addr of ref blk w/ horz MV factored in.
|
|
sar ecx,1
|
|
mov Block3.TransferCase,ebp ; Stash case to do after block 3.
|
|
adc edx,edx
|
|
mov edi,Block3.FutureRefBlockAddr
|
|
mov Block4.FutureRefBlockAddr,eax ; Stash address of ref block from future.
|
|
add edi,ecx
|
|
mov Block2.TransferCase,edx
|
|
mov Block3.FutureRefBlockAddr,edi
|
|
mov bl,[esi].BlkY2.PHMV
|
|
mov cl,[esi].BlkY1.PHMV
|
|
mov ebp,Block1.TransferCase
|
|
mov edx,FirstTransferCase
|
|
mov bl,WeightBackwardMotion[ebx]
|
|
mov eax,Block2.FutureRefBlockAddr
|
|
mov [esi].BlkY2.CandidateHMVb,bl
|
|
mov cl,WeightBackwardMotion[ecx]
|
|
sar ebx,1 ; CF == 1 if future horz is at half pel.
|
|
mov [esi].BlkY1.CandidateHMVb,cl
|
|
adc ebp,ebp ; ebp odd if future horz is at half pel.
|
|
add eax,ebx ; Addr of ref blk w/ horz MV factored in.
|
|
sar ecx,1
|
|
mov Block1.TransferCase,ebp ; Stash case to do after block 3.
|
|
adc edx,edx
|
|
mov edi,Block1.FutureRefBlockAddr
|
|
mov Block2.FutureRefBlockAddr,eax ; Stash address of ref block from future.
|
|
add edi,ecx
|
|
mov esi,Block1.PastRefBlockAddr
|
|
sub esp,64
|
|
xor ebp,ebp
|
|
|
|
jmp PD JumpTable[edx*4]
|
|
|
|
|
|
MVDeltaOutOfRange:
|
|
|
|
xor ebp,ebp
|
|
mov ebx,CurrSWDState ; Restore ME engine state.
|
|
jmp OutOfRangeHandlingDone
|
|
|
|
TakeEarlyOut:
|
|
|
|
sub esp,4
|
|
xor ecx,ecx
|
|
and esp,0FFFFFFC0H
|
|
mov ebx,CurrSWDState+64
|
|
mov esi,MBlockActionStream+64
|
|
mov eax,BestMV+64
|
|
add esp,64
|
|
mov bl,SWDState[ebx+3]
|
|
test bl,bl
|
|
jne SWDLoop
|
|
|
|
mov ecx,Block4.SWD0MVAccum
|
|
mov ebp,Block4.BestSWDAccum
|
|
jmp CandidatesDone
|
|
|
|
NonZeroVectorSWDDone:
|
|
|
|
mov ebx,CurrSWDState
|
|
mov esi,MBlockActionStream
|
|
xor ecx,ecx
|
|
mov ebp,-1
|
|
mov eax,[esi].BlkY1.CandidateBiDiMVs
|
|
mov edx,[esi].BlkY2.CandidateBiDiMVs
|
|
mov [esi].BlkY1.BestBiDiMVs,eax
|
|
mov [esi].BlkY2.BestBiDiMVs,edx
|
|
mov eax,[esi].BlkY3.CandidateBiDiMVs
|
|
mov edx,[esi].BlkY4.CandidateBiDiMVs
|
|
mov [esi].BlkY3.BestBiDiMVs,eax
|
|
mov [esi].BlkY4.BestBiDiMVs,edx
|
|
mov eax,Block1.CandidateSWDAccum
|
|
mov edx,Block2.CandidateSWDAccum
|
|
mov Block1.BestSWDAccum,eax
|
|
mov Block2.BestSWDAccum,edx
|
|
mov eax,Block3.CandidateSWDAccum
|
|
mov edx,Block4.CandidateSWDAccum
|
|
mov Block3.BestSWDAccum,eax
|
|
mov Block4.BestSWDAccum,edx
|
|
mov BlockN.BestSWDAccum,edx
|
|
|
|
OutOfRangeHandlingDone:
|
|
|
|
mov bl,SWDState[ebx+ebp*1+3]
|
|
mov eax,BestMV[ebp*4]
|
|
test bl,bl
|
|
jne SWDLoop
|
|
|
|
mov ecx,Block4.SWD0MVAccum
|
|
mov ebp,Block4.BestSWDAccum
|
|
|
|
CandidatesDone:
|
|
|
|
sub ecx,ebp
|
|
mov ebx,NonZeroMVDifferential
|
|
cmp ecx,ebx
|
|
jge ZeroMVNotGoodEnough
|
|
|
|
ZeroMVGoodEnough:
|
|
|
|
xor eax,eax
|
|
mov esi,MBlockActionStream
|
|
mov edi,Block1.SWD0MVAccum
|
|
mov ebx,Block2.SWD0MVAccum
|
|
mov ecx,Block3.SWD0MVAccum
|
|
mov ebp,Block4.SWD0MVAccum
|
|
mov Block1.BestSWDAccum,edi
|
|
mov Block2.BestSWDAccum,ebx
|
|
mov Block3.BestSWDAccum,ecx
|
|
mov Block4.BestSWDAccum,ebp
|
|
mov ebx,BlkY1_0deltaBiDiMVs
|
|
mov edi,BlkY2_0deltaBiDiMVs
|
|
mov [esi].BlkY1.BestBiDiMVs,ebx
|
|
mov [esi].BlkY2.BestBiDiMVs,edi
|
|
mov ebx,BlkY3_0deltaBiDiMVs
|
|
mov edi,BlkY4_0deltaBiDiMVs
|
|
mov [esi].BlkY3.BestBiDiMVs,ebx
|
|
mov [esi].BlkY4.BestBiDiMVs,edi
|
|
|
|
BelowZeroThreshold:
|
|
ZeroMVNotGoodEnough:
|
|
|
|
mov [esi].BlkY1.BHMV,al
|
|
mov [esi].BlkY2.BHMV,al
|
|
mov [esi].BlkY3.BHMV,al
|
|
mov [esi].BlkY4.BHMV,al
|
|
mov [esi].BlkY1.BVMV,ah
|
|
mov [esi].BlkY2.BVMV,ah
|
|
mov [esi].BlkY3.BVMV,ah
|
|
mov [esi].BlkY4.BVMV,ah
|
|
mov al,[esi].CodedBlocksB ; Fetch coded block pattern.
|
|
mov edi,EmptyBlockThreshold ; Get threshold for forcing block empty?
|
|
or al,03FH ; Initially set all blocks coded.
|
|
mov ecx,Block3.BestSWDAccum
|
|
mov ebx,InterSWDBlocks
|
|
mov edx,ebp
|
|
sub edx,ecx ; Get SWD for block 4.
|
|
cmp edx,edi ; Is it below empty threshold?
|
|
jg @f
|
|
|
|
and al,0F7H ; If so, indicate block 4 is NOT coded.
|
|
dec ebx
|
|
sub ebp,edx
|
|
|
|
@@:
|
|
|
|
mov edx,Block2.BestSWDAccum
|
|
sub ecx,edx
|
|
cmp ecx,edi
|
|
jg @f
|
|
|
|
and al,0FBH
|
|
dec ebx
|
|
sub ebp,ecx
|
|
|
|
@@:
|
|
|
|
mov ecx,Block1.BestSWDAccum
|
|
sub edx,ecx
|
|
cmp edx,edi
|
|
jg @f
|
|
|
|
and al,0FDH
|
|
dec ebx
|
|
sub ebp,edx
|
|
|
|
@@:
|
|
|
|
mov edx,InterSWDTotal
|
|
cmp ecx,edi
|
|
jg @f
|
|
|
|
and al,0FEH
|
|
dec ebx
|
|
sub ebp,ecx
|
|
|
|
@@:
|
|
|
|
mov [esi].CodedBlocksB,al ; Store coded block pattern.
|
|
add ebx,4
|
|
mov InterSWDBlocks,ebx
|
|
add edx,ebp ; Add to total for this macroblock class.
|
|
mov InterSWDTotal,edx
|
|
mov edx,esi
|
|
mov PD [esi].SWDB,ebp
|
|
jmp NextMacroBlock
|
|
|
|
|
|
|
|
|
|
BiDiNoInterp:
|
|
|
|
; esp -- Pointer to block of target macroblock.
|
|
; ebp -- SWD accumulator. Must be initialized by caller.
|
|
; esi -- Pointer to block of reference in past frame.
|
|
; edi -- Pointer to block of reference in future frame + 80.
|
|
; al, bl, cl, dl -- Scratch.
|
|
|
|
xor eax,eax
|
|
xor ebx,ebx
|
|
mov al,[edi-80] ; 00A Fetch pel from future ref.
|
|
mov bl,[esi] ; 00B Fetch pel from previous ref.
|
|
xor ecx,ecx
|
|
xor edx,edx
|
|
@@:
|
|
mov al,InterpPastAndFutureRef[eax+ebx] ; 00C (past+future) or 2*past
|
|
mov bl,BlockN.TargetBlock[0] ; 00D Fetch -2 * target pel.
|
|
mov cl,[edi+FPITCH*2+2-80] ; 22A
|
|
mov dl,[esi+PITCH*2+2] ; 22B
|
|
mov bl,WeightedDiff[ebx+eax] ; 00E Weighted difference.
|
|
mov cl,InterpPastAndFutureRef[ecx+edx] ; 22C
|
|
add ebp,ebx ; 00F Accumulate weighted difference.
|
|
mov dl,BlockN.TargetBlock[TPITCH*2+2] ; 22D
|
|
mov al,[esi+PITCH*0+2] ; 02a Fetch pel from previous ref.
|
|
mov bl,BlockN.TargetBlock[TPITCH*0+2] ; 02b Fetch -2 * target pel.
|
|
mov dl,WeightedDiff[edx+ecx] ; 22E
|
|
mov cl,[esi+PITCH*2+0] ; 20a
|
|
add ebp,edx ; 22F
|
|
mov dl,BlockN.TargetBlock[TPITCH*2+0] ; 20b
|
|
mov bl,WeightedDiff[ebx+eax*2] ; 02c Weighted difference.
|
|
mov al,[esi+PITCH*1+1] ; 11a
|
|
add ebp,ebx ; 02d Accumulate weighted difference.
|
|
mov bl,BlockN.TargetBlock[TPITCH*1+1] ; 11b
|
|
mov dl,WeightedDiff[edx+ecx*2] ; 20c
|
|
mov cl,[esi+PITCH*1+3] ; 13a
|
|
add ebp,edx ; 20d
|
|
mov dl,BlockN.TargetBlock[TPITCH*1+3] ; 13b
|
|
mov bl,WeightedDiff[ebx+eax*2] ; 11c
|
|
mov al,[esi+PITCH*3+1] ; 31a
|
|
add ebp,ebx ; 11d
|
|
mov bl,BlockN.TargetBlock[TPITCH*3+1] ; 31b
|
|
mov dl,WeightedDiff[edx+ecx*2] ; 13c
|
|
mov cl,[esi+PITCH*3+3] ; 33a
|
|
add ebp,edx ; 13d
|
|
mov dl,BlockN.TargetBlock[TPITCH*3+3] ; 33b
|
|
mov bl,WeightedDiff[ebx+eax*2] ; 31c
|
|
add edi,4 ; Move to next 4 columns.
|
|
add ebp,ebx ; 31d
|
|
mov dl,WeightedDiff[edx+ecx*2] ; 33c
|
|
add ebp,edx ; 33d
|
|
add esi,4 ; Move to next 4 columns.
|
|
add esp,4 ; Move to next 4 columns.
|
|
mov al,[edi-80] ; 04A
|
|
mov bl,[esi] ; 04B
|
|
mov cl,4
|
|
and ecx,esp ; Twice, 4 cols each time.
|
|
jne @b
|
|
|
|
mov al,[edi-80+FPITCH*4-8] ; 40A
|
|
add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
mov cl,8
|
|
add edi,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
and ecx,esp ; Twice, 4 rows each time.
|
|
mov bl,[esi] ; 40B
|
|
jne @b
|
|
|
|
mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
|
|
mov eax,BlockN.BestSWDAccum
|
|
cmp ebp,eax
|
|
jg TakeEarlyOut
|
|
|
|
mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
|
|
mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
|
|
mov edi,BlockN.FutureRefBlockAddr ; Fetch next past ref address.
|
|
jmp PD JumpTable[eax*4]
|
|
|
|
|
|
BiDiFutureHorz LABEL DWORD
|
|
mov edx,1
|
|
xor ecx,ecx
|
|
jmp BiDiSWDCalc_InterpFuture
|
|
|
|
BiDiFutureVert:
|
|
mov edx,FPITCH
|
|
xor ecx,ecx
|
|
jmp BiDiSWDCalc_InterpFuture
|
|
|
|
BiDiFutureBoth:
|
|
mov edx,FPITCH+1
|
|
xor ecx,ecx
|
|
|
|
; esp -- Pointer to block of target macroblock.
|
|
; ebp -- SWD accumulator. Must be initialized by caller.
|
|
; esi -- Pointer to block of reference in past frame.
|
|
; edi -- Pointer to block of reference in future frame + 80.
|
|
; edx -- Distance from future pel to other future pel with which to interp.
|
|
; al, bl, cl, dl -- Scratch.
|
|
|
|
BiDiSWDCalc_InterpFuture:
|
|
|
|
mov al,[edi-80] ; 00A Fetch pel from future ref.
|
|
xor ebx,ebx
|
|
@@:
|
|
|
|
mov bl,[edi+edx-80] ; 00B Fetch other future ref pel.
|
|
and eax,0000000FFH
|
|
mov BlockN.FutureRefInterpOffset,edx ; Stash interp offset.
|
|
mov dl,[edi+edx+FPITCH*2+2-80] ; 22B
|
|
mov al,InterpFutureRef[eax+ebx] ; 00C Get interpolated future ref.
|
|
mov bl,[esi] ; 00D Fetch pel from previous ref.
|
|
mov cl,[edi+FPITCH*2+2-80] ; 22A
|
|
and edx,0000000FFH ; Extract pel value.
|
|
mov al,InterpPastAndFutureRef[eax+ebx] ; 00E (past+future) or 2*past
|
|
mov bl,BlockN.TargetBlock[0] ; 00F Fetch -2 * target pel.
|
|
mov cl,InterpFutureRef[ecx+edx] ; 22C
|
|
mov dl,[esi+PITCH*2+2] ; 22D
|
|
mov bl,WeightedDiff[ebx+eax] ; 00G Weighted difference.
|
|
mov al,[esi+PITCH*0+2] ; 02a Fetch pel from previous ref.
|
|
add ebp,ebx ; 00H Accumulate weighted difference.
|
|
mov bl,BlockN.TargetBlock[TPITCH*0+2] ; 02b Fetch -2 * target pel.
|
|
mov cl,InterpPastAndFutureRef[ecx+edx] ; 22E
|
|
mov dl,BlockN.TargetBlock[TPITCH*2+2] ; 22F
|
|
mov bl,WeightedDiff[ebx+eax*2] ; 02c Weighted difference.
|
|
mov al,[esi+PITCH*2+0] ; 20a
|
|
mov dl,WeightedDiff[edx+ecx] ; 22G
|
|
add ebp,ebx ; 02d Accumulate weighted difference.
|
|
add ebp,edx ; 22H
|
|
mov bl,BlockN.TargetBlock[TPITCH*2+0] ; 20b
|
|
mov cl,[esi+PITCH*1+1] ; 11a
|
|
mov dl,BlockN.TargetBlock[TPITCH*1+1] ; 11b
|
|
mov bl,WeightedDiff[ebx+eax*2] ; 20c
|
|
mov al,[esi+PITCH*1+3] ; 13a
|
|
add ebp,ebx ; 20d
|
|
mov dl,WeightedDiff[edx+ecx*2] ; 11c
|
|
add ebp,edx ; 11d
|
|
mov bl,BlockN.TargetBlock[TPITCH*1+3] ; 13b
|
|
mov cl,[esi+PITCH*3+1] ; 31a
|
|
mov dl,BlockN.TargetBlock[TPITCH*3+1] ; 31b
|
|
mov bl,WeightedDiff[ebx+eax*2] ; 13c
|
|
mov al,[esi+PITCH*3+3] ; 33a
|
|
add ebp,ebx ; 13d
|
|
mov bl,BlockN.TargetBlock[TPITCH*3+3] ; 33b
|
|
mov dl,WeightedDiff[edx+ecx*2] ; 31c
|
|
add edi,4 ; Move to next 4 columns.
|
|
add ebp,edx ; 31d
|
|
mov bl,WeightedDiff[ebx+eax*2] ; 33c
|
|
add ebp,ebx ; 33d
|
|
mov edx,BlockN.FutureRefInterpOffset ; Prepare for next iteration.
|
|
add esi,4 ; Move to next 4 columns.
|
|
add esp,4 ; Move to next 4 columns.
|
|
mov al,[edi-80] ; 04A
|
|
mov cl,4
|
|
and ecx,esp ; Twice, 4 cols each time.
|
|
jne @b
|
|
|
|
mov al,[edi-80+FPITCH*4-8] ; 40A
|
|
add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
mov cl,8
|
|
add edi,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
and ecx,esp ; Twice, 4 rows each time.
|
|
jne @b
|
|
|
|
mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
|
|
mov eax,BlockN.BestSWDAccum
|
|
cmp ebp,eax
|
|
jg TakeEarlyOut
|
|
|
|
mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
|
|
mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
|
|
mov edi,BlockN.FutureRefBlockAddr ; Fetch next past ref address.
|
|
jmp PD JumpTable[eax*4]
|
|
|
|
|
|
BiDiPastHorz LABEL DWORD
|
|
mov edx,edi
|
|
mov edi,1
|
|
xor eax,eax
|
|
jmp BiDiSWDCalc_InterpPast
|
|
|
|
BiDiPastVert:
|
|
mov edx,edi
|
|
mov edi,PITCH
|
|
xor eax,eax
|
|
jmp BiDiSWDCalc_InterpPast
|
|
|
|
BiDiPastBoth:
|
|
mov edx,edi
|
|
mov edi,PITCH+1
|
|
xor eax,eax
|
|
|
|
BiDiSWDCalc_InterpPast:
|
|
|
|
; esp -- Pointer to block of target macroblock.
|
|
; ebp -- SWD accumulator. Must be initialized by caller.
|
|
; esi -- Pointer to block of reference in past frame.
|
|
; edi -- Distance from future pel to other future pel with which to interp.
|
|
; edx -- Pointer to block of reference in future frame + 80.
|
|
; al, bl, cl, dl -- Scratch.
|
|
|
|
mov al,[esi] ; 00A Fetch pel from previous ref.
|
|
xor ebx,ebx
|
|
mov bl,[esi+edi] ; 00B Fetch other past ref pel.
|
|
xor ecx,ecx
|
|
@@:
|
|
add al,bl ; 00C Interp'd past ref, times 2.
|
|
mov bl,[edx-80] ; 00D Fetch pel from future ref.
|
|
mov BlockN.FutureRefBlockAddr,edx
|
|
mov dl,[edx+FPITCH*2+2-80] ; 22D
|
|
mov al,Interp2PastAndFutureRef[eax+ebx*2] ; 00E (past+future) or 2*past.
|
|
mov bl,BlockN.TargetBlock[0] ; 00F Fetch target pel.
|
|
mov cl,[esi+PITCH*2+2] ; 22A
|
|
and edx,0000000FFH
|
|
mov bl,WeightedDiff[eax+ebx] ; 00G Weighted difference.
|
|
mov al,[esi+edi+PITCH*2+2] ; 22B
|
|
add cl,al ; 22C
|
|
mov al,[esi+PITCH*0+2] ; 02a Fetch pel from previous ref.
|
|
add ebp,ebx ; 00H Accumulate weighted diff.
|
|
mov bl,[esi+edi+PITCH*0+2] ; 02b Fetch other past ref pel.
|
|
mov cl,Interp2PastAndFutureRef[ecx+edx*2] ; 22E
|
|
mov dl,BlockN.TargetBlock[TPITCH*2+2] ; 22F
|
|
add al,bl ; 02c Interp'd past ref, times 2.
|
|
mov bl,BlockN.TargetBlock[TPITCH*0+2] ; 02d Fetch -2 * target pel.
|
|
mov dl,WeightedDiff[ecx+edx] ; 22G Weighted difference.
|
|
mov cl,[esi+PITCH*2+0] ; 20a
|
|
add ebp,edx ; 22H
|
|
mov dl,[esi+edi+PITCH*2+0] ; 20b
|
|
add cl,dl ; 20c
|
|
mov dl,BlockN.TargetBlock[TPITCH*2+0] ; 20d
|
|
mov bl,WeightedDiff[eax+ebx] ; 02e Weighted difference.
|
|
mov al,[esi+PITCH*1+1] ; 11a
|
|
add ebp,ebx ; 02f Accumulate weighted diff.
|
|
mov bl,[esi+edi+PITCH*1+1] ; 11b
|
|
add al,bl ; 11c
|
|
mov bl,BlockN.TargetBlock[TPITCH*1+1] ; 11d
|
|
mov dl,WeightedDiff[ecx+edx] ; 20e
|
|
mov cl,[esi+PITCH*1+3] ; 13a
|
|
add ebp,edx ; 20f
|
|
mov dl,[esi+edi+PITCH*1+3] ; 13b
|
|
add cl,dl ; 13c
|
|
mov dl,BlockN.TargetBlock[TPITCH*1+3] ; 13d
|
|
mov bl,WeightedDiff[eax+ebx] ; 11e
|
|
mov al,[esi+PITCH*3+1] ; 31a
|
|
add ebp,ebx ; 11f
|
|
mov bl,[esi+edi+PITCH*3+1] ; 31b
|
|
add al,bl ; 31c
|
|
mov bl,BlockN.TargetBlock[TPITCH*3+1] ; 31d
|
|
mov dl,WeightedDiff[ecx+edx] ; 13e
|
|
mov cl,[esi+PITCH*3+3] ; 33a
|
|
add ebp,edx ; 13f
|
|
mov dl,[esi+edi+PITCH*3+3] ; 33b
|
|
add cl,dl ; 33c
|
|
mov dl,BlockN.TargetBlock[TPITCH*3+3] ; 33d
|
|
mov bl,WeightedDiff[eax+ebx] ; 31e
|
|
add esi,4 ; Move to next 4 columns.
|
|
add ebp,ebx ; 31f
|
|
mov dl,WeightedDiff[ecx+edx] ; 33e
|
|
add ebp,edx ; 33f
|
|
mov edx,BlockN.FutureRefBlockAddr
|
|
add edx,4 ; Move to next 4 columns.
|
|
add esp,4 ; Move to next 4 columns.
|
|
mov al,[esi] ; 04A
|
|
mov cl,4
|
|
mov bl,[esi+edi] ; 04B
|
|
and ecx,esp ; Twice, 4 cols each time.
|
|
mov cl,8
|
|
jne @b
|
|
|
|
add edi,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
mov al,[esi+PITCH*4-8] ; 40A
|
|
mov bl,[esi+edi+PITCH*4-8] ; 40B
|
|
add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
and ecx,esp ; Twice, 4 rows each time.
|
|
jne @b
|
|
|
|
mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
|
|
mov eax,BlockN.BestSWDAccum
|
|
cmp ebp,eax
|
|
jg TakeEarlyOut
|
|
|
|
mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
|
|
mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
|
|
mov edi,BlockN.FutureRefBlockAddr ; Fetch next past ref address.
|
|
jmp PD JumpTable[eax*4]
|
|
|
|
|
|
BiDiSWDCalc_InterpBoth MACRO PastRefInterpOffset
|
|
|
|
; esp -- Pointer to block of target macroblock.
|
|
; ebp -- SWD accumulator. Must be initialized by caller.
|
|
; esi -- Pointer to block of reference in past frame.
|
|
; edi -- Pointer to block of reference in future frame + 80.
|
|
; al, bl, cl, dl -- Scratch.
|
|
|
|
@@:
|
|
|
|
mov bl,[edi-80] ; 00A Fetch pel from future ref.
|
|
mov BlockN.FutureRefBlockAddr,edx
|
|
mov al,[edi+edx-80] ; 00B Fetch other future ref pel.
|
|
mov dl,[edi+edx+FPITCH*2+2-80] ; 22B
|
|
mov cl,[edi+FPITCH*2+2-80] ; 22A
|
|
add esp,4 ; Move to next 4 columns.
|
|
mov bl,InterpFutureRef[eax+ebx] ; 00C Get interpolated future ref.
|
|
mov al,[esi] ; 00D Fetch pel from previous ref.
|
|
mov dl,InterpFutureRef[ecx+edx] ; 22C
|
|
mov cl,[esi+PITCH*2+2] ; 22D
|
|
lea ebx,[eax+ebx*2] ; 00E Interp'ed future plus one past.
|
|
mov al,[esi+PastRefInterpOffset] ; 00F Fetch other pel from past ref.
|
|
lea edx,[ecx+edx*2] ; 22E
|
|
mov cl,[esi+PITCH*2+2+PastRefInterpOffset] ; 22F
|
|
mov al,Interp2PastAndFutureRef[ebx+eax]; 00G (past+future) or 2*past.
|
|
xor ebx,ebx
|
|
mov bl,BlockN.TargetBlock[0-4] ; 00H Fetch target pel.
|
|
mov cl,Interp2PastAndFutureRef[edx+ecx] ; 22G
|
|
mov dl,BlockN.TargetBlock[TPITCH*2+2-4] ; 22H
|
|
add esi,4 ; Move to next 4 columns.
|
|
and edx,0000000FFH
|
|
mov bl,WeightedDiff[eax+ebx] ; 00I Weighted difference.
|
|
add ebp,ebx ; 00J Accum weighted difference.
|
|
mov al,[esi+PITCH*0+2-4] ; 02a Fetch pel from prev ref.
|
|
mov dl,WeightedDiff[ecx+edx] ; 22I
|
|
mov bl,[esi+PastRefInterpOffset+PITCH*0+2-4]; 02b Fetch other past ref pel.
|
|
add al,bl ; 02c Interp'd past ref, *2.
|
|
mov bl,BlockN.TargetBlock[TPITCH*0+2-4] ; 02d Fetch -2 * target pel.
|
|
add ebp,edx ; 22J
|
|
mov cl,[esi+PITCH*2+0-4] ; 20a
|
|
add edi,4 ; Move to next 4 columns.
|
|
mov dl,[esi+PastRefInterpOffset+PITCH*2+0-4]; 20b
|
|
mov bl,WeightedDiff[eax+ebx] ; 02e Weighted difference.
|
|
mov al,[esi+PITCH*1+1-4] ; 11a
|
|
add ebp,ebx ; 02f Accumulate weighted diff.
|
|
mov bl,[esi+PastRefInterpOffset+PITCH*1+1-4]; 11b
|
|
add cl,dl ; 20c
|
|
mov dl,BlockN.TargetBlock[TPITCH*2+0-4] ; 20d
|
|
add al,bl ; 11c
|
|
mov bl,BlockN.TargetBlock[TPITCH*1+1-4] ; 11d
|
|
mov dl,WeightedDiff[ecx+edx] ; 20e
|
|
mov cl,[esi+PITCH*1+3-4] ; 13a
|
|
add ebp,edx ; 20f
|
|
mov dl,[esi+PastRefInterpOffset+PITCH*1+3-4]; 13b
|
|
mov bl,WeightedDiff[eax+ebx] ; 11e
|
|
mov al,[esi+PITCH*3+1-4] ; 31a
|
|
add ebp,ebx ; 11f
|
|
mov bl,[esi+PastRefInterpOffset+PITCH*3+1-4]; 31b
|
|
add cl,dl ; 13c
|
|
mov dl,BlockN.TargetBlock[TPITCH*1+3-4] ; 13d
|
|
add al,bl ; 31c
|
|
mov bl,BlockN.TargetBlock[TPITCH*3+1-4] ; 31d
|
|
mov dl,WeightedDiff[ecx+edx] ; 13e
|
|
mov cl,[esi+PITCH*3+3-4] ; 33a
|
|
add ebp,edx ; 13f
|
|
mov dl,[esi+PastRefInterpOffset+PITCH*3+3-4]; 33b
|
|
add cl,dl ; 33c
|
|
mov dl,BlockN.TargetBlock[TPITCH*3+3-4] ; 33d
|
|
mov bl,WeightedDiff[eax+ebx] ; 31e
|
|
mov al,4
|
|
add ebp,ebx ; 31f
|
|
mov dl,WeightedDiff[ecx+edx] ; 33e
|
|
add ebp,edx ; 33f
|
|
mov edx,BlockN.FutureRefBlockAddr-4
|
|
and eax,esp ; Twice, 4 cols each time.
|
|
jne @b
|
|
|
|
add edi,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
|
|
mov cl,8
|
|
and ecx,esp ; Twice, 4 rows each time.
|
|
jne @b
|
|
|
|
mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
|
|
mov eax,BlockN.BestSWDAccum
|
|
cmp ebp,eax
|
|
jg TakeEarlyOut
|
|
|
|
mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
|
|
mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
|
|
mov edi,BlockN.FutureRefBlockAddr ; Fetch next past ref address.
|
|
jmp PD JumpTable[eax*4]
|
|
|
|
ENDM
|
|
|
|
BiDiPastHorzFutureHorz LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,1
|
|
jmp BiDiSWDCalc_InterpBoth_PastByHorz
|
|
|
|
BiDiPastHorzFutureVert LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,FPITCH
|
|
jmp BiDiSWDCalc_InterpBoth_PastByHorz
|
|
|
|
BiDiPastHorzFutureBoth LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,FPITCH+1
|
|
|
|
BiDiSWDCalc_InterpBoth_PastByHorz:
|
|
|
|
xor eax,eax
|
|
xor ebx,ebx
|
|
BiDiSWDCalc_InterpBoth 1
|
|
|
|
BiDiPastVertFutureHorz LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,1
|
|
jmp BiDiSWDCalc_InterpBoth_PastByVert
|
|
|
|
BiDiPastVertFutureVert LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,FPITCH
|
|
jmp BiDiSWDCalc_InterpBoth_PastByVert
|
|
|
|
BiDiPastVertFutureBoth LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,FPITCH+1
|
|
|
|
BiDiSWDCalc_InterpBoth_PastByVert:
|
|
|
|
xor eax,eax
|
|
xor ebx,ebx
|
|
BiDiSWDCalc_InterpBoth PITCH
|
|
|
|
BiDiPastBothFutureHorz LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,1
|
|
jmp BiDiSWDCalc_InterpBoth_PastByBoth
|
|
|
|
BiDiPastBothFutureVert LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,FPITCH
|
|
jmp BiDiSWDCalc_InterpBoth_PastByBoth
|
|
|
|
BiDiPastBothFutureBoth LABEL DWORD
|
|
|
|
xor ecx,ecx
|
|
mov edx,FPITCH+1
|
|
|
|
BiDiSWDCalc_InterpBoth_PastByBoth:
|
|
|
|
xor eax,eax
|
|
xor ebx,ebx
|
|
BiDiSWDCalc_InterpBoth PITCH+1
|
|
|
|
ALIGN 4
|
|
JumpTable:
|
|
|
|
DD BiDiNoInterp
|
|
DD BiDiFutureHorz
|
|
DD BiDiFutureVert
|
|
DD BiDiFutureBoth
|
|
DD BiDiPastHorz
|
|
DD BiDiPastHorzFutureHorz
|
|
DD BiDiPastHorzFutureVert
|
|
DD BiDiPastHorzFutureBoth
|
|
DD BiDiPastVert
|
|
DD BiDiPastVertFutureHorz
|
|
DD BiDiPastVertFutureVert
|
|
DD BiDiPastVertFutureBoth
|
|
DD BiDiPastBoth
|
|
DD BiDiPastBothFutureHorz
|
|
DD BiDiPastBothFutureVert
|
|
DD BiDiPastBothFutureBoth
|
|
DD ZeroVectorSWDDone
|
|
DD NonZeroVectorSWDDone
|
|
|
|
Done:
|
|
|
|
mov ecx,InterSWDTotal
|
|
mov edx,InterSWDBlocks
|
|
mov esp,StashESP
|
|
mov edi,[esp+InterSWDTotal_arg]
|
|
mov [edi],ecx
|
|
mov edi,[esp+InterSWDBlocks_arg]
|
|
mov [edi],edx
|
|
pop ebx
|
|
pop ebp
|
|
pop edi
|
|
pop esi
|
|
rturn
|
|
|
|
|
|
BFRAMEMOTIONESTIMATION endp
|
|
|
|
END
|