You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
6016 lines
222 KiB
6016 lines
222 KiB
;////////////////////////////////////////////////////////////////////////////
|
|
;//
|
|
;// INTEL CORPORATION PROPRIETARY INFORMATION
|
|
;//
|
|
;// This software is supplied under the terms of a license
|
|
;// agreement or nondisclosure agreement with Intel Corporation
|
|
;// and may not be copied or disclosed except in accordance
|
|
;// with the terms of that agreement.
|
|
;//
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;//
|
|
;// $Header: S:\h26x\src\enc\exmme.asv 1.37 13 Dec 1996 17:19:38 MBODART $
|
|
;//
|
|
;// $Log: S:\h26x\src\enc\exmme.asv $
|
|
;//
|
|
;// Rev 1.37 13 Dec 1996 17:19:38 MBODART
|
|
;// Tuned the ME parameters for H.261.
|
|
;//
|
|
;// Rev 1.36 06 Nov 1996 16:18:24 BNICKERS
|
|
;// Improve performance.
|
|
;//
|
|
;// Rev 1.35 30 Oct 1996 17:30:36 BNICKERS
|
|
;// Fix UMV table for right edge macroblocks.
|
|
;//
|
|
;// Rev 1.34 30 Oct 1996 14:49:20 KLILLEVO
|
|
;// zero motion vectors for intra blocks in PB-frame mode.
|
|
;// This is necessary in the Extended Motion Vector mode
|
|
;//
|
|
;// Rev 1.33 18 Oct 1996 16:57:16 BNICKERS
|
|
;// Fixes for EMV
|
|
;//
|
|
;// Rev 1.32 15 Oct 1996 17:53:04 BNICKERS
|
|
;//
|
|
;// Fix major bug w.r.t. EMV ME.
|
|
;//
|
|
;// Rev 1.31 14 Oct 1996 13:10:14 BNICKERS
|
|
;//
|
|
;// Correct several problems wrt H261 ME.
|
|
;//
|
|
;// Rev 1.30 11 Oct 1996 16:53:12 KLILLEVO
|
|
;//
|
|
;// Fix threshold
|
|
;//
|
|
;// Rev 1.29 11 Oct 1996 16:52:18 KLILLEVO
|
|
;// Another EMV fix.
|
|
;//
|
|
;// Rev 1.28 11 Oct 1996 15:43:16 KLILLEVO
|
|
;// Really fix the handling of the top row of MBs for EMV ME.
|
|
;//
|
|
;// Rev 1.27 11 Oct 1996 15:24:38 BNICKERS
|
|
;// Special handling of top row of MBs for EMV ME.
|
|
;//
|
|
;// Rev 1.26 11 Oct 1996 14:47:42 KLILLEVO
|
|
;// Kill full pel MV for Intra blocks so that EMV of adjacent blocks will work.
|
|
;//
|
|
;// Rev 1.25 10 Oct 1996 16:42:56 BNICKERS
|
|
;// Initial debugging of Extended Motion Vectors.
|
|
;//
|
|
;// Rev 1.24 04 Oct 1996 08:48:02 BNICKERS
|
|
;// Add EMV.
|
|
;//
|
|
;// Rev 1.23 24 Sep 1996 10:42:24 BNICKERS
|
|
;// For H261, zero out motion vectors when classifying MB as intra.
|
|
;//
|
|
;// Rev 1.22 12 Sep 1996 10:56:24 BNICKERS
|
|
;// Add arguments for thresholds and differentials.
|
|
;//
|
|
;// Rev 1.21 22 Jul 1996 15:23:24 BNICKERS
|
|
;// Reduce code size. Implement H261 spatial filter.
|
|
;//
|
|
;// Rev 1.20 18 Jul 1996 16:54:26 KLILLEVO
|
|
;// changed emptythreshold to 40 instead of 128 to remove some blockiness
|
|
;// from the still frame mode on MMX
|
|
;//
|
|
;// Rev 1.19 26 Jun 1996 12:49:02 KLILLEVO
|
|
;// Fix minor booboo left in by Brian.
|
|
;//
|
|
;// Rev 1.18 26 Jun 1996 12:21:50 BNICKERS
|
|
;// Make heuristic ME work without unrestricted motion vectors.
|
|
;//
|
|
;// Rev 1.17 25 Jun 1996 14:24:58 BNICKERS
|
|
;// Implement heuristic motion estimation for MMX, AP mode.
|
|
;//
|
|
;// Rev 1.16 15 May 1996 16:57:14 BNICKERS
|
|
;// Fix SWD tabulation (again)! @#$%!%
|
|
;//
|
|
;// Rev 1.15 15 May 1996 16:53:24 BNICKERS
|
|
;//
|
|
;// Fix SWD tabulation.
|
|
;//
|
|
;// Rev 1.14 15 May 1996 11:33:28 BNICKERS
|
|
;// Bug fix for calc of total SWD.
|
|
;//
|
|
;// Rev 1.13 14 May 1996 12:18:58 BNICKERS
|
|
;// Initial debugging of MMx B-Frame ME.
|
|
;//
|
|
;// Rev 1.12 03 May 1996 14:03:50 BNICKERS
|
|
;//
|
|
;// Minor bug fixes and integration refinements.
|
|
;//
|
|
;// Rev 1.11 02 May 1996 12:00:32 BNICKERS
|
|
;// Initial integration of B Frame ME, MMX version.
|
|
;//
|
|
;// Rev 1.10 16 Apr 1996 16:40:14 BNICKERS
|
|
;// Fix some important but simple bugs. Start adding table inits for B frm ME.
|
|
;//
|
|
;// Rev 1.9 10 Apr 1996 13:13:44 BNICKERS
|
|
;// Recoding of Motion Estimation, Advanced Prediction.
|
|
;//
|
|
;// Rev 1.8 05 Apr 1996 12:28:10 BNICKERS
|
|
;// Improvements to baseline half pel ME.
|
|
;//
|
|
;// Rev 1.7 26 Mar 1996 12:00:22 BNICKERS
|
|
;// Did some tuning for MMx encode.
|
|
;//
|
|
;// Rev 1.6 20 Mar 1996 17:01:44 KLILLEVO
|
|
;// fixed bug in new quant code
|
|
;//
|
|
;// Rev 1.5 20 Mar 1996 15:26:40 KLILLEVO
|
|
;// changed quantization to match IA quantization
|
|
;//
|
|
;// Rev 1.3 15 Mar 1996 15:51:16 BECHOLS
|
|
;// Completed monolithic - Brian
|
|
;//
|
|
;// Rev 1.0 16 Feb 1996 17:12:12 BNICKERS
|
|
;// Initial revision.
|
|
;//
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;
|
|
; MMxMotionEstimation -- This function performs motion estimation for the
|
|
; macroblocks identified in the input list. This is
|
|
; the MMx version. Conditional assembly selects either
|
|
; the H263 or H261 version.
|
|
;
|
|
; Arguments: See ex5me.asm.
|
|
;
|
|
; Other assumptions: See ex5me.asm. Most of the read-only tables needed in
|
|
; ex5me.asm are not needed here.
|
|
;
|
|
|
|
OPTION PROLOGUE:None
|
|
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
|
|
OPTION M510
|
|
OPTION CASEMAP:NONE
|
|
|
|
; Motion-estimation tuning constants.  Three of them have the same value in
; both codec builds; the other four depend on whether H261 is defined at
; assembly time.
BLOCKMOTIONTHRESHOLD    = 1152
BLOCKMVDIFFERENTIAL     = 768
EMPTYTHRESHOLD          = 40
IFDEF H261
; Build with H261 defined.
ZEROVECTORTHRESHOLD     = 600
NONZEROMVDIFFERENTIAL   = 256
INTERCODINGTHRESHOLD    = 300
INTRACODINGDIFFERENTIAL = 200
ELSE
; Build without H261 defined (the H263 version, per the file header).
ZEROVECTORTHRESHOLD     = 450
NONZEROMVDIFFERENTIAL   = 375
INTERCODINGTHRESHOLD    = 1152
INTRACODINGDIFFERENTIAL = 1000
ENDIF
|
|
|
|
include iammx.inc
|
|
include e3inst.inc
|
|
include e3mbad.inc
|
|
|
|
.xlist
|
|
include memmodel.inc
|
|
.list
|
|
|
|
include exEDTQ.inc
|
|
|
|
MMXMEDATA SEGMENT PAGE
|
|
ALIGN 16
|
|
|
|
; Storage for Target and Reference frames can interleave into 8K of the 16K
|
|
; cache. Pitch must be 384.
|
|
;
|
|
; C# -- Stands for row number "#" of target macroblock in *C*urrent P frame.
|
|
; B# -- Stands for row number "#" of target macroblock in current *B* frame.
|
|
; R# -- Stands for row number "#" of 0MV *R*ef macroblock in past frame.
|
|
; v -- Stands for a row below 0MV, reference macroblock.
|
|
; These same cache lines would hit reference lines >8 above the 0MV.
|
|
; ^ -- Stands for a row above 0MV, reference macroblock.
|
|
; These same cache lines would hit reference lines >8 below the 0MV.
|
|
; +-+-+
|
|
; | | -- A cache line (32 bytes). Position of letters,<, and > indicate
|
|
; +-+-+ which 16 bytes may be used in the cache line.
|
|
;
|
|
; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
; |C0 | | v| |Cb | | ^| |B6 | | R6| |
|
|
; |C1 | | v| |Cc | | ^| |B7 | | R7| |
|
|
; |C2 | | v| |Cd | | ^| |B8 | | R8| |
|
|
; |C3 | | v| |Ce | | ^| |B9 | | R9| |
|
|
; |C4 | | v| |Cf | | ^| |Ba | | Ra| |
|
|
; |C5 | | v| |B0 | | R0| |Bb | | Rb| |
|
|
; |C6 | | v| |B1 | | R1| |Bc | | Rc| |
|
|
; |C7 | | v| |B2 | | R2| |Bd | | Rd| |
|
|
; |C8 | | ^| |B3 | | R3| |Be | | Re| |
|
|
; |C9 | | ^| |B4 | | R4| |Bf | | Rf| |
|
|
; |Ca | | ^| |B5 | | R5| +-+-+-+-+-+-+-+-+
|
|
; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
;
|
|
|
|
; The static storage space used for read-only tables, and the stack usage
|
|
; are coordinated such that they mesh in the data cache, and use only one
|
|
; 4K way of the 4-way, 16K cache.
|
|
;
|
|
; The first 32 bytes of the static storage space are unallocated, because
|
|
; the top of stack ranges in this area. As local procedure calls are made
|
|
; within this function, return addresses get pushed into these 32 bytes.
|
|
; (32 bytes; 0: 31)
|
|
|
|
DB 32 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
|
|
|
|
;
|
|
; The next 608 bytes of the static storage space are unallocated, because
|
|
; the local stack frame is made to hit cache at these addresses. More of
|
|
; the local stack frame is allocated after a gap of 64 bytes.
|
|
; (608 bytes; 32: 639)
|
|
|
|
LocalStorage LABEL DWORD
|
|
|
|
DB 608 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
|
|
|
|
; Motion Estimation State Engine adjustments to reference block address to get
|
|
; to next candidate reference block.
|
|
; (64 bytes; 640: 703)
|
|
|
|
; Table of 16 DWORD entries, one per DD below, interleaved with the one-byte
; adjustment codes (EQUs) that belong to each entry.
;   * The high nibble of a code (0n0H) is the number of its table entry.
;   * Each entry holds a byte displacement minus that 0n0H base and minus a
;     small bias, and each code adds the bias back, so that
;         entry + code = vertical*PITCH + horizontal   (in bytes).
;     Example: entry 6 is -1*PITCH-2-060H and VM1HP1 is 060H+1+2; their sum
;     is -1*PITCH+1.
;   * Code names: V = vertical, H = horizontal, M = minus, P = plus, the
;     digit is the distance in pels and G stands for 16 (VMG = -16*PITCH,
;     HMG = -16).  NOADJ yields a displacement of 0.
; NOTE(review): how the estimator combines entry and code is not visible in
; this part of the file; the above is only the arithmetic the definitions
; imply -- confirm against the consumer.
FullPelMotionVectorAdjustment LABEL DWORD
|
|
|
|
DD -16*PITCH-8 ; entry 0
|
|
VMG EQU 000H+0+8
|
|
VMGHM8 EQU 000H-8+8
|
|
|
|
DD -8*PITCH-8-010H ; entry 1
|
|
VM8HM8 EQU 010H
|
|
|
|
DD -8*PITCH-020H ; entry 2
|
|
VM8 EQU 020H
|
|
VM8HP8 EQU 020H+8
|
|
|
|
DD -4*PITCH-8-030H ; entry 3
|
|
VM4HM8 EQU 030H-8+8
|
|
VM4HM4 EQU 030H-4+8
|
|
VM4 EQU 030H+0+8
|
|
VM4HP4 EQU 030H+4+8
|
|
|
|
DD -4*PITCH+8-040H ; entry 4
|
|
VM4HP8 EQU 040H+8-8
|
|
VM4HPG EQU 040H+16-8
|
|
|
|
DD -2*PITCH-4-050H ; entry 5
|
|
VM2HM4 EQU 050H-4+4
|
|
VM2HM2 EQU 050H-2+4
|
|
VM2HM1 EQU 050H-1+4
|
|
VM2 EQU 050H+0+4
|
|
VM2HP1 EQU 050H+1+4
|
|
VM2HP2 EQU 050H+2+4
|
|
VM2HP4 EQU 050H+4+4
|
|
VM2HP8 EQU 050H+8+4
|
|
|
|
DD -1*PITCH-2-060H ; entry 6
|
|
VM1HM2 EQU 060H-2+2
|
|
VM1HM1 EQU 060H-1+2
|
|
VM1 EQU 060H+0+2
|
|
VM1HP1 EQU 060H+1+2
|
|
VM1HP2 EQU 060H+2+2
|
|
VM1HP4 EQU 060H+4+2
|
|
|
|
DD -16-070H ; entry 7
|
|
HMG EQU 070H-16+16
|
|
HM8 EQU 070H-8+16
|
|
HM4 EQU 070H-4+16
|
|
HM3 EQU 070H-3+16
|
|
HM2 EQU 070H-2+16
|
|
HM1 EQU 070H-1+16
|
|
|
|
DD -080H ; entry 8
|
|
NOADJ EQU 080H
|
|
HP1 EQU 080H+1
|
|
HP2 EQU 080H+2
|
|
HP4 EQU 080H+4
|
|
HP8 EQU 080H+8
|
|
|
|
DD 1*PITCH-2-090H ; entry 9
|
|
VP1HM2 EQU 090H-2+2
|
|
VP1HM1 EQU 090H-1+2
|
|
VP1 EQU 090H+0+2
|
|
VP1HP1 EQU 090H+1+2
|
|
VP1HP2 EQU 090H+2+2
|
|
VP1HP4 EQU 090H+4+2
|
|
|
|
DD 2*PITCH-4-0A0H ; entry 10
|
|
VP2HM4 EQU 0A0H-4+4
|
|
VP2HM2 EQU 0A0H-2+4
|
|
VP2HM1 EQU 0A0H-1+4
|
|
VP2 EQU 0A0H+0+4
|
|
VP2HP1 EQU 0A0H+1+4
|
|
VP2HP2 EQU 0A0H+2+4
|
|
VP2HP4 EQU 0A0H+4+4
|
|
VP2HP8 EQU 0A0H+8+4
|
|
|
|
DD 4*PITCH-8-0B0H ; entry 11
|
|
VP4HM8 EQU 0B0H-8+8
|
|
VP4HM4 EQU 0B0H-4+8
|
|
VP4HM2 EQU 0B0H-2+8
|
|
VP4 EQU 0B0H+0+8
|
|
VP4HP2 EQU 0B0H+2+8
|
|
VP4HP4 EQU 0B0H+4+8
|
|
|
|
DD 4*PITCH+8-0C0H ; entry 12
|
|
VP4HP8 EQU 0C0H+8-8
|
|
VP4HPG EQU 0C0H+16-8
|
|
|
|
DD 8*PITCH-8-0D0H ; entry 13
|
|
VP8HM8 EQU 0D0H-8+8
|
|
VP8HM4 EQU 0D0H-4+8
|
|
|
|
DD 8*PITCH-0E0H ; entry 14
|
|
VP8 EQU 0E0H+0
|
|
VP8HP4 EQU 0E0H+4
|
|
VP8HP8 EQU 0E0H+8
|
|
|
|
DD 16*PITCH-0F0H ; entry 15
|
|
VPG EQU 0F0H+0
|
|
VPGHP8 EQU 0F0H+8
|
|
|
|
; Additional space reserved for stack variables. If more space is needed,
|
|
; it should go here.
|
|
; (160 bytes; 704: 863)
|
|
|
|
DB 160 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
|
|
|
|
; QWORD Constants used by motion estimation, frame differencing, and FDCT.
|
|
; (144 bytes; 864:1007)
|
|
|
|
; Eighteen 64-bit constants, each stored as two DWORDs with the low DWORD
; first.  For the long names, the 16 hex digits after the leading "C" spell
; the 64-bit value: the second DD followed by the first.
; C1..C7 each repeat one 16-bit value in all four words.
; NOTE(review): C1..C7 look like cos(k*pi/16) scaled by 2^15 (for example
; C4 = 5A82H, about 0.7071 * 32768), presumably for the FDCT -- confirm
; against the FDCT code.
C0101010101010101 DD 001010101H, 001010101H
|
|
CFFFF0000FFFF0000 DD 0FFFF0000H, 0FFFF0000H
|
|
C0200010101010101 DD 001010101H, 002000101H
|
|
C0001000200020001 DD 000020001H, 000010002H
|
|
CFFFF00000000FFFF DD 00000FFFFH, 0FFFF0000H
|
|
C0000FFFFFFFF0000 DD 0FFFF0000H, 00000FFFFH
|
|
CFF000000000000FF DD 0000000FFH, 0FF000000H
|
|
C0101010101010002 DD 001010002H, 001010101H
|
|
C0100010001000100 DD 001000100H, 001000100H
|
|
C0001000100010001 DD 000010001H, 000010001H
|
|
C7F7F7F7F7F7F7F7F DD 07F7F7F7FH, 07F7F7F7FH
|
|
C1 DD 07D8A7D8AH, 07D8A7D8AH
|
|
C2 DD 076417641H, 076417641H
|
|
C3 DD 06A6D6A6DH, 06A6D6A6DH
|
|
C4 DD 05A825A82H, 05A825A82H
|
|
C5 DD 0471D471DH, 0471D471DH
|
|
C6 DD 030FC30FCH, 030FC30FCH
|
|
C7 DD 018F818F8H, 018F818F8H
|
|
|
|
; Distances to Block Action Descriptors for blocks that provide remote vectors
|
|
; for OBMC. Which element accessed depends on edge condition. Top edge is
|
|
; stack based variable, since different instances may have different distances
|
|
; to BAD of block above. Bottom edge is always a constant, regardless of
|
|
; edge condition. This is used in OBMC frame differencing.
|
|
; (16 bytes; 1008:1023)
|
|
|
|
; Each table is two DWORDs: 0, followed by a signed byte distance of
; magnitude SIZEOF T_MacroBlockActionDescr - SIZEOF T_Blk (negative for
; BlockToLeft, positive for BlockToRight).
; NOTE(review): which edge condition selects element 0 and which element 1
; is decided by code not visible here.
BlockToLeft DD 0, -SIZEOF T_MacroBlockActionDescr+SIZEOF T_Blk
|
|
BlockToRight DD 0, SIZEOF T_MacroBlockActionDescr-SIZEOF T_Blk
|
|
|
|
; Table to map linearized motion vector to vertical part, used by motion
|
|
; estimation. (Shift linearized motion vector right by 8 bits, and then
|
|
; use result as index into this array to get vertical MV.)
|
|
; (96 bytes; 1024:1119)
|
|
|
|
; Build-time guard: the vertical-MV table that follows is laid out for a
; PITCH of 384 only.  .ERRNZ forces an assembly error, with the message
; below, whenever PITCH-384 is non-zero.
.ERRNZ PITCH-384, <The magic of this table assumes a pitch of 384.>
|
|
; Vertical-MV lookup, 96 bytes, addressed relative to UnlinearizedVertMV:
; 49 entries (-64 .. 0) lie before the label and 47 entries (0 .. 62) start
; at it.  Apart from the two zeros and the 2 next to the label, the values
; come in triples "n, n, n+2" with n advancing by 4.
; CNT is used as a scratch assembly-time counter; the Pel_Rnd generator that
; follows this table sets it back to 0 before using it.
CNT = -64
REPEAT 16                       ; -64,-64,-62, -60,-60,-58, ... , -4,-4,-2
DB CNT, CNT, CNT+2
CNT = CNT + 4
ENDM
DB 0
UnlinearizedVertMV DB 0
DB 2
CNT = 4
REPEAT 15                       ; 4,4,6, 8,8,10, ... , 60,60,62
DB CNT, CNT, CNT+2
CNT = CNT + 4
ENDM
|
|
; Table to provide index value in low byte, and rounding term of 1 in all bytes.
|
|
; Used in frame differencing, when half pel horizontal interpolation is needed.
|
|
; (1024 bytes; 1120:2143)
|
|
|
|
; Generates 128 eight-byte entries (1024 bytes).  Entry i (i = 0..127) is the
; pair of DWORDs (i + 001010101H, 001010101H): every byte is 01H except the
; lowest byte, which is i + 1.
; CNT is an assembly-time counter; it is left at 128 after the loop.
Pel_Rnd LABEL DWORD
|
|
CNT = 0
|
|
REPEAT 128
|
|
DD CNT+001010101H, 001010101H
|
|
CNT = CNT + 1
|
|
ENDM
|
|
|
|
; Motion Estimation State Engine Rules.
|
|
; (896 bytes;2144:3039)
|
|
|
|
; StateEngineFirstRule marks the first byte of the state data.
; StateEngine is that address biased by -18 bytes (-20+2).  The 4-byte rules
; begin with state 10, 22 bytes after StateEngineFirstRule (two 10-byte
; start-state tables plus 2 skipped bytes), so StateEngine + 4*n is the
; address of rule n for n >= 10.
StateEngineFirstRule LABEL BYTE ; Rules that govern state engine of estimator.
|
|
StateEngine EQU StateEngineFirstRule-20+2
|
|
|
|
; Starting States:
|
|
|
|
; Build-time guard: the state tables that follow are laid out for a PITCH of
; 384 only.  .ERRNZ forces an assembly error, with the message below,
; whenever PITCH-384 is non-zero.
.ERRNZ PITCH-384, <The magic of this table assumes a pitch of 384.>
|
|
; Two 10-byte tables indexed by the macroblock's edge condition (0..9, with
; 0 unused), followed by 2 skipped bytes; 22 bytes in all.
; The second table gives, for each edge condition, the number of the first
; rule of the matching state group further down (34 upper-left corner,
; 66 upper edge, 42 upper-right corner, 98 left edge, 16 interior,
; 114 right edge, 50 lower-left corner, 82 lower edge, 58 lower-right
; corner).
; NOTE(review): the meaning of the 3 / 0 values in the first table is not
; evident from this part of the file -- confirm against the code that reads
; them.
DB ? ; 0: not used.
|
|
DB 3 ; 1: Upper left corner.
|
|
DB 3 ; 2: Upper edge.
|
|
DB 3 ; 3: Upper right corner.
|
|
DB 3 ; 4: Left edge.
|
|
DB 3 ; 5: Interior MB, not doing block search.
|
|
DB 0 ; 6: Right edge.
|
|
DB 0 ; 7: Lower left corner.
|
|
DB 0 ; 8: Lower edge.
|
|
DB 0 ; 9: Lower right corner.
|
|
|
|
DB ? ; 0: not used.
|
|
DB 34 ; 1: Upper left corner.
|
|
DB 66 ; 2: Upper edge.
|
|
DB 42 ; 3: Upper right corner.
|
|
DB 98 ; 4: Left edge.
|
|
DB 16 ; 5: Interior MB, not doing block search.
|
|
DB 114 ; 6: Right edge.
|
|
DB 50 ; 7: Lower left corner.
|
|
DB 82 ; 8: Lower edge.
|
|
DB 58 ; 9: Lower right corner.
|
|
|
|
DB ?,? ; Skip 2 bytes.
|
|
|
|
LASTINITIALMESTATE EQU 9 ; highest start-state index in the tables above
|
|
|
|
; Interior Telescoping States:
|
|
|
|
; Try +/- 8,4,2,1, vertically first, then horizontally.
|
|
|
|
; Rule layout (4 bytes per state), as the per-rule comments show:
;   byte 0  adjustment code used when the candidate just tried was worse
;   byte 1  adjustment code used when it was better
;   byte 2  next state when it was worse
;   byte 3  next state when it was better
; A next state of 0FFH ends the search ("Done").
; States 16..33 step by 8, 4 and 2 pels and then fall into states 10..15
; for the final 1-pel step.
FIRSTBLOCKMESTATE EQU 10
|
|
|
|
DB VM2, VM2, 12, 11 ; 10: V+1 worse/better than central. Try V-1.
|
|
DB VP2HP1, HP1, 13, 13 ; 11: Accept V+1/V-1 as best. Try H+1.
|
|
DB VP1HP1, HP1, 13, 13 ; 12: Accept central/V-1 as best. Try H+1.
|
|
DB HM2, HM2, 15, 14 ; 13: H+1 worse/better than central. Try H-1.
|
|
DB HP2, NOADJ, 0FFH, 0FFH ; 14: Accept H+1/H-1 as best. Done.
|
|
DB HP1, NOADJ, 0FFH, 0FFH ; 15: Accept central/H-1 as best. Done.
|
|
|
|
DB VMG, VMG, 18, 17 ; 16: V+8 worse/better than central. Try V-8.
|
|
DB VPGHP8, HP8, 19, 19 ; 17: Accept V+8/V-8 as best. Try H+8.
|
|
DB VP8HP8, HP8, 19, 19 ; 18: Accept central/V-8 as best. Try H+8.
|
|
DB HMG, HMG, 21, 20 ; 19: H+8 worse/better than central. Try H-8.
|
|
DB VP4HPG, VP4, 22, 22 ; 20: Accept H+8/H-8 as best. Try V+4.
|
|
DB VP4HP8, VP4, 22, 22 ; 21: Accept central/H-8 as best. Try V+4.
|
|
|
|
DB VM8, VM8, 24, 23 ; 22: V+4 worse/better than central. Try V-4.
|
|
DB VP8HP4, HP4, 25, 25 ; 23: Accept V+4/V-4 as best. Try H+4.
|
|
DB VP4HP4, HP4, 25, 25 ; 24: Accept central/V-4 as best. Try H+4.
|
|
DB HM8, HM8, 27, 26 ; 25: H+4 worse/better than central. Try H-4.
|
|
DB VP2HP8, VP2, 28, 28 ; 26: Accept H+4/H-4 as best. Try V+2.
|
|
DB VP2HP4, VP2, 28, 28 ; 27: Accept central/H-4 as best. Try V+2.
|
|
|
|
DB VM4, VM4, 30, 29 ; 28: V+2 worse/better than central. Try V-2.
|
|
DB VP4HP2, HP2, 31, 31 ; 29: Accept V+2/V-2 as best. Try H+2.
|
|
DB VP2HP2, HP2, 31, 31 ; 30: Accept central/V-2 as best. Try H+2.
|
|
DB HM4, HM4, 33, 32 ; 31: H+2 worse/better than central. Try H-2.
|
|
DB VP1HP4, VP1, 10, 10 ; 32: Accept H+2/H-2 as best. Try V+1.
|
|
DB VP1HP2, VP1, 10, 10 ; 33: Accept central/H-2 as best. Try V+1.
|
|
|
|
; Boundary States:
|
|
|
|
; Upper left corner:
|
|
|
|
; Corner groups (states 34..65).  Each group makes one pass of 8, 4, 2 and
; 1 pel probes, vertical then horizontal, always away from the corner.
; Byte order per rule: adjust-if-worse, adjust-if-better, next-if-worse,
; next-if-better.  When a probe is worse the search stays in the corner
; group; when it is better the best position has left one of the two frame
; edges and the search continues in the group for the edge it is still on
; (a vertical move leads to the left/right edge rules 101..129, a horizontal
; move to the upper/lower edge rules 70..94).
; The last rule of each group ends with 0F5H/0F6H/0F9H/0FAH when the corner
; is kept and 0F7H/0FBH (the upper/lower edge codes) when the final
; horizontal probe wins.
DB VM8HP8, HP8, 35, 101 ; 34: Accept corner/V+8. Try H+8.
|
|
DB VP4HM8, VP4, 36, 70 ; 35: Accept corner/H+8. Try V+4.
|
|
DB VM4HP4, HP4, 37, 105 ; 36: Accept corner/V+4. Try H+4.
|
|
DB VP2HM4, VP2, 38, 74 ; 37: Accept corner/H+4. Try V+2.
|
|
DB VM2HP2, HP2, 39, 109 ; 38: Accept corner/V+2. Try H+2.
|
|
DB VP1HM2, VP1, 40, 78 ; 39: Accept corner/H+2. Try V+1.
|
|
DB VM1HP1, HP1, 41, 113 ; 40: Accept corner/V+1. Try H+1.
|
|
DB HM1, NOADJ, 0F5H, 0F7H ; 41: Accept corner/H+1. Done.
|
|
|
|
; Upper right corner:
|
|
|
|
DB VM8HM8, HM8, 43, 117 ; 42: Accept corner/V+8. Try H-8.
|
|
DB VP4HP8, VP4, 44, 70 ; 43: Accept corner/H-8. Try V+4.
|
|
DB VM4HM4, HM4, 45, 121 ; 44: Accept corner/V+4. Try H-4.
|
|
DB VP2HP4, VP2, 46, 74 ; 45: Accept corner/H-4. Try V+2.
|
|
DB VM2HM2, HM2, 47, 125 ; 46: Accept corner/V+2. Try H-2.
|
|
DB VP1HP2, VP1, 48, 78 ; 47: Accept corner/H-2. Try V+1.
|
|
DB VM1HM1, HM1, 49, 129 ; 48: Accept corner/V+1. Try H-1.
|
|
DB HP1, NOADJ, 0F6H, 0F7H ; 49: Accept corner/H-1. Done
|
|
|
|
; Lower left corner:
|
|
|
|
DB VP8HP8, HP8, 51, 101 ; 50: Accept corner/V-8. Try H+8.
|
|
DB VM4HM8, VM4, 52, 86 ; 51: Accept corner/H+8. Try V-4.
|
|
DB VP4HP4, HP4, 53, 105 ; 52: Accept corner/V-4. Try H+4.
|
|
DB VM2HM4, VM2, 54, 90 ; 53: Accept corner/H+4. Try V-2.
|
|
DB VP2HP2, HP2, 55, 109 ; 54: Accept corner/V-2. Try H+2.
|
|
DB VM1HM2, VM1, 56, 94 ; 55: Accept corner/H+2. Try V-1.
|
|
DB VP1HP1, HP1, 57, 113 ; 56: Accept corner/V-1. Try H+1.
|
|
DB HM1, NOADJ, 0F9H, 0FBH ; 57: Accept corner/H+1. Done.
|
|
|
|
; Lower right corner:
|
|
|
|
DB VP8HM8, HM8, 59, 117 ; 58: Accept corner/V-8. Try H-8.
|
|
DB VM4HP8, VM4, 60, 86 ; 59: Accept corner/H-8. Try V-4.
|
|
DB VP4HM4, HM4, 61, 121 ; 60: Accept corner/V-4. Try H-4.
|
|
DB VM2HP4, VM2, 62, 90 ; 61: Accept corner/H-4. Try V-2.
|
|
DB VP2HM2, HM2, 63, 125 ; 62: Accept corner/V-2. Try H-2.
|
|
DB VM1HP2, VM1, 64, 94 ; 63: Accept corner/H-2. Try V-1.
|
|
DB VP1HM1, HM1, 65, 129 ; 64: Accept corner/V-1. Try H-1.
|
|
DB HP1, NOADJ, 0FAH, 0FBH ; 65: Accept corner/H-1. Done.
|
|
|
|
; Upper edge:
|
|
|
|
; Upper edge (states 66..81) and lower edge (states 82..97).
; Byte order per rule: adjust-if-worse, adjust-if-better, next-if-worse,
; next-if-better.  Vertically only the direction away from the edge is
; probed; if that probe is better the search drops into the interior rules
; (19, 25, 31, 13), otherwise it stays in the edge group and probes both
; horizontal directions.  The groups end with 0F7H (upper) or 0FBH (lower)
; instead of the interior's 0FFH.
; NOTE(review): the low nibble of these end codes presumably tells later
; code which frame edge the result still touches -- confirm against the
; consumer.
DB VM8HP8, HP8, 67, 19 ; 66: Accept central/V+8 as best. Try H+8.
|
|
DB HMG, HMG, 69, 68 ; 67: H+8 worse/better than central. Try H-8.
|
|
DB VP4HPG, VP4, 70, 70 ; 68: Accept H+8/H-8 as best. Try V+4.
|
|
DB VP4HP8, VP4, 70, 70 ; 69: Accept central/H-8 as best. Try V+4.
|
|
DB VM4HP4, HP4, 71, 25 ; 70: Accept central/V+4 as best. Try H+4.
|
|
DB HM8, HM8, 73, 72 ; 71: H+4 worse/better than central. Try H-4.
|
|
DB VP2HP8, VP2, 74, 74 ; 72: Accept H+4/H-4 as best. Try V+2.
|
|
DB VP2HP4, VP2, 74, 74 ; 73: Accept central/H-4 as best. Try V+2.
|
|
DB VM2HP2, HP2, 75, 31 ; 74: Accept central/V+2 as best. Try H+2.
|
|
DB HM4, HM4, 77, 76 ; 75: H+2 worse/better than central. Try H-2.
|
|
DB VP1HP4, VP1, 78, 78 ; 76: Accept H+2/H-2 as best. Try V+1.
|
|
DB VP1HP2, VP1, 78, 78 ; 77: Accept central/H-2 as best. Try V+1.
|
|
DB VM1HP1, HP1, 79, 13 ; 78: Accept central/V+1 as best. Try H+1.
|
|
DB HM2, HM2, 81, 80 ; 79: H+1 worse/better than central. Try H-1.
|
|
DB HP2, NOADJ, 0F7H, 0F7H ; 80: Accept H+1/H-1 as best. Done.
|
|
DB HP1, NOADJ, 0F7H, 0F7H ; 81: Accept central/H-1 as best. Done.
|
|
|
|
; Lower edge:
|
|
|
|
DB VP8HP8, HP8, 83, 19 ; 82: Accept central/V-8 as best. Try H+8.
|
|
DB HMG, HMG, 85, 84 ; 83: H+8 worse/better than central. Try H-8.
|
|
DB VM4HPG, VM4, 86, 86 ; 84: Accept H+8/H-8 as best. Try V-4.
|
|
DB VM4HP8, VM4, 86, 86 ; 85: Accept central/H-8 as best. Try V-4.
|
|
DB VP4HP4, HP4, 87, 25 ; 86: Accept central/V-4 as best. Try H+4.
|
|
DB HM8, HM8, 89, 88 ; 87: H+4 worse/better than central. Try H-4.
|
|
DB VM2HP8, VM2, 90, 90 ; 88: Accept H+4/H-4 as best. Try V-2.
|
|
DB VM2HP4, VM2, 90, 90 ; 89: Accept central/H-4 as best. Try V-2.
|
|
DB VP2HP2, HP2, 91, 31 ; 90: Accept central/V-2 as best. Try H+2.
|
|
DB HM4, HM4, 93, 92 ; 91: H+2 worse/better than central. Try H-2.
|
|
DB VM1HP4, VM1, 94, 94 ; 92: Accept H+2/H-2 as best. Try V-1.
|
|
DB VM1HP2, VM1, 94, 94 ; 93: Accept central/H-2 as best. Try V-1.
|
|
DB VP1HP1, HP1, 95, 13 ; 94: Accept central/V-1 as best. Try H+1.
|
|
DB HM2, HM2, 97, 96 ; 95: H+1 worse/better than central. Try H-1.
|
|
DB HP2, NOADJ, 0FBH, 0FBH ; 96: Accept H+1/H-1 as best. Done.
|
|
DB HP1, NOADJ, 0FBH, 0FBH ; 97: Accept central/H-1 as best. Done.
|
|
|
|
; Left edge:
|
|
|
|
; Left edge (states 98..113).
; Byte order per rule: adjust-if-worse, adjust-if-better, next-if-worse,
; next-if-better.  Both vertical directions are probed, but horizontally
; only H+ (away from the edge).  A better H+8, H+4 or H+2 probe continues in
; the interior rules (22, 28, 10); otherwise the search stays in this group.
; Both outcomes of the final H+1 probe end with 0FDH.
DB VMG, VMG, 100, 99 ; 98: V+8 worse/better than central. Try V-8.
|
|
DB VPGHP8, HP8, 101, 101 ; 99: Accept V+8/V-8 as best. Try H+8.
|
|
DB VP8HP8, HP8, 101, 101 ; 100: Accept central/V-8 as best. Try H+8.
|
|
DB VP4HM8, VP4, 102, 22 ; 101: Accept central/H+8 as best. Try V+4.
|
|
DB VM8, VM8, 104, 103 ; 102: V+4 worse/better than central. Try V-4.
|
|
DB VP8HP4, HP4, 105, 105 ; 103: Accept V+4/V-4 as best. Try H+4.
|
|
DB VP4HP4, HP4, 105, 105 ; 104: Accept central/V-4 as best. Try H+4.
|
|
DB VP2HM4, VP2, 106, 28 ; 105: Accept central/H+4 as best. Try V+2.
|
|
DB VM4, VM4, 108, 107 ; 106: V+2 worse/better than central. Try V-2.
|
|
DB VP4HP2, HP2, 109, 109 ; 107: Accept V+2/V-2 as best. Try H+2.
|
|
DB VP2HP2, HP2, 109, 109 ; 108: Accept central/V-2 as best. Try H+2.
|
|
DB VP1HM2, VP1, 110, 10 ; 109: Accept central/H+2 as best. Try V+1.
|
|
DB VM2, VM2, 112, 111 ; 110: V+1 worse/better than central. Try V-1.
|
|
DB VP2HP1, HP1, 113, 113 ; 111: Accept V+1/V-1 as best. Try H+1.
|
|
DB VP1HP1, HP1, 113, 113 ; 112: Accept central/V-1 as best. Try H+1.
|
|
DB HM1, NOADJ, 0FDH, 0FDH ; 113: Accept central/H+1 as best. Done.
|
|
|
|
; Right edge:
|
|
|
|
; Right edge (states 114..129).
; Byte order per rule: adjust-if-worse, adjust-if-better, next-if-worse,
; next-if-better.  Both vertical directions are probed, but horizontally
; only H- (away from the edge).  A better H-8, H-4 or H-2 probe continues in
; the interior rules (22, 28, 10); otherwise the search stays in this group.
; Both outcomes of the final H-1 probe end with 0FEH.
; Unlike the other groups, the first vertical probe here is V-8; rule 114
; then tries V+8.
DB VPG, VPG, 116, 115 ; 114: V-8 worse/better than central. Try V+8.
|
|
DB VMGHM8, HM8, 117, 117 ; 115: Accept V-8/V+8 as best. Try H-8.
|
|
DB VM8HM8, HM8, 117, 117 ; 116: Accept central/V+8 as best. Try H-8.
|
|
DB VP4HP8, VP4, 118, 22 ; 117: Accept central/H-8 as best. Try V+4.
|
|
DB VM8, VM8, 120, 119 ; 118: V+4 worse/better than central. Try V-4.
|
|
DB VP8HM4, HM4, 121, 121 ; 119: Accept V+4/V-4 as best. Try H-4.
|
|
DB VP4HM4, HM4, 121, 121 ; 120: Accept central/V-4 as best. Try H-4.
|
|
DB VP2HP4, VP2, 122, 28 ; 121: Accept central/H-4 as best. Try V+2.
|
|
DB VM4, VM4, 124, 123 ; 122: V+2 worse/better than central. Try V-2.
|
|
DB VP4HM2, HM2, 125, 125 ; 123: Accept V+2/V-2 as best. Try H-2.
|
|
DB VP2HM2, HM2, 125, 125 ; 124: Accept central/V-2 as best. Try H-2.
|
|
DB VP1HP2, VP1, 126, 10 ; 125: Accept central/H-2 as best. Try V+1.
|
|
DB VM2, VM2, 128, 127 ; 126: V+1 worse/better than central. Try V-1.
|
|
DB VP2HM1, HM1, 129, 129 ; 127: Accept V+1/V-1 as best. Try H-1.
|
|
DB VP1HM1, HM1, 129, 129 ; 128: Accept central/V-1 as best. Try H-1.
|
|
DB HP1, NOADJ, 0FEH, 0FEH ; 129: Accept central/H-1 as best. Done.
|
|
|
|
; Exhaustive search, radius 1 here, reaching out to radius 2 further below.
|
|
; . . . . .
|
|
; . 2 5 3 . C = center.
|
|
; . 7 C 8 .
|
|
; . 4 6 1 . # = order to try additional candidates.
|
|
; . . . . .
|
|
|
|
; Exhaustive search, radius 1 (states 130..165).  The eight neighbours of
; the centre C are tried in the order #1..#8:
;        #2 #5 #3
;        #7  C #8
;        #4 #6 #1
; Byte order per rule: adjust-if-worse, adjust-if-better, next-if-worse,
; next-if-better.  Rules 130..137 apply while C is still the best, 138..144
; while #1 is, 145..150 #2, 151..155 #3, 156..159 #4, 160..162 #5,
; 163..164 #6 and 165 #7.
; After #8 has been tried, the search quits if C is still best (0FFH);
; otherwise the adjustment moves to position #1 of the radius-2 group on
; the winner's side and that group's first rule becomes the next state:
;     #8 -> 166    #7 -> 172    #6 -> 178    #5 -> 184
;     #4 -> 190    #1 -> 199    #2 -> 208    #3 -> 217
; Rule 164 therefore enters at 178 (VP2HM1 lands two rows below C, which is
; #1 of the below-centre group), in the same way that rules 162 and 165
; enter at 184 and 172.
FIRST_HEURISTIC_EXHAUSTIVE = 130

DB VM2HM2, VM2HM2, 131, 138 ; 130: #1 worse/better than C. Try #2.
DB HP2, HP2, 132, 145 ; 131: #2 worse/better than C. Try #3.
DB VP2HM2, VP2HM2, 133, 151 ; 132: #3 worse/better than C. Try #4.
DB VM2HP1, VM2HP1, 134, 156 ; 133: #4 worse/better than C. Try #5.
DB VP2, VP2, 135, 160 ; 134: #5 worse/better than C. Try #6.
DB VM1HM1, VM1HM1, 136, 163 ; 135: #6 worse/better than C. Try #7.
DB HP2, HP2, 137, 165 ; 136: #7 worse/better than C. Try #8.
DB HM1, HP1, 0FFH, 166 ; 137: If C best, quit. If 8 best, keep going.
DB HP2, HP2, 139, 145 ; 138: #2 worse/better than #1. Try #3.
DB VP2HM2, VP2HM2, 140, 151 ; 139: #3 worse/better than #1. Try #4.
DB VM2HP1, VM2HP1, 141, 156 ; 140: #4 worse/better than #1. Try #5.
DB VP2, VP2, 142, 160 ; 141: #5 worse/better than #1. Try #6.
DB VM1HM1, VM1HM1, 143, 163 ; 142: #6 worse/better than #1. Try #7.
DB HP2, HP2, 144, 165 ; 143: #7 worse/better than #1. Try #8.
DB HP1, HP1, 199, 166 ; 144: #8 worse/better than #1. Take best, go on.
DB VP2HM2, VP2HM2, 146, 151 ; 145: #3 worse/better than #2. Try #4.
DB VM2HP1, VM2HP1, 147, 156 ; 146: #4 worse/better than #2. Try #5.
DB VP2, VP2, 148, 160 ; 147: #5 worse/better than #2. Try #6.
DB VM1HM1, VM1HM1, 149, 163 ; 148: #6 worse/better than #2. Try #7.
DB HP2, HP2, 150, 165 ; 149: #7 worse/better than #2. Try #8.
DB HM3, HP1, 208, 166 ; 150: #8 worse/better than #2. Take best, go on.
DB VM2HP1, VM2HP1, 152, 156 ; 151: #4 worse/better than #3. Try #5.
DB VP2, VP2, 153, 160 ; 152: #5 worse/better than #3. Try #6.
DB VM1HM1, VM1HM1, 154, 163 ; 153: #6 worse/better than #3. Try #7.
DB HP2, HP2, 155, 165 ; 154: #7 worse/better than #3. Try #8.
DB HP1, HP1, 217, 166 ; 155: #8 worse/better than #3. Take best, go on.
DB VP2, VP2, 157, 160 ; 156: #5 worse/better than #4. Try #6.
DB VM1HM1, VM1HM1, 158, 163 ; 157: #6 worse/better than #4. Try #7.
DB HP2, HP2, 159, 165 ; 158: #7 worse/better than #4. Try #8.
DB HM3, HP1, 190, 166 ; 159: #8 worse/better than #4. Take best, go on.
DB VM1HM1, VM1HM1, 161, 163 ; 160: #6 worse/better than #5. Try #7.
DB HP2, HP2, 162, 165 ; 161: #7 worse/better than #5. Try #8.
DB VM2HM1, HP1, 184, 166 ; 162: #8 worse/better than #5. Take best, go on.
DB HP2, HP2, 164, 165 ; 163: #7 worse/better than #6. Try #8.
DB VP2HM1, HP1, 178, 166 ; 164: #8 worse/better than #6. Take best, go on.
DB HM3, HP1, 172, 166 ; 165: #8 worse/better than #7. Take best, go on.
|
|
|
|
; . . . . . C = center.
|
|
; . ~ ~ ~ 2 ~ = tried, but not as good.
|
|
; . ~ C X 1 X = best so far.
|
|
; . ~ ~ ~ 3 # = order to try additional candidates.
|
|
; . . . . .
|
|
|
|
; Radius-2 follow-up groups (states 166..225), the re-centre rule 226 and
; 6 bytes of padding.
; Byte order in every rule is: adjust-if-worse, adjust-if-better,
; next-if-worse, next-if-better.  The comments in this part are worded
; "better/worse", but the bytes follow the order just given: rule 166 goes
; to 167 (which compares #2 with X) when #1 was worse, and to 169 (which
; compares #2 with #1) when #1 was better.
; The straight groups (166, 172, 178, 184) probe three positions next to the
; radius-1 winner X.  The diagonal groups (190, 199, 208, 217) probe five;
; after their first two probes they continue in the rules of the straight
; group that covers the same row (178..183 below C, 184..189 above C).
; Rules whose next state is 0FFH end the search; their first byte steps
; back to the best position when the last probe lost.
DB VM1, VM1, 167, 169 ; 166: #1 better/worse than X. Try #2.
|
|
DB VP2, VP2, 168, 171 ; 167: #2 better/worse than X. Try #3.
|
|
DB VM1HM1, NOADJ, 0FFH,0FFH ; 168: #3 better/worse than X. Take best, quit.
|
|
DB VP2, VP2, 170, 171 ; 169: #2 better/worse than #1. Try #3.
|
|
DB VM1, NOADJ, 0FFH,0FFH ; 170: #3 better/worse than #1. Take best, quit.
|
|
DB VM2, NOADJ, 0FFH,0FFH ; 171: #3 better/worse than #2. Take best, quit.
|
|
|
|
; . . . . . C = center.
|
|
; 2 ~ ~ ~ . ~ = tried, but not as good.
|
|
; 1 X C ~ . X = best so far.
|
|
; 3 ~ ~ ~ . # = order to try additional candidates.
|
|
; . . . . .
|
|
|
|
DB VM1, VM1, 173, 175 ; 172: #1 better/worse than X. Try #2.
|
|
DB VP2, VP2, 174, 177 ; 173: #2 better/worse than X. Try #3.
|
|
DB VM1HP1, NOADJ, 0FFH,0FFH ; 174: #3 better/worse than X. Take best, quit.
|
|
DB VP2, VP2, 176, 177 ; 175: #2 better/worse than #1. Try #3.
|
|
DB VM1, NOADJ, 0FFH,0FFH ; 176: #3 better/worse than #1. Take best, quit.
|
|
DB VM2, NOADJ, 0FFH,0FFH ; 177: #3 better/worse than #2. Take best, quit.
|
|
|
|
; . . . . . C = center.
|
|
; . ~ ~ ~ . ~ = tried, but not as good.
|
|
; . ~ C ~ . X = best so far.
|
|
; . ~ X ~ . # = order to try additional candidates.
|
|
; . 2 1 3 .
|
|
|
|
DB HM1, HM1, 179, 181 ; 178: #1 better/worse than X. Try #2.
|
|
DB HP2, HP2, 180, 183 ; 179: #2 better/worse than X. Try #3.
|
|
DB VM1HM1, NOADJ, 0FFH,0FFH ; 180: #3 better/worse than X. Take best, quit.
|
|
DB HP2, HP2, 182, 183 ; 181: #2 better/worse than #1. Try #3.
|
|
DB HM1, NOADJ, 0FFH,0FFH ; 182: #3 better/worse than #1. Take best, quit.
|
|
DB HM2, NOADJ, 0FFH,0FFH ; 183: #3 better/worse than #2. Take best, quit.
|
|
|
|
; . 2 1 3 . C = center.
|
|
; . ~ X ~ . ~ = tried, but not as good.
|
|
; . ~ C ~ . X = best so far.
|
|
; . ~ ~ ~ . # = order to try additional candidates.
|
|
; . . . . .
|
|
|
|
DB HM1, HM1, 185, 187 ; 184: #1 better/worse than X. Try #2.
|
|
DB HP2, HP2, 186, 189 ; 185: #2 better/worse than X. Try #3.
|
|
DB VP1HM1, NOADJ, 0FFH,0FFH ; 186: #3 better/worse than X. Take best, quit.
|
|
DB HP2, HP2, 188, 189 ; 187: #2 better/worse than #1. Try #3.
|
|
DB HM1, NOADJ, 0FFH,0FFH ; 188: #3 better/worse than #1. Take best, quit.
|
|
DB HM2, NOADJ, 0FFH,0FFH ; 189: #3 better/worse than #2. Take best, quit.
|
|
|
|
; . . . . . C = center.
|
|
; . ~ ~ ~ . ~ = tried, but not as good.
|
|
; 1 ~ C ~ . X = best so far.
|
|
; 2 X ~ ~ . # = order to try additional candidates.
|
|
; 4 3 5 . .
|
|
|
|
DB VP1, VP1, 191, 195 ; 190: #1 better/worse than X. Try #2.
|
|
DB VP1HP1, VP1HP1, 178, 192 ; 191: #2 better/worse than X. Try #3.
|
|
DB HM1, HM1, 193, 181 ; 192: #3 better/worse than #2. Try #4.
|
|
DB HP2, HP2, 194, 183 ; 193: #4 better/worse than #2. Try #5.
|
|
DB VM1HM2, NOADJ, 0FFH,0FFH ; 194: #5 better/worse than #2. Take best, quit.
|
|
DB VP1HP1, VP1HP1, 196, 192 ; 195: #2 better/worse than #1. Try #3.
|
|
DB HM1, HM1, 197, 181 ; 196: #3 better/worse than #1. Try #4.
|
|
DB HP2, HP2, 198, 183 ; 197: #4 better/worse than #1. Try #5.
|
|
DB VM2HM2, NOADJ, 0FFH,0FFH ; 198: #5 better/worse than #1. Take best, quit.
|
|
|
|
; . . . . . C = center.
|
|
; . ~ ~ ~ . ~ = tried, but not as good.
|
|
; . ~ C ~ 1 X = best so far.
|
|
; . ~ ~ X 2 # = order to try additional candidates.
|
|
; . . 4 3 5
|
|
|
|
DB VP1, VP1, 200, 204 ; 199: #1 better/worse than X. Try #2.
|
|
DB VP1HM1, VP1HM1, 178, 201 ; 200: #2 better/worse than X. Try #3.
|
|
DB HM1, HM1, 202, 181 ; 201: #3 better/worse than #2. Try #4.
|
|
DB HP2, HP2, 203, 183 ; 202: #4 better/worse than #2. Try #5.
|
|
DB VM1, NOADJ, 0FFH,0FFH ; 203: #5 better/worse than #2. Take best, quit.
|
|
DB VP1HM1, VP1HM1, 205, 201 ; 204: #2 better/worse than #1. Try #3.
|
|
DB HM1, HM1, 206, 181 ; 205: #3 better/worse than #1. Try #4.
|
|
DB HP2, HP2, 207, 183 ; 206: #4 better/worse than #1. Try #5.
|
|
DB VM2, NOADJ, 0FFH,0FFH ; 207: #5 better/worse than #1. Take best, quit.
|
|
|
|
; 4 3 5 . . C = center.
|
|
; 2 X ~ ~ . ~ = tried, but not as good.
|
|
; 1 ~ C ~ . X = best so far.
|
|
; . ~ ~ ~ . # = order to try additional candidates.
|
|
; . . . . .
|
|
|
|
DB VM1, VM1, 209, 213 ; 208: #1 better/worse than X. Try #2.
|
|
DB VM1HP1, VM1HP1, 184, 210 ; 209: #2 better/worse than X. Try #3.
|
|
DB HM1, HM1, 211, 187 ; 210: #3 better/worse than #2. Try #4.
|
|
DB HP2, HP2, 212, 189 ; 211: #4 better/worse than #2. Try #5.
|
|
DB VP1HM2, NOADJ, 0FFH,0FFH ; 212: #5 better/worse than #2. Take best, quit.
|
|
DB VM1HP1, VM1HP1, 214, 210 ; 213: #2 better/worse than #1. Try #3.
|
|
DB HM1, HM1, 215, 187 ; 214: #3 better/worse than #1. Try #4.
|
|
DB HP2, HP2, 216, 189 ; 215: #4 better/worse than #1. Try #5.
|
|
DB VP2HM2, NOADJ, 0FFH,0FFH ; 216: #5 better/worse than #1. Take best, quit.
|
|
|
|
; . . 4 3 5 C = center.
|
|
; . ~ ~ X 2 ~ = tried, but not as good.
|
|
; . ~ C ~ 1 X = best so far.
|
|
; . ~ ~ ~ . # = order to try additional candidates.
|
|
; . . . . .
|
|
|
|
DB VM1, VM1, 218, 222 ; 217: #1 better/worse than X. Try #2.
|
|
DB VM1HM1, VM1HM1, 184, 219 ; 218: #2 better/worse than X. Try #3.
|
|
DB HM1, HM1, 220, 187 ; 219: #3 better/worse than #2. Try #4.
|
|
DB HP2, HP2, 221, 189 ; 220: #4 better/worse than #2. Try #5.
|
|
DB VP1, NOADJ, 0FFH,0FFH ; 221: #5 better/worse than #2. Take best, quit.
|
|
DB VM1HM1, VM1HM1, 223, 219 ; 222: #2 better/worse than #1. Try #3.
|
|
DB HM1, HM1, 224, 187 ; 223: #3 better/worse than #1. Try #4.
|
|
DB HP2, HP2, 225, 189 ; 224: #4 better/worse than #1. Try #5.
|
|
DB VP2, NOADJ, 0FFH,0FFH ; 225: #5 better/worse than #1. Take best, quit.
|
|
|
|
FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR = 226
|
|
|
|
; Rule 226: both outcomes step one pel down and right and restart the
; radius-1 search at rule 130.
DB VP1HP1, VP1HP1, 130, 130 ; 226: Redoing ctr, away from limiting edge.
|
|
|
|
; Six unused bytes; they bring the state data to 896 bytes in total.
DB ?, ?, ?, ?, ?, ?
|
|
|
|
; Table of values to add to SWDs for half pel reference macroblocks, to cause
|
|
; those that are off the edge of the frame to produce artificially high SWDs.
|
|
; (64 bytes;3040:3103)
|
|
|
|
; 16 DWORD masks, index 0..15.  Bit k (k = 0..3) of the index controls byte
; k of the mask: the byte is 00H when the bit is set and 0FFH when it is
; clear, so index 0 is all ones and index 15 is zero.
; NOTE(review): which half-pel candidate each byte lane stands for, and how
; the index is formed, is decided by code not visible here.
InvalidateBadHalfPelMVs LABEL DWORD
|
|
|
|
DD 0FFFFFFFFH, 0FFFFFF00H, 0FFFF00FFH, 0FFFF0000H
|
|
DD 0FF00FFFFH, 0FF00FF00H, 0FF0000FFH, 0FF000000H
|
|
DD 000FFFFFFH, 000FFFF00H, 000FF00FFH, 000FF0000H
|
|
DD 00000FFFFH, 00000FF00H, 0000000FFH, 000000000H
|
|
|
|
; Tables (interleaved) to select case from next table (below these) to drive
|
|
; the weighting of the future and past predictions in the construction of
|
|
; B-frame reference blocks.
|
|
; (448 bytes;3104:3551)
|
|
|
|
VertWtSel LABEL BYTE
|
|
DB 0
|
|
HorzWtSel LABEL BYTE
|
|
DB 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 1, 0
|
|
DB 1, 0
|
|
DB 2, 16
|
|
DB 2, 16
|
|
DB 3, 32
|
|
DB 3, 32
|
|
DB 4, 48
|
|
DB 4, 48
|
|
DB 5, 64
|
|
DB 5, 64
|
|
DB 6, 80
|
|
DB 6, 80
|
|
DB 7, 96
|
|
DB 7, 96
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 8, 112
|
|
DB 9, 128
|
|
DB 9, 128
|
|
DB 10, 144
|
|
DB 10, 144
|
|
DB 11, 160
|
|
DB 11, 160
|
|
DB 12, 176
|
|
DB 12, 176
|
|
DB 13, 192
|
|
DB 13, 192
|
|
DB 14, 208
|
|
DB 14, 208
|
|
DB 15, 224
|
|
DB 15, 224
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240 ; Chroma starts here
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240 ; Luma ends here
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 1, 0
|
|
DB 1, 0
|
|
DB 2, 16
|
|
DB 2, 16
|
|
DB 3, 32
|
|
DB 3, 32
|
|
DB 4, 48
|
|
DB 4, 48
|
|
DB 5, 64
|
|
DB 5, 64
|
|
DB 6, 80
|
|
DB 6, 80
|
|
DB 7, 96
|
|
DB 7, 96
|
|
DB 8, 112
|
|
DB 9, 128
|
|
DB 9, 128
|
|
DB 10, 144
|
|
DB 10, 144
|
|
DB 11, 160
|
|
DB 11, 160
|
|
DB 12, 176
|
|
DB 12, 176
|
|
DB 13, 192
|
|
DB 13, 192
|
|
DB 14, 208
|
|
DB 14, 208
|
|
DB 15, 224
|
|
DB 15, 224
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
DB 0, 240
|
|
|
|
; Table indexed by VertWtSel and HorzWtSel to get index of weight to apply to
|
|
; future and past predictions in the construction of B-frame reference blocks
|
|
; for frame differencing.
|
|
; (264 bytes;3552:3815)
|
|
;
|
|
; Indexed by VertWtSel[VMV]+HorzWtSel[HMV]+N to get idx of weight for line N.
|
|
|
|
P8F0 = 0*8
|
|
F1P7 = 1*8
|
|
F2P6 = 2*8
|
|
F3P5 = 3*8
|
|
F4P4 = 4*8
|
|
F5P3 = 5*8
|
|
F6P2 = 6*8
|
|
F7P1 = 7*8
|
|
F8P0 = 8*8
|
|
P1F7 = 9*8
|
|
P2F6 = 10*8
|
|
P3F5 = 11*8
|
|
P4F4 = 12*8
|
|
P5F3 = 13*8
|
|
P6F2 = 14*8
|
|
P7F1 = 15*8
|
|
|
|
Diff_IdxRefWts LABEL BYTE
|
|
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F1P7, F1P7, F1P7, F1P7, F1P7, F1P7, F1P7, F1P7
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F2P6, F2P6, F2P6, F2P6, F2P6, F2P6, F2P6, F2P6
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F3P5, F3P5, F3P5, F3P5, F3P5, F3P5, F3P5, F3P5
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F4P4, F4P4, F4P4, F4P4, F4P4, F4P4, F4P4, F4P4
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F5P3, F5P3, F5P3, F5P3, F5P3, F5P3, F5P3, F5P3
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F6P2, F6P2, F6P2, F6P2, F6P2, F6P2, F6P2, F6P2
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F7P1, F7P1, F7P1, F7P1, F7P1, F7P1, F7P1, F7P1
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB F8P0, F8P0, F8P0, F8P0, F8P0, F8P0, F8P0, F8P0
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P1F7, P1F7, P1F7, P1F7, P1F7, P1F7, P1F7, P1F7
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P2F6, P2F6, P2F6, P2F6, P2F6, P2F6, P2F6, P2F6
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P3F5, P3F5, P3F5, P3F5, P3F5, P3F5, P3F5, P3F5
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P4F4, P4F4, P4F4, P4F4, P4F4, P4F4, P4F4, P4F4
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P5F3, P5F3, P5F3, P5F3, P5F3, P5F3, P5F3, P5F3
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P6F2, P6F2, P6F2, P6F2, P6F2, P6F2, P6F2, P6F2
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P7F1, P7F1, P7F1, P7F1, P7F1, P7F1, P7F1, P7F1
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
|
|
|
|
BFrmSWDState LABEL BYTE ; State engine rules for finding best motion vector.
|
|
; (48 bytes; 3816:3863)
|
|
|
|
; Each state is one 4-byte row; the state number is the byte offset of the
; row from BFrmSWDState (0, 4, 8, ... 44).
;
; 1st number: Horizontal Motion displacement to try, in half pel increments.
|
|
; 2nd number: Vertical Motion displacement to try, in half pel increments.
|
|
; 3rd number: Next state to enter if previous best is still best.
|
|
; 4th number: Next state to enter if this motion is better than previous best.
|
|
|
|
; Displacements are relative to the current best position, which is why the
; "other side" probe after a successful move is +4 (or +2) rather than +2 (+1).
;
; States 0..20: probe at a distance of 2 half pels (one full pel),
; horizontally first and then vertically.
DB -2, 0, 4, 8 ; 0 -- ( 0, 0) Try (-2, 0)
|
|
DB 2, 0, 12, 12 ; 4 -- ( 0, 0) Try ( 2, 0)
|
|
DB 4, 0, 12, 12 ; 8 -- (-2, 0) Try ( 2, 0)
|
|
DB 0, -2, 16, 20 ; 12 -- ( N, 0) Try ( N,-2) (N = {-2,0,2})
|
|
DB 0, 2, 24, 24 ; 16 -- ( N, 0) Try ( N, 2)
|
|
DB 0, 4, 24, 24 ; 20 -- ( N,-2) Try ( N, 2)
|
|
|
|
; States 24..44: the same pattern repeated at a distance of 1 half pel around
; the best position (H,V) found so far.
; NOTE(review): states 40 and 44 name state 0 as successor for both outcomes;
; presumably the consumer treats that as "search finished" -- confirm there.
DB -1, 0, 28, 32 ; 24 -- ( H, V) Try ( H-1, V)
|
|
DB 1, 0, 36, 36 ; 28 -- ( H, V) Try ( H+1, V)
|
|
DB 2, 0, 36, 36 ; 32 -- ( H-1, V) Try ( H+1, V)
|
|
DB 0, -1, 40, 44 ; 36 -- ( H', V) Try ( H', V-1) (H' = {H-1,H,H+1})
|
|
DB 0, 1, 0, 0 ; 40 -- ( H', V) Try ( H', V+1)
|
|
DB 0, 2, 0, 0 ; 44 -- ( H', V-1) Try ( H', V+1)
|
|
|
|
; Table used by Quant RLE to navigate the zigzag order of quantized coeffs.
|
|
; Contents of this table are initialized by first entry to MMxEDTQ. In
|
|
; unlikely event of race condition, it will just get initialized by more
|
|
; than one encoder instance.
|
|
; (128 bytes; 3864:3991)
|
|
|
|
NextZigZagCoeff LABEL BYTE
|
|
|
|
; Statically filled with 0FFH. MMxEDTQ reads NextZigZagCoeff[Q77] on entry
; and runs its initialization loop whenever that byte is non-zero; the loop
; leaves a 0 in that slot, so later entries skip it.
DB 128 DUP (0FFH)
|
|
|
|
; Table used to initialize the above table.
|
|
; (64 bytes: 3992:4055)
|
|
|
|
InitZigZagCoeff LABEL BYTE
|
|
|
|
; 63 coefficient indices in scan order followed by a 0 terminator (64 bytes).
; The initialization loop in MMxEDTQ walks this list and stores each byte at
; NextZigZagCoeff[<previous byte>], starting from index 0, so NextZigZagCoeff
; becomes a successor chain: [0] -> Q01 -> Q10 -> ... -> Q77 -> 0.
; The final 0 both ends the loop and becomes NextZigZagCoeff[Q77].
DB Q01,Q10,Q20,Q11,Q02,Q03,Q12,Q21,Q30,Q40,Q31,Q22,Q13,Q04,Q05,Q14
|
|
DB Q23,Q32,Q41,Q50,Q60,Q51,Q42,Q33,Q24,Q15,Q06,Q07,Q16,Q25,Q34,Q43
|
|
DB Q52,Q61,Q70,Q71,Q62,Q53,Q44,Q35,Q26,Q17,Q27,Q36,Q45,Q54,Q63,Q72
|
|
DB Q73,Q64,Q55,Q46,Q37,Q47,Q56,Q65,Q74,Q75,Q66,Q57,Q67,Q76,Q77, 0
|
|
|
|
; Constants needed by the Quant RLE phase.
|
|
; (128 bytes; 4056:4183)
|
|
|
|
Recip2QP LABEL DWORD
|
|
; One DWORD per quantizer value QP = 0..31, fetched as Recip2QP[QP*4].
; Each DWORD holds the same 16-bit value in both words. For QP >= 1 the
; value is 4000h / QP truncated to an integer, i.e. 1/(2*QP) scaled by 2^15.
; The QP = 0 entry is zero.
WORD 0H, 0H ; QP = 000h
|
|
WORD 04000H, 04000H ; QP = 001h
|
|
WORD 02000H, 02000H ; QP = 002h
|
|
WORD 01555H, 01555H ; QP = 003h
|
|
WORD 01000H, 01000H ; QP = 004h
|
|
WORD 00CCCH, 00CCCH ; QP = 005h
|
|
WORD 00AAAH, 00AAAH ; QP = 006h
|
|
WORD 00924H, 00924H ; QP = 007h
|
|
WORD 00800H, 00800H ; QP = 008h
|
|
WORD 0071CH, 0071CH ; QP = 009h
|
|
WORD 00666H, 00666H ; QP = 00Ah
|
|
WORD 005D1H, 005D1H ; QP = 00Bh
|
|
WORD 00555H, 00555H ; QP = 00Ch
|
|
WORD 004ECH, 004ECH ; QP = 00Dh
|
|
WORD 00492H, 00492H ; QP = 00Eh
|
|
WORD 00444H, 00444H ; QP = 00Fh
|
|
WORD 00400H, 00400H ; QP = 010h
|
|
WORD 003C3H, 003C3H ; QP = 011h
|
|
WORD 0038EH, 0038EH ; QP = 012h
|
|
WORD 0035EH, 0035EH ; QP = 013h
|
|
WORD 00333H, 00333H ; QP = 014h
|
|
WORD 0030CH, 0030CH ; QP = 015h
|
|
WORD 002E8H, 002E8H ; QP = 016h
|
|
WORD 002C8H, 002C8H ; QP = 017h
|
|
WORD 002AAH, 002AAH ; QP = 018h
|
|
WORD 0028FH, 0028FH ; QP = 019h
|
|
WORD 00276H, 00276H ; QP = 01Ah
|
|
WORD 0025EH, 0025EH ; QP = 01Bh
|
|
WORD 00249H, 00249H ; QP = 01Ch
|
|
WORD 00234H, 00234H ; QP = 01Dh
|
|
WORD 00222H, 00222H ; QP = 01Eh
|
|
WORD 00210H, 00210H ; QP = 01Fh
|
|
|
|
; Skip over space to get to where the following tables can go. They will
|
|
; hit the cache at the same point as a portion of the StateEngine states
|
|
; that aren't used in the heuristic ME mode.
|
|
; (2056 bytes; 4184:6239)
|
|
|
|
DB 2056 DUP (?) ; Static space place-holder.
|
|
|
|
; Table to select base address in next table below to use for particular block
|
|
; of macroblock. First column provides address of base element of HorzWtSel
|
|
; to use to map horizontal MV to list of weighting indices to use. ; Second
|
|
; column is similar, but for Vertical MV. Third and fourth columns not used.
|
|
; 6 rows; one for each block in a macroblock.
|
|
; (88 bytes; 6240:6327)
|
|
|
|
; Layout: rows of four DWORDs (16 bytes). Column 0 is addressed through
; LeftRightBlkPosition and column 1 through UpDownBlkPosition, which simply
; labels the second DWORD of the first row. Unused columns hold 0DEADBEEFH.
LeftRightBlkPosition LABEL DWORD
|
|
DD HorzWtSel+0-64
|
|
UpDownBlkPosition LABEL DWORD
|
|
; Rows 1-4: the four combinations of +0 / +32 offsets into the (-64 biased)
; HorzWtSel and VertWtSel tables.
; NOTE(review): presumably the four luma blocks in Y1..Y4 order -- confirm.
DD VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH
|
|
DD HorzWtSel+32-64, VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH
|
|
DD HorzWtSel+0-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH
|
|
DD HorzWtSel+32-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH
|
|
; Rows 5-6: both use the +128 region of the selector tables.
; NOTE(review): presumably the two chroma blocks -- confirm.
; Row 5 supplies only three DWORDs here; its fourth column is occupied by
; the four BlkEmptyFlag bytes below, keeping the row 16 bytes long.
DD HorzWtSel+128, VertWtSel+128, 0DEADBEEFH
|
|
BlkEmptyFlag LABEL BYTE ; sneak this in here
|
|
DB 16, 0, 32, 0
|
|
; Row 6 is truncated to its two used columns (8 bytes), giving the 88-byte
; total stated in the table header.
DD HorzWtSel+128, VertWtSel+128
|
|
|
|
|
|
; The following table, indexed by MBEdgeType&7, returns a mask which is used to
|
|
; zero-out the motion vectors for predictors that are off the edge of the
|
|
; frame. The index is a 3 bit value, each bit being set if the macroblock
|
|
; is NOT on the corresponding edge. 1 == left; 2 == right; 4 == top;
|
|
; The value gotten out is (where A==left; B==above; C==above right):
|
|
; <mask(A) mask(A) mask(C) mask(C) mask(B) mask(B) mask(A) mask(A)>
|
|
; The mask is 0xFF if the corresponding remote block is NOT off the edge, and
|
|
; 0x00 if it is off the edge.
|
|
; (32 bytes: 6328: 6359)
|
|
|
|
ValidRemoteVectors LABEL DWORD
|
|
DWORD 0DEADBEEFH ; 0: Can't be on left and right edges at once.
|
|
DWORD 0FF0000FFH ; 1: Top right corner.
|
|
DWORD 000000000H ; 2: Top left corner.
|
|
DWORD 0FF0000FFH ; 3: Top edge.
|
|
DWORD 0DEADBEEFH ; 4: Can't be on left and right edges at once.
|
|
DWORD 0FF00FFFFH ; 5: Right edge.
|
|
DWORD 000FFFF00H ; 6: Left edge.
|
|
DWORD 0FFFFFFFFH ; 7: Central macroblock.
|
|
|
|
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
|
|
; to be subtracted with saturation from the predicted motion vector for extended
|
|
; motion vector search. Since saturation occurs at 0, the values here are
|
|
; such that the motion vectors are biased to the appropriate point for the
|
|
; clamping effect. The index is a 4 bit value, each bit being set if the
|
|
; macroblock is NOT on the corresponding edge. 1 == left; 2 == right;
|
|
; 4 == top; 8 == bottom. The 8 values being calculated are as follows:
|
|
; ; [ 0: 7] -- HMV lower limit for signature search
|
|
; ; [ 8:15] -- HMV lower limit
|
|
; ; [16:23] -- HMV upper limit for signature search
|
|
; ; [24:31] -- HMV upper limit
|
|
; ; [32:39] -- VMV lower limit for signature search
|
|
; ; [40:47] -- VMV lower limit
|
|
; ; [48:55] -- VMV upper limit for signature search
|
|
; ; [56:63] -- VMV upper limit
|
|
; (88 bytes: 6360:6447)
|
|
|
|
EMV_ClampLowerEnd LABEL DWORD
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
|
|
BYTE 87, 94, 97, 100, ; 5: Bottom right corner.
|
|
87, 94, 97, 100
|
|
BYTE 119, 126, 97, 100, ; 6: Bottom left corner.
|
|
87, 94, 97, 100
|
|
BYTE 87, 94, 97, 100, ; 7: Bottom edge.
|
|
87, 94, 97, 100
|
|
DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
|
|
BYTE 87, 94, 97, 100, ; 9: Top right corner.
|
|
119, 126, 97, 100
|
|
BYTE 119, 126, 97, 100, ; 10: Top left corner.
|
|
119, 126, 97, 100
|
|
BYTE 87, 94, 97, 100, ; 11: Top edge.
|
|
119, 126, 97, 100
|
|
DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
|
|
BYTE 87, 94, 97, 100, ; 13: Right edge.
|
|
87, 94, 97, 100
|
|
BYTE 119, 126, 97, 100, ; 14: Left edge.
|
|
87, 94, 97, 100
|
|
BYTE 87, 94, 97, 100, ; 15: Central macroblock.
|
|
87, 94, 97, 100
|
|
|
|
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
|
|
; to be added with saturation to the result of the application of the preced-
|
|
; ing table, to clamp the upper limit on the motion vector search parameters.
|
|
; Since saturation occurs at 255, the values here are such that the motion
|
|
; vectors are biased to the appropriate point for the clamping effect.
|
|
; (88 bytes: 6448:6535)
|
|
|
|
EMV_ClampUpperEnd LABEL DWORD
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
|
|
BYTE 184, 193, 216, 225, ; 5: Bottom right corner.
|
|
184, 193, 216, 225
|
|
BYTE 216, 225, 184, 193, ; 6: Bottom left corner.
|
|
184, 193, 216, 225
|
|
BYTE 184, 193, 184, 193, ; 7: Bottom edge.
|
|
184, 193, 216, 225
|
|
DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
|
|
BYTE 184, 193, 216, 225, ; 9: Top right corner.
|
|
216, 225, 184, 193
|
|
BYTE 216, 225, 184, 193, ; 10: Top left corner.
|
|
216, 225, 184, 193
|
|
BYTE 184, 193, 184, 193, ; 11: Top edge.
|
|
216, 225, 184, 193
|
|
DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
|
|
BYTE 184, 193, 216, 225, ; 13: Right edge.
|
|
184, 193, 184, 193
|
|
BYTE 216, 225, 184, 193, ; 14: Left edge.
|
|
184, 193, 184, 193
|
|
BYTE 184, 193, 184, 193, ; 15: Central macroblock.
|
|
184, 193, 184, 193
|
|
|
|
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
|
|
; to be added without saturation to the result of the application of the
|
|
; preceding table, to return the motion vector search parameters to the
|
|
; proper range for subsequent use.
|
|
; (88 bytes: 6536:6623)
|
|
|
|
EMV_RestoreRange LABEL DWORD
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
|
|
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
|
|
BYTE 120, 255, 88, 225, ; 5: Bottom right corner.
|
|
120, 255, 88, 225
|
|
BYTE 120, 255, 56, 193, ; 6: Bottom left corner.
|
|
120, 255, 88, 225
|
|
BYTE 120, 255, 56, 193, ; 7: Bottom edge.
|
|
120, 255, 88, 225
|
|
DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
|
|
BYTE 120, 255, 88, 225, ; 9: Top right corner.
|
|
120, 255, 56, 193
|
|
BYTE 120, 255, 56, 193, ; 10: Top left corner.
|
|
120, 255, 56, 193
|
|
BYTE 120, 255, 56, 193, ; 11: Top edge.
|
|
120, 255, 56, 193
|
|
DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
|
|
BYTE 120, 255, 88, 225, ; 13: Right edge.
|
|
120, 255, 56, 193
|
|
BYTE 120, 255, 56, 193, ; 14: Left edge.
|
|
120, 255, 56, 193
|
|
BYTE 120, 255, 56, 193, ; 15: Central macroblock.
|
|
120, 255, 56, 193
|
|
|
|
; Tables indexed by indices fetched from Diff_IdxRefWts. These tables return
|
|
; a multiplier to apply to past or future predictions to construct the
|
|
; B-frame candidate reference blocks.
|
|
; (128 bytes;6624:6751)
|
|
|
|
FutureWt_FF_or_00 LABEL DWORD
|
|
|
|
DD 000000000H, 000000000H
|
|
DD 000000000H, 0FF000000H
|
|
DD 000000000H, 0FFFF0000H
|
|
DD 000000000H, 0FFFFFF00H
|
|
DD 000000000H, 0FFFFFFFFH
|
|
DD 0FF000000H, 0FFFFFFFFH
|
|
DD 0FFFF0000H, 0FFFFFFFFH
|
|
DD 0FFFFFF00H, 0FFFFFFFFH
|
|
DD 0FFFFFFFFH, 0FFFFFFFFH
|
|
DD 0FFFFFFFFH, 000FFFFFFH
|
|
DD 0FFFFFFFFH, 00000FFFFH
|
|
DD 0FFFFFFFFH, 0000000FFH
|
|
DD 0FFFFFFFFH, 000000000H
|
|
DD 000FFFFFFH, 000000000H
|
|
DD 00000FFFFH, 000000000H
|
|
DD 0000000FFH, 000000000H
|
|
|
|
MMXMEDATA ENDS
|
|
|
|
;=============================================================================
|
|
|
|
.CODE EDTQ
|
|
|
|
ASSUME cs : FLAT
|
|
ASSUME ds : FLAT
|
|
ASSUME es : FLAT
|
|
ASSUME fs : FLAT
|
|
ASSUME gs : FLAT
|
|
ASSUME ss : FLAT
|
|
|
|
EXTERN MMxDoForwardDCT:NEAR
|
|
EXTERN MMxDoForwardDCTx:NEAR
|
|
EXTERN MMxDoForwardDCTy:NEAR
|
|
IFDEF H261
|
|
ELSE
|
|
EXTERN MMxDoBFrameLumaBlocks:NEAR
|
|
EXTERN MMxDoBFrameChromaBlocks:NEAR
|
|
ENDIF
|
|
|
|
MMxEDTQ proc C AMBAS: DWORD,
|
|
ATarg: DWORD,
|
|
APrev: DWORD,
|
|
ABTarg: DWORD,
|
|
AWtFwd: DWORD,
|
|
AWtBwd: DWORD,
|
|
AFrmWd: DWORD,
|
|
ADoHalf: DWORD,
|
|
ADoBlk: DWORD,
|
|
ADoSF: DWORD,
|
|
ADoAP: DWORD,
|
|
ADoB: DWORD,
|
|
ADoLuma: DWORD,
|
|
ADoExtMV:DWORD,
|
|
AQP: DWORD,
|
|
ABQP: DWORD,
|
|
AB0VecT: DWORD,
|
|
ASpaFilT:DWORD,
|
|
ASpaFilD:DWORD,
|
|
ASWDTot: DWORD,
|
|
ABSWDTot:DWORD,
|
|
ACodStr: DWORD,
|
|
ABCodStr:DWORD
|
|
|
|
LocalFrameSize = 1536 ; Space needed for locals
|
|
|
|
RegStoSize = 16
|
|
|
|
; Arguments:
|
|
|
|
MBlockActionStream_arg = RegStoSize + 4
|
|
TargetFrameBaseAddress_arg = RegStoSize + 8
|
|
PreviousFrameBaseAddress_arg = RegStoSize + 12
|
|
BTargetFrameBaseAddress_arg = RegStoSize + 16
|
|
SignatureBaseAddress_arg = RegStoSize + 20
|
|
WeightForwardMotion_arg = RegStoSize + 24
|
|
WeightBackwardMotion_arg = RegStoSize + 28
|
|
FrameWidth = RegStoSize + 32
|
|
DoHalfPelEstimation_arg = RegStoSize + 36
|
|
DoBlockLevelVectors_arg = RegStoSize + 40
|
|
DoSpatialFiltering_arg = RegStoSize + 44
|
|
DoAdvancedPrediction_arg = RegStoSize + 48
|
|
DoBFrame_arg = RegStoSize + 52
|
|
DoLumaBlocksInThisPass_arg = RegStoSize + 56
|
|
DoExtendedMotionVectors_arg = RegStoSize + 60
|
|
QuantizationLevel = RegStoSize + 64
|
|
BQuantizationLevel = RegStoSize + 68
|
|
BFrmZeroVectorThreshold_arg = RegStoSize + 72
|
|
SpatialFiltThreshold_arg = RegStoSize + 76
|
|
SpatialFiltDifferential_arg = RegStoSize + 80
|
|
PSWDTotal = RegStoSize + 84
|
|
PBSWDTotal = RegStoSize + 88
|
|
CodeStreamCursor_arg = RegStoSize + 92
|
|
BCodeStreamCursor_arg = RegStoSize + 96
|
|
EndOfArgList = RegStoSize + 100
|
|
|
|
StackOffset TEXTEQU <0>
|
|
CONST_384 TEXTEQU <384>
|
|
|
|
push esi
|
|
push edi
|
|
push ebp
|
|
push ebx
|
|
|
|
; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
|
|
|
|
mov esi,esp
|
|
and esp,0FFFFF000H
|
|
sub esp,000000FE0H
|
|
IFDEF H261
|
|
|
|
mov ebp,PITCH
|
|
|
|
CONST_384 TEXTEQU <ebp>
|
|
|
|
mov eax,[esi+SpatialFiltThreshold_arg]
|
|
mov ebx,[esi+SpatialFiltDifferential_arg]
|
|
mov SpatialFiltThreshold,eax
|
|
mov SpatialFiltDifferential,ebx
|
|
mov ecx,[esi+TargetFrameBaseAddress_arg]
|
|
mov ebx,[esi+SignatureBaseAddress_arg]
|
|
sub ecx,ebx
|
|
mov eax,[esi+TargetFrameBaseAddress_arg]
|
|
mov SigToTarget,ecx
|
|
add ecx,PITCH*80+64
|
|
neg ecx
|
|
mov TargetToSig_Debiased,ecx
|
|
mov ebx,[esi+PreviousFrameBaseAddress_arg]
|
|
mov PreviousFrameBaseAddress,ebx
|
|
mov TargetFrameBaseAddress,eax
|
|
sub ebx,eax
|
|
mov ecx,[esi+QuantizationLevel]
|
|
mov TargToRef,ebx
|
|
mov eax,[esi+CodeStreamCursor_arg]
|
|
mov ebx,ecx
|
|
mov CodeStreamCursor,eax
|
|
shl ebx,16
|
|
xor edx,edx
|
|
or ebx,ecx
|
|
mov ecx,Recip2QP[ecx*4]
|
|
mov QPDiv2,ebx
|
|
mov Recip2QPToUse,ecx
|
|
mov eax,[esi+DoSpatialFiltering_arg]
|
|
mov DoExtendedMotionVectors,edx
|
|
test eax,eax
|
|
je @f
|
|
mov eax,3
|
|
@@:
|
|
mov DoSpatialFiltering,al
|
|
mov SWDTotal,edx
|
|
mov BestMBHalfPelMV,edx
|
|
mov ebx,PreviousFrameBaseAddress
|
|
mov BlockAbove[0],edx
|
|
sub ebx,16
|
|
mov edx,[esi+FrameWidth]
|
|
mov SpatiallyFilteredMB,ebx
|
|
imul edx,-SIZEOF T_MacroBlockActionDescr/16
|
|
add edx,2*SIZEOF T_Blk
|
|
mov eax,14 ; 14 if restricted MVs and doing heuristic ME.
|
|
mov BlockAbove[4],edx
|
|
mov DoHeuristicME,eax
|
|
|
|
ELSE
|
|
|
|
mov eax,[esi+DoExtendedMotionVectors_arg]
|
|
test eax,eax
|
|
je @f
|
|
mov eax,7
|
|
@@:
|
|
mov DoExtendedMotionVectors,eax
|
|
; Copy BFrmZeroVectorThreshold, then build the local forward and backward
; weight tables from the caller's arrays. The copy loop below is executed
; twice: once for the forward weights and once (re-entered via the trailing
; "jne @b") for the backward weights.
;
; Per pass, 128 source bytes are read from [edi] as two 64-byte halves.
; Every byte is masked to its low 6 bits (AND 3Fh) and XORed with 60h.
; With edx = <local table>+128:
;   source bytes  0..63  -> <local table>+192 .. +255   ([edx+ecx+64])
;   source bytes 64..127 -> <local table>+0   .. +63    ([edx+ecx-128])
; NOTE(review): the reason for the mask/XOR encoding is not visible in this
; section (the original author marked it "???") -- confirm against the code
; that consumes WeightForwardMotion / WeightBackwardMotion.
mov eax,[esi+BFrmZeroVectorThreshold_arg]
|
|
mov edi,[esi+WeightForwardMotion_arg]
|
|
mov BFrmZeroVectorThreshold,eax
|
|
mov ecx,60 ; Byte offset of last DWORD in each 64-byte half.
|
|
mov ebx,060606060H ; XOR pattern; stays live across both passes.
|
|
lea edx,WeightForwardMotion+128
|
|
@@:
|
|
mov eax,[edi+ecx]
|
|
and eax,03F3F3F3FH ; ??? (keeps low 6 bits of each byte)
|
|
mov ebp,[edi+ecx+64] ; ebp is used as scratch inside the loop.
|
|
and ebp,03F3F3F3FH ; ??? (keeps low 6 bits of each byte)
|
|
xor eax,ebx
|
|
xor ebp,ebx
|
|
mov [edx+ecx+64],eax
|
|
mov [edx+ecx-128],ebp
|
|
sub ecx,4 ; Sets the flags tested by jge below.
|
|
mov ebp,PITCH ; Restore ebp = PITCH; mov leaves the flags intact.
|
|
jge @b
|
|
|
|
|
|
; Set up for the backward-weight pass. eax gets the destination base just
; used; after edx is repointed, eax-edx is non-zero only if that base was
; the forward table, so the loop is re-entered exactly once.
mov edi,[esi+WeightBackwardMotion_arg]
|
|
mov eax,edx
|
|
lea edx,WeightBackwardMotion+128
|
|
mov ecx,60
|
|
sub eax,edx
|
|
jne @b
|
|
|
|
CONST_384 TEXTEQU <ebp>
|
|
|
|
mov ebx,[esi+PreviousFrameBaseAddress_arg]
|
|
mov eax,[esi+TargetFrameBaseAddress_arg]
|
|
mov PreviousFrameBaseAddress,ebx
|
|
mov TargetFrameBaseAddress,eax
|
|
mov ecx,[esi+BTargetFrameBaseAddress_arg]
|
|
sub ebx,eax
|
|
mov TargToRef,ebx
|
|
sub eax,ecx
|
|
mov BFrameBaseAddress,ecx
|
|
mov BFrameToFuture,eax
|
|
mov ecx,[esi+TargetFrameBaseAddress_arg]
|
|
mov ebx,[esi+SignatureBaseAddress_arg]
|
|
sub ecx,ebx
|
|
mov edx,[esi+FrameWidth]
|
|
mov SigToTarget,ecx
|
|
add ecx,PITCH*80+64
|
|
neg ecx
|
|
imul edx,-SIZEOF T_MacroBlockActionDescr/16
|
|
mov TargetToSig_Debiased,ecx
|
|
mov ecx,[esi+DoBFrame_arg]
|
|
add edx,2*SIZEOF T_Blk
|
|
xor cl,1
|
|
mov BlockAbove[4],edx
|
|
mov IsPlainPFrame,cl
|
|
mov ecx,[esi+QuantizationLevel]
|
|
mov eax,[esi+CodeStreamCursor_arg]
|
|
mov ebx,ecx
|
|
mov CodeStreamCursor,eax
|
|
mov eax,[esi+BCodeStreamCursor_arg]
|
|
mov BCodeStreamCursor,eax
|
|
shl ebx,16
|
|
mov eax,[esi+DoHalfPelEstimation_arg]
|
|
or ebx,ecx
|
|
mov ecx,Recip2QP[ecx*4]
|
|
mov QPDiv2,ebx
|
|
mov Recip2QPToUse,ecx
|
|
mov ecx,[esi+BQuantizationLevel]
|
|
xor edx,edx
|
|
mov ebx,ecx
|
|
shl ebx,16
|
|
mov BestMBHalfPelMV,edx
|
|
or ebx,ecx
|
|
mov ecx,Recip2QP[ecx*4]
|
|
mov BQPDiv2,ebx
|
|
mov BRecip2QPToUse,ecx
|
|
test eax,eax
|
|
je @f
|
|
mov eax,-4
|
|
@@:
|
|
mov DoHalfPelME,eax
|
|
mov eax,[esi+DoBlockLevelVectors_arg]
|
|
mov DoBlockLevelVectors,al
|
|
mov eax,[esi+DoAdvancedPrediction_arg]
|
|
mov DoAdvancedPrediction,al
|
|
mov SWDTotal,edx
|
|
test eax,eax
|
|
lea eax,[eax+14] ; 14 if restricted MVs and doing heuristic ME.
|
|
je @f
|
|
xor eax,eax ; 0 if unrestricted MVs and doing heuristic ME.
|
|
@@:
|
|
mov DoHeuristicME,eax
|
|
mov BSWDTotal,edx
|
|
mov PendingOBMC,edx
|
|
mov BlockAbove[0],edx
|
|
ENDIF
|
|
mov eax,01E98E268H
|
|
mov EMVLimitsForThisMB,eax
|
|
; ; [ 0: 7] -- HMV lower limit for sig search (biased 128)
|
|
; ; [ 8:15] -- HMV lower limit (signed)
|
|
; ; [16:23] -- HMV upper limit for sig search (biased 128)
|
|
; ; [24:31] -- HMV upper limit (signed)
|
|
mov EMVLimitsForThisMB+4,eax ; Same as for HMV.
|
|
mov edx,[esi+MBlockActionStream_arg]
|
|
; One-time initialization of NextZigZagCoeff. The table is statically filled
; with 0FFH; once built, its [Q77] slot holds 0, so a zero there means the
; table is already initialized and the loop is skipped.
mov al,NextZigZagCoeff[Q77]
|
|
test al,al
|
|
je ZigZagCoeffInitialized
|
|
|
|
|
|
xor ecx,ecx ; ecx = index of previous coefficient; starts at 0.
|
|
lea ebx,InitZigZagCoeff ; ebx walks the scan-order list.
|
|
xor eax,eax ; Clear upper bytes so eax == al below.
|
|
|
|
|
|
@@:
|
|
|
|
|
|
mov al,[ebx] ; al = next coefficient index in scan order.
|
|
inc ebx
|
|
mov NextZigZagCoeff[ecx],al ; successor of previous coefficient = al.
|
|
mov ecx,eax
|
|
test eax,eax ; The list ends with a 0 byte.
|
|
jne @b
|
|
|
|
|
|
ZigZagCoeffInitialized:
|
|
|
|
mov StashESP,esi
|
|
mov eax,[esi+DoLumaBlocksInThisPass_arg]
|
|
test eax,eax
|
|
jne FirstMacroBlock ; Jump if doing luma plane
|
|
|
|
jmp FirstMacroBlock_ChromaProcessing
|
|
|
|
IntraCodedChromaProcessingDone:
|
|
|
|
IFDEF H261
|
|
ELSE
|
|
mov al,IsPlainPFrame
|
|
test al,al
|
|
jne NextMacroBlock_ChromaProcessing
|
|
|
|
mov eax,QPDiv2
|
|
mov ebx,BQPDiv2
|
|
|
|
call MMxDoBFrameChromaBlocks
|
|
ENDIF
|
|
|
|
NextMacroBlock_ChromaProcessing:
|
|
|
|
mov bl,[edx].CodedBlocks
|
|
sub edx,-SIZEOF T_MacroBlockActionDescr
|
|
and bl,040H ; Check for end-of-stream
|
|
jne TrulyDone
|
|
|
|
FirstMacroBlock_ChromaProcessing:
|
|
|
|
mov al,[edx].BlockType ; Chroma handling. Intra? Or Inter?
|
|
mov ecx,TargetFrameBaseAddress
|
|
cmp al,INTRA
|
|
jne ChromaIsInterCoded
|
|
|
|
mov esi,[edx].BlkU.BlkOffset
|
|
mov StashBlockType,al
|
|
add esi,ecx
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
|
|
call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH
|
|
|
|
shl bl,4
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
mov esi,[edx].BlkV.BlkOffset
|
|
mov [edx].CodedBlocks,al
|
|
mov ecx,TargetFrameBaseAddress
|
|
add esi,ecx
|
|
|
|
call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH
|
|
|
|
shl bl,5
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
pop ecx ; Adjust stack pointer
|
|
StackOffset TEXTEQU <0>
|
|
mov [edx].CodedBlocks,al
|
|
jmp IntraCodedChromaProcessingDone
|
|
|
|
ChromaIsInterCoded:
|
|
|
|
mov edi,[edx].BlkU.BlkOffset ; Get address of next macroblock to do.
|
|
mov ebx,[edx].BlkU.MVs
|
|
add edi,ecx
|
|
mov esi,[edx].BlkU.PastRef
|
|
mov StashBlockType,al
|
|
IFDEF H261
|
|
mov ecx,2+256*1 ; cl==2 tells SpatialLoopFilter code to do one
|
|
; ; block. ch==1 causes it to return to here.
|
|
mov TargetMacroBlockBaseAddr,edi ; Store address of U block.
|
|
cmp al,INTERSLF
|
|
je DoSpatialFilterForChroma
|
|
|
|
ReturnFromSpatialFilterForU:
|
|
|
|
ENDIF
|
|
|
|
call DoNonOBMCDifferencing
|
|
|
|
; (Finish differencing the last four lines.)
|
|
movq mm4,[edi+ebp*4] ; T4
|
|
psrlq mm1,1
|
|
movq mm5,[edi+PITCH*5]
|
|
psubb mm4,mm0 ; D4 = T4 - P4
|
|
movq mm0,[edi+PITCH*6]
|
|
psubb mm5,mm1
|
|
movq mm1,[edi+PITCH*7]
|
|
pand mm2,mm6
|
|
pand mm3,mm6
|
|
psrlq mm2,1
|
|
movq PelDiffsLine4,mm4 ; Store D4.
|
|
psubb mm0,mm2
|
|
movq PelDiffsLine5,mm5
|
|
psrlq mm3,1
|
|
movq PelDiffsLine6,mm0
|
|
psubb mm1,mm3
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
|
|
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
shl bl,4
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
mov ecx,TargetFrameBaseAddress
|
|
mov [edx].CodedBlocks,al
|
|
pop edi ; Adjust stack pointer
|
|
StackOffset TEXTEQU <0>
|
|
mov edi,[edx].BlkV.BlkOffset ; Get address of next macroblock to do.
|
|
mov ebx,[edx].BlkV.MVs
|
|
add edi,ecx
|
|
mov esi,[edx].BlkV.PastRef
|
|
IFDEF H261
|
|
; H.261 only: if the macroblock type is INTERSLF, divert through the spatial
; loop filter before differencing the V block.
mov ecx,2-256*1 ; cl==2 tells SpatialLoopFilter code to do one
|
|
; ; block. ch==-1 causes it to return to here.
|
|
mov TargetMacroBlockBaseAddr,edi ; Store address of V block.
|
|
mov al,[edx].BlockType
|
|
cmp al,INTERSLF
|
|
je DoSpatialFilterForChroma
|
|
|
|
|
|
ReturnFromSpatialFilterForV:
|
|
|
|
|
|
ENDIF
|
|
|
|
call DoNonOBMCDifferencing
|
|
|
|
; (Finish differencing the last four lines.)
|
|
movq mm4,[edi+ebp*4] ; T4
|
|
psrlq mm1,1
|
|
movq mm5,[edi+PITCH*5]
|
|
psubb mm4,mm0 ; D4 = T4 - P4
|
|
movq mm0,[edi+PITCH*6]
|
|
psubb mm5,mm1
|
|
movq mm1,[edi+PITCH*7]
|
|
pand mm2,mm6
|
|
pand mm3,mm6
|
|
psrlq mm2,1
|
|
movq PelDiffsLine4,mm4 ; Store D4.
|
|
psubb mm0,mm2
|
|
movq PelDiffsLine5,mm5
|
|
psrlq mm3,1
|
|
movq PelDiffsLine6,mm0
|
|
psubb mm1,mm3
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
|
|
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
shl bl,5
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
pop ecx ; Adjust stack pointer
|
|
StackOffset TEXTEQU <0>
|
|
mov [edx].CodedBlocks,al
|
|
jmp IntraCodedChromaProcessingDone
|
|
|
|
;============================================================================
|
|
; Here we copy the target macroblock, and interpolate left, right, and both.
|
|
; We also accumulate the target pels for each block. Result is four partial
|
|
; sums in four packed words. After summing them all up, the final sum will
|
|
; be the sum of the 64 pels of each block, divided by 2.
|
|
|
|
NextMacroBlock:
|
|
|
|
mov bl,[edx].CodedBlocks
|
|
sub edx,-SIZEOF T_MacroBlockActionDescr
|
|
and bl,040H ; Check for end-of-stream
|
|
jne Done
|
|
|
|
FirstMacroBlock:
|
|
|
|
mov edi,TargetFrameBaseAddress
|
|
mov esi,[edx].BlkY1.BlkOffset ; Get address of next macroblock to do.
|
|
add edi,esi
|
|
mov esi,TargToRef
|
|
add esi,edi
|
|
mov TargetMacroBlockBaseAddr,edi
|
|
mov Addr0MVRef,esi
|
|
|
|
;============================================================================
|
|
; We calculate the 0-motion SWD. We use 32 match points per block, and
|
|
; write the result separately for each block. If the SWD for the 0-motion
|
|
; vector is below a threshold, we don't bother searching for other possibly
|
|
; better motion vectors.
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Address of ref block.
|
|
; edi -- Address of target block.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Not used. Will be linearized MV in non-zero MV search.
|
|
; ebx -- CurrSWDState, i.e. FirstMEState, times 8
|
|
; eax -- Scratch
|
|
; mm7 -- Best SWD for macroblock.
|
|
; mm0-mm6 Scratch
|
|
;
|
|
|
|
mov cl,[edx].CodedBlocks ; Init CBP for macroblock.
|
|
or cl,03FH ; Indicate all 6 blocks are coded.
|
|
mov eax,DoHeuristicME ; 0 if unrestricted MVs and heur ME.
|
|
; ; 14 if restricted MVs and heur ME.
|
|
; ; 15 if suppressing heuristic ME.
|
|
mov [edx].CodedBlocks,cl
|
|
js IntraByDecree
|
|
|
|
xor ebx,ebx ; Avoid partial register stall.
|
|
xor ecx,ecx
|
|
mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
|
|
pcmpeqd mm7,mm7 ; Init previous best SWD to huge.
|
|
mov bl,[edx].FirstMEState ; Test for INTRA-BY-DECREE.
|
|
sub eax,ecx ; Negative iff should do heuristic ME
|
|
; ; for this macroblock.
|
|
test bl,bl
|
|
je IntraByDecree
|
|
|
|
sar eax,31
|
|
psrlq mm7,2
|
|
or ebx,eax ; -1 if doing heuristic ME.
|
|
mov al,INTER1MV ; Speculate INTER, 1 motion vector.
|
|
mov [edx].BlockType,al
|
|
psrld mm7,14 ; mm7[32:63]: Previous best SWD = 0x0000FFFF.
|
|
; ; mm7[ 0:31]: Prev SWD that we diminish = 0x0003FFFF.
|
|
; ; Since we can't diminish it below 0x00020000, we
|
|
; ; won't take the short circuit exit from MblkEstQWA.
|
|
|
|
; At this point:
|
|
; ebp -- PITCH
|
|
; esi -- Address of upper left block of 0,0 ref area.
|
|
; edi -- Address of upper left block of target.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Scratch
|
|
; ebx -- CurrSWDState, i.e. FirstMEState.
|
|
; eax -- Scratch
|
|
; mm7 -- Previous best SWD initialized to huge (0xFFFF, 0x3FFFF).
|
|
; mm0-mm6 -- Scratch
|
|
|
|
;============================================================================
|
|
; Compute SWD for macroblock.
|
|
|
|
ComputeMBSWD:
|
|
|
|
; Registers at this point:
|
|
; ebp -- PITCH
|
|
; esi -- Address of upper left block of candidate ref area.
|
|
; edi -- Address of upper left block of target.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Scratch
|
|
; ebx -- CurrSWDState
|
|
; eax -- Scratch
|
|
; mm7 -- Previous best SWD.
|
|
; mm0-mm6 -- Scratch
|
|
;
|
|
|
|
lea ecx,[ebp+ebp*4] ; Get PITCH*5
|
|
lea eax,[ebp+ebp*2] ; Get PITCH*3
|
|
movq mm0,[esi+PITCH*15] ; FL A: Ref MB, lower left block, line 15.
|
|
psubw mm0,[edi+PITCH*15] ; FL B: Diff for lower left block, line 15.
|
|
movq mm6,[esi+PITCH*15+8] ; FR A
|
|
psllw mm0,8 ; FL C: Extract diffs for line 15 even pels.
|
|
psubw mm6,[edi+PITCH*15+8] ; FR B
|
|
pmaddwd mm0,mm0 ; FL D: Square of diffs for even pels.
|
|
movq mm1,[esi+PITCH*9] ; 9L A
|
|
psllw mm6,8 ; FR C
|
|
psubw mm1,[edi+PITCH*9] ; 9L B
|
|
pmaddwd mm6,mm6 ; FR D
|
|
movq mm5,[esi+PITCH*9+8] ; 9R A
|
|
psllw mm1,8 ; 9L C
|
|
psubw mm5,[edi+PITCH*9+8] ; 9R B
|
|
pmaddwd mm1,mm1 ; 9L D
|
|
movq mm2,[esi+eax*4] ; CL a
|
|
psllw mm5,8 ; 9R C
|
|
psubw mm2,[edi+eax*4] ; CL b
|
|
pmaddwd mm5,mm5 ; 9R D
|
|
movq mm3,[esi+eax*4+8] ; CR a
|
|
pmaddwd mm2,mm2 ; CL c: Square of diffs for odd pels.
|
|
psubw mm3,[edi+eax*4+8] ; CR b
|
|
paddusw mm0,mm1 ; LL + Accumulate SWD for lower left block.
|
|
movq mm1,[esi+eax*1] ; 3L A
|
|
pmaddwd mm3,mm3 ; CR c
|
|
psubw mm1,[edi+eax*1] ; 3L B
|
|
paddusw mm6,mm5 ; LR +
|
|
movq mm5,[esi+eax*1+8] ; 3R A
|
|
psllw mm1,8 ; 3L C
|
|
psubw mm5,[edi+eax*1+8] ; 3R B
|
|
paddusw mm0,mm2 ; LL +
|
|
movq mm2,[esi] ; 0L a
|
|
pmaddwd mm1,mm1 ; 3L D
|
|
psubw mm2,[edi] ; 0L b
|
|
paddusw mm6,mm3 ; LR +
|
|
movq mm3,[esi+8] ; 0R a
|
|
psllw mm5,8 ; 3R C
|
|
psubw mm3,[edi+8] ; 0R b
|
|
pmaddwd mm5,mm5 ; 3R D
|
|
movq mm4,[esi+eax*2] ; 6L a
|
|
pmaddwd mm2,mm2 ; 0L c
|
|
psubw mm4,[edi+eax*2] ; 6L b
|
|
pmaddwd mm3,mm3 ; 0R c
|
|
movq PartSWDForLLBlk,mm0 ; Stash SWD for lines 9,12,15, LL blk.
|
|
paddusw mm0,mm6 ; Sum SWD for lines 9,12,15 LL and LR.
|
|
movq PartSWDForLRBlk,mm6 ; Stash SWD for lines 9,12,15, LR blk.
|
|
pmaddwd mm4,mm4 ; 6L c
|
|
movq mm6,[esi+eax*2+8] ; 6R a
|
|
paddusw mm1,mm2 ; UL +
|
|
psubw mm6,[edi+eax*2+8] ; 6R b
|
|
paddusw mm5,mm3 ; UR +
|
|
movq mm2,[esi+ebp*1] ; 1L A
|
|
pmaddwd mm6,mm6 ; 6R c
|
|
psubw mm2,[edi+ebp*1] ; 1L B
|
|
paddusw mm1,mm4 ; UL +
|
|
movq mm3,[esi+ecx*1] ; 5L A
|
|
paddusw mm0,mm1 ; Sum partial SWD for LL, LR, and UL.
|
|
psubw mm3,[edi+ecx*1] ; 5L B
|
|
paddusw mm5,mm6 ; UR +
|
|
movq mm6,[esi+ebp*4] ; 4L a
|
|
paddusw mm0,mm5 ; Sum partial SWD for all blocks.
|
|
movq PartSWDForURBlk,mm5 ; Stash SWD for lines 0,3,6, UR blk.
|
|
punpckldq mm5,mm0 ; Get low sum into high bits.
|
|
psubw mm6,[edi+ebp*4] ; 4L b
|
|
paddusw mm5,mm0 ; Total up SWD for every third line.
|
|
movq mm0,[esi+ebp*2] ; 2L a
|
|
psrlq mm5,47 ; Position, and double.
|
|
psubw mm0,[edi+ebp*2] ; 2L b
|
|
pcmpgtd mm5,mm7 ; Is 2 * SWD for 6 lines > prev SWD?
|
|
pmaddwd mm0,mm0 ; 2L c
|
|
psllw mm2,8 ; 1L C
|
|
movdf eax,mm5
|
|
pmaddwd mm2,mm2 ; 1L D
|
|
test eax,eax
|
|
jne MblkEst_EarlyOut
|
|
|
|
lea eax,[ecx+ebp*2] ; PITCH*7
|
|
psllw mm3,8 ; 5L C
|
|
paddusw mm1,mm2 ; UL +
|
|
pmaddwd mm3,mm3 ; 5L D
|
|
movq mm5,[esi+eax*1] ; 7L A
|
|
psubw mm5,[edi+eax*1] ; 7L B
|
|
pmaddwd mm6,mm6 ; 4L c
|
|
movq mm2,[esi+PITCH*11+8] ; BR A
|
|
psllw mm5,8 ; 7L C
|
|
psubw mm2,[edi+PITCH*11+8] ; BR B
|
|
paddusw mm1,mm3 ; UL +
|
|
movq mm3,[esi+PITCH*13+8] ; DR A
|
|
paddusw mm1,mm0 ; UL +
|
|
psubw mm3,[edi+PITCH*13+8] ; DR B
|
|
pmaddwd mm5,mm5 ; 7L D
|
|
movq mm0,[esi+ebp*8+8] ; 8R a
|
|
paddusw mm1,mm6 ; UL +
|
|
psubw mm0,[edi+ebp*8+8] ; 8R b
|
|
psllw mm2,8 ; BR C
|
|
movq mm4,[esi+ecx*2+8] ; AR a
|
|
paddusw mm1,mm5 ; UL +
|
|
psubw mm4,[edi+ecx*2+8] ; AR b
|
|
punpckldq mm6,mm1 ; Get low SWD accum to hi order of mm6.
|
|
movq mm5,[esi+eax*2+8] ; ER a
|
|
paddusw mm6,mm1 ; mm6[48:63] is SWD for upper left blk.
|
|
psubw mm5,[edi+eax*2+8] ; ER b
|
|
psrlq mm6,48 ; mm6 is SWD for upper left block.
|
|
psubusw mm7,mm6 ; Diminish prev best SWD by cand UL blk.
|
|
pmaddwd mm2,mm2 ; BR D
|
|
pmaddwd mm0,mm0 ; 8R c
|
|
psllw mm3,8 ; DR C
|
|
movq mm1,[esi+ebp*1+8] ; 1R A
|
|
pmaddwd mm3,mm3 ; DR D
|
|
paddusw mm2,PartSWDForLRBlk ; LR +
|
|
pmaddwd mm4,mm4 ; AR c
|
|
psubw mm1,[edi+ebp*1+8] ; 1R B
|
|
paddusw mm2,mm0 ; LR +
|
|
movq mm0,[esi+ecx*1+8] ; 5R A
|
|
pmaddwd mm5,mm5 ; ER c
|
|
psubw mm0,[edi+ecx*1+8] ; 5R B
|
|
paddusw mm2,mm3 ; LR +
|
|
movq mm3,[esi+eax*1+8] ; 7R A
|
|
paddusw mm2,mm4 ; LR +
|
|
paddusw mm2,mm5 ; LR +
|
|
psllw mm1,8 ; 1R C
|
|
psubw mm3,[edi+eax*1+8] ; 7R B
|
|
punpckldq mm5,mm2 ; Get low SWD accum to hi order of mm5.
|
|
paddusw mm5,mm2 ; mm5[48:63] is SWD for lower right blk.
|
|
pmaddwd mm1,mm1 ; 1R D
|
|
movq mm2,[esi+ebp*2+8] ; 2R a
|
|
psrlq mm5,48 ; mm5 is SWD for lower right block.
|
|
psubusw mm7,mm5 ; Diminish prev best SWD by cand LR blk.
|
|
punpckldq mm6,mm5 ; mm6[0:31] UL SWD; mm6[32:63] LR SWD.
|
|
psubw mm2,[edi+ebp*2+8] ; 2R b
|
|
psllw mm0,8 ; 5R C
|
|
movq mm5,[esi+ebp*4+8] ; 4R a
|
|
pmaddwd mm0,mm0 ; 5R D
|
|
psubw mm5,[edi+ebp*4+8] ; 4R b
|
|
psllw mm3,8 ; 7R C
|
|
paddusw mm1,PartSWDForURBlk ; UR +
|
|
pmaddwd mm3,mm3 ; 7R D
|
|
paddusw mm1,mm0 ; UR +
|
|
pmaddwd mm2,mm2 ; 2R c
|
|
movq mm0,[esi+PITCH*11] ; BL A
|
|
pmaddwd mm5,mm5 ; 4R c
|
|
psubw mm0,[edi+PITCH*11] ; BL B
|
|
paddusw mm1,mm3 ; UR +
|
|
movq mm3,[esi+ecx*2] ; AL a
|
|
paddusw mm1,mm2 ; UR +
|
|
psubw mm3,[edi+ecx*2] ; AL b
|
|
paddusw mm1,mm5 ; UR +
|
|
pmaddwd mm3,mm3 ; AL c
|
|
psllw mm0,8 ; BL C
|
|
movq mm2,[esi+PITCH*13] ; DL A
|
|
pmaddwd mm0,mm0 ; BL D
|
|
psubw mm2,[edi+PITCH*13] ; DL B
|
|
punpckldq mm5,mm1 ; Get low SWD accum to hi order of mm5.
|
|
movq mm4,[esi+ebp*8] ; 8L a
|
|
paddusw mm5,mm1 ; mm5[48:63] is SWD for upper right blk.
|
|
psubw mm4,[edi+ebp*8] ; 8L b
|
|
psllw mm2,8 ; DL C
|
|
movq mm1,[esi+eax*2] ; EL a
|
|
pmaddwd mm2,mm2 ; DL D
|
|
psubw mm1,[edi+eax*2] ; EL b
|
|
pmaddwd mm4,mm4 ; 8L c
|
|
paddusw mm3,PartSWDForLLBlk ; LL +
|
|
pmaddwd mm1,mm1 ; EL c
|
|
paddusw mm3,mm0 ; LL +
|
|
psrlq mm5,48 ; mm5 is SWD for upper right block.
|
|
paddusw mm3,mm2 ; LL +
|
|
psubusw mm7,mm5 ; Diminish prev best SWD by cand UR blk.
|
|
paddusw mm3,mm4 ; LL +
|
|
movq mm0,mm7
|
|
paddusw mm3,mm1 ; LL +
|
|
psrlq mm7,32 ; Get original Best SWD
|
|
punpckldq mm1,mm3
|
|
pxor mm2,mm2
|
|
paddusw mm1,mm3
|
|
psrlq mm1,48
|
|
punpckldq mm5,mm1 ; mm5[32:63] SWD for LL. mm5[0:31] SWD for UR.
|
|
psubusw mm0,mm1
|
|
psubusw mm7,mm0 ; BestSWD dim (BestSWD dim CandSWD) --> new best.
|
|
pcmpeqd mm2,mm0 ; [0:31] == 0 iff cand better, else -1.
|
|
|
|
; Registers at this point:
|
|
; ebp -- PITCH
|
|
; edi -- Target MacroBlock Base Address.
|
|
; esi -- Address of upper left block of candidate ref area.
|
|
; edx -- MBlockActionStream
|
|
; ebx -- CurrSWDState
|
|
; mm7 -- New best SWD for macroblock.
|
|
; mm6 -- [0:31] SWD for upper left; [32:63] SWD for lower right.
|
|
; mm5 -- [0:31] SWD for upper right; [32:63] SWD for lower left.
|
|
; mm2 -- [0:31] 0 if cand better, else -1.
|
|
|
|
cmp ebx,LASTINITIALMESTATE ; Did we just do zero motion vector?
|
|
jg MEForNonZeroMVDone
|
|
|
|
movdf eax,mm7 ; SWD for this candidate.
|
|
punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
|
|
test ebx,ebx
|
|
jns ZeroMVDoneForNonHeuristicME
|
|
|
|
HeuristicME_EarlyOut:
|
|
|
|
movq mm0,EMVLimitsForThisMB ; Speculate no extended motion vectors.
|
|
pcmpeqb mm1,mm1 ; <FFFF FFFF FFFF FFFF>
|
|
xor ecx,ecx
|
|
cmp bl,-3
|
|
mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
|
|
jle HeuristicME_CaseSigMVDone_or_CaseAboveMVDone
|
|
|
|
sub eax,NONZEROMVDIFFERENTIAL
|
|
inc bl ; ZF set iff state byte was -1 (zero-MV candidate just done).
|
|
mov ebx,DoExtendedMotionVectors ; 7 iff doing extended MVs, else 0.
|
|
jne HeuristicME_CaseLeftMVDone ; Uses flags of INC above; MOV does not alter them.
|
|
|
|
HeuristicME_Case0MVDone:
|
|
|
|
movq SWDULandLR,mm6
|
|
pcmpeqb mm4,mm4 ; <FFFF FFFF FFFF FFFF>
|
|
movq SWDURandLL,mm5
|
|
psllw mm4,15 ; <8000 8000 8000 8000>
|
|
cmp eax,ZEROVECTORTHRESHOLD-NONZEROMVDIFFERENTIAL
|
|
; ; Compare 0-MV against ZeroVectorThreshold.
|
|
jl BelowZeroThresh ; Jump if 0-MV is good enough.
|
|
|
|
mov SWDForNon0MVToBeat,eax
|
|
and ebx,ecx ; Elim flag for bottom row. 0 iff no ExtMV.
|
|
mov eax,BlockAbove[4]
|
|
je NotExtendedMVs ; Jump if not doing extended MVs?
|
|
|
|
; Below: A==left; B==above; C==above rt.
|
|
movdt mm3,ValidRemoteVectors[ebx*4] ; <mask(A) (C) (B) (A)>
|
|
movq mm2,mm4 ; <8000 8000 8000 8000>
|
|
|
|
IF SIZEOF T_MacroBlockActionDescr-128
|
|
**** error: Due to assembler weakness, can't use spaces here, so SIZEOF
|
|
**** T_MacroBlockActionDescr is replaced by constant. If assembly error
|
|
**** occurs, the constant has been changed, and the three instructions in
|
|
**** the next 10 lines have to change.
|
|
ENDIF
|
|
IF SIZEOF T_Blk-16
|
|
**** error: Due to assembler weakness, can't use spaces here, so SIZEOF T_Blk
|
|
**** is replaced by constant. If assembly error occurs, the constant has been
|
|
**** changed, and the three instructions in the next 10 lines have to change.
|
|
ENDIF
|
|
movdt mm0,[edx-128].BestFullPelMBMVs ; <x x Av,h x >
|
|
punpcklbw mm3,mm3 ; mask for both MV parts
|
|
movdt mm1,[edx+eax-2*16+128].BestFullPelMBMVs ; <x x Cv,h x >
|
|
psrlw mm2,8 ; <0080 0080 0080 0080>
|
|
por mm4,mm2 ; <8080 ...> bias value.
|
|
punpcklwd mm1,mm0 ; <Av,h Cv,h x x >
|
|
punpcklwd mm0,[edx+eax-2*16].BestFullPelMBMVs ; <Bv,h Av,h x x >
|
|
;
|
|
punpckhdq mm0,mm1 ; <Av,h Cv,h Bv,h Av,h>
|
|
;
|
|
pand mm0,mm3 ; Set to 0 any off edge.
|
|
and ebx,4 ; If zero, we're on the top edge.
|
|
paddb mm0,mm4 ; <Av,h Cv,h Bv,h Av,h> biased
|
|
je @f ; If on top edge, cause LEFT to be taken.
|
|
movq mm1,mm0 ; <Av,h Cv,h Bv,h Av,h>
|
|
psrlq mm0,16 ; <x Av,h Cv,h Bv,h>
|
|
psubusb mm0,mm1 ; <x floor(A-C) floor(C-B) floor(B-A)>
|
|
;
|
|
paddb mm0,mm1 ; <x max(A,C) max(C,B) max(B,A)>
|
|
;
|
|
movq mm1,mm0 ; <x max(A,C) max(C,B) max(B,A)>
|
|
psrlq mm0,16 ; <x x max(A,C) max(C,B)>
|
|
pxor mm1,mm0 ; Part of median calc.
|
|
psrlq mm0,16 ; <x x x max(A,C)>
|
|
pxor mm0,mm1 ; <x x x median(A,B,C)> biased by +128.
|
|
;
|
|
|
|
@@:
|
|
|
|
punpcklbw mm0,mm0 ; 2 copies of median predictor MVs.
|
|
pcmpeqb mm1,mm1
|
|
punpcklwd mm0,mm0 ; 4 copies. Will now calc the following:
|
|
; ; [ 0: 7] -- HMV lower limit for sig search
|
|
; ; [ 8:15] -- HMV lower limit
|
|
; ; [16:23] -- HMV upper limit for sig search
|
|
; ; [24:31] -- HMV upper limit
|
|
; ; [32:39] -- VMV lower limit for sig search
|
|
; ; [40:47] -- VMV lower limit
|
|
; ; [48:55] -- VMV upper limit for sig search
|
|
; ; [56:63] -- VMV upper limit
|
|
;
|
|
psubusb mm0,EMV_ClampLowerEnd[ecx*8-40]
|
|
psllw mm1,3 ; <FF F8 FF F8 FF F8 FF F8> i.e. Mask to
|
|
; ; set sig srch range to mult of 8.
|
|
paddusb mm0,EMV_ClampUpperEnd[ecx*8-40]
|
|
|
|
psubb mm0,EMV_RestoreRange[ecx*8-40]
|
|
|
|
NotExtendedMVs:
|
|
|
|
movq SWD0MVURandLL,mm5
|
|
; With the <FF F8 ...> mask built above, this sets the sig search limits at
; multiples of 8 half pels, i.e. four full pels. When NotExtendedMVs is
; reached by the jump, mm1 is still all ones, so the limits pass unchanged.
pand mm0,mm1 ; Set sig search at multiples of four (full pels).
|
|
movq SWD0MVULandLR,mm6
|
|
pcmpeqb mm2,mm2 ; Set cand as worse than 0MV, in case skip.
|
|
movq EMVLimitsForThisMB,mm0
|
|
and cl,1
|
|
je HeuristicME_SkipLeftMV
|
|
|
|
mov BestOfFourStartingPoints,esi
|
|
mov ebx,-2 ; Indicate trying MV of MB to left.
|
|
movsx ecx,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBVMV
|
|
movsx eax,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBHMV
|
|
|
|
; ClampHeurMECandidateToRange -- clamp a heuristic candidate MV, then try it.
;
; On entry:
;   ecx -- Candidate VMV (sign extended).
;   eax -- Candidate HMV (sign extended).
;   ebx -- State code identifying which candidate is being tried.
;
; Each component is clamped to the [lower,upper] range read from
; EMVLimitsForThisMB (bytes 5 and 7 for VMV, bytes 1 and 3 for HMV). The
; result is turned into a linear offset from Addr0MVRef as HMV/2 + VMV*3*64,
; which is why PITCH must be 384. If that offset is zero, control leaves via
; MblkEst_EarlyOut with eax == -1 (the "candidate not better" index);
; presumably this is because the zero MV was already evaluated. Otherwise esi
; is set to the candidate reference address and ComputeMBSWD is entered.
ClampHeurMECandidateToRange:
|
|
|
|
movsx esi,PB EMVLimitsForThisMB+5 ; VMV lower limit.
|
|
cmp ecx,esi
|
|
jl ClampVMV_1
|
|
|
|
movsx esi,PB EMVLimitsForThisMB+7 ; VMV upper limit.
|
|
cmp ecx,esi
|
|
jle @f
|
|
|
|
ClampVMV_1:
|
|
|
|
mov ecx,esi
|
|
|
|
@@:
|
|
|
|
movsx esi,PB EMVLimitsForThisMB+1 ; HMV lower limit.
|
|
cmp eax,esi
|
|
jl ClampHMV_1
|
|
|
|
movsx esi,PB EMVLimitsForThisMB+3 ; HMV upper limit.
|
|
cmp eax,esi
|
|
jle @f
|
|
|
|
ClampHMV_1:
|
|
|
|
mov eax,esi
|
|
|
|
@@:
|
|
|
|
sar eax,1
|
|
lea ecx,[ecx+ecx*2]
|
|
IF PITCH-384
|
|
*** error: The magic here assumes a pitch of 384.
|
|
ENDIF
|
|
shl ecx,6
|
|
mov esi,Addr0MVRef
|
|
add eax,ecx ; Clamped Linearized Motion Vector
|
|
;
|
|
sub eax,1
|
|
jc MblkEst_EarlyOut ; Jump if Lin MV is zero.
|
|
|
|
lea esi,[esi+eax+1] ; Candidate reference address.
|
|
jmp ComputeMBSWD
|
|
|
|
HeuristicME_SkipLeftMV:
|
|
|
|
mov BestOfFourStartingPoints,esi
|
|
mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
|
|
|
|
HeuristicME_CaseLeftMVDone:
|
|
|
|
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
|
|
mov ebx,BlockAbove[4]
|
|
and cl,4
|
|
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
|
|
punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
|
|
movq SWDURandLL[eax*8],mm5
|
|
pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip.
|
|
mov BestOfFourStartingPoints[eax*4],esi
|
|
je HeuristicME_SkipAboveMV
|
|
|
|
movsx ecx,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBVMV
|
|
movsx eax,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBHMV
|
|
mov ebx,-3 ; Indicate trying MV of MB above.
|
|
jmp ClampHeurMECandidateToRange
|
|
|
|
HeuristicME_CaseSigMVDone_or_CaseAboveMVDone:
|
|
HeuristicME_SkipAboveMV:
|
|
|
|
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
|
|
jne HeuristicME_CaseSigMVDone
|
|
|
|
HeuristicME_CaseAboveMVDone:
|
|
|
|
mov cl,4
|
|
lea ebx,C0001000100010001
|
|
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
|
|
pxor mm0,mm0
|
|
movq SWDURandLL[eax*8],mm5
|
|
pxor mm1,mm1
|
|
mov BestOfFourStartingPoints[eax*4],esi
|
|
lea esi,TargetSigContribForRowPairs
|
|
movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
|
|
pcmpeqb mm7,mm7 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
|
|
|
|
; ebp -- Pitch
|
|
; edi -- Address of target macroblock.
|
|
; esi -- Address at which to store target macroblock's signature contributions.
|
|
; cl -- Loop counter.
|
|
; mm0 -- Accumulator for target MB's sig contrib for first four even columns.
|
|
; mm1 -- Accumulator for target MB's sig contrib for last four even columns.
|
|
|
|
movq mm2,[edi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
|
|
pcmpeqb mm5,mm5 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
|
|
paddb mm2,[edi+ebp*1] ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
|
|
psrlw mm5,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
|
|
|
|
@@:
|
|
|
|
movq mm3,[edi+ebp*2] ; B:<P27 P26 P25 P24 P23 P22 P21 P20>
|
|
movq mm4,mm2 ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
|
|
paddb mm3,[edi+PITCH*3] ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>
|
|
psrlw mm2,8 ; W:<P07+P17 P05+P15 P03+P13 P01+P11>
|
|
pmaddwd mm2,[ebx] ; D:<P07+P17+P05+P15 P03+P13+P01+P11>
|
|
movq mm7,mm5 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
|
|
pand mm5,mm3 ; W:<P26+P36 P24+P34 P22+P32 P20+P30>
|
|
psrlw mm3,8 ; W:<P27+P37 P25+P35 P23+P33 P21+P31>
|
|
pmaddwd mm3,[ebx] ; D:<P27+P37+P25+P35 P23+P33+P21+P31>
|
|
paddw mm0,mm5 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>
|
|
movq mm5,[edi+ebp*2+8] ; B:<P2F P2E P2D P2C P2B P2A P29 P28>
|
|
pand mm4,mm7 ; W:<P06+P16 P04+P14 P02+P12 P00+P10>
|
|
paddb mm5,[edi+PITCH*3+8] ; B:<P2F+P3F P2E+P3E P2D+P3D P2C+P3C ...>
|
|
paddw mm0,mm4 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>
|
|
movq mm4,[edi+8] ; B:<P0F P0E P0D P0C P0B P0A P09 P08>
|
|
movq mm6,mm7 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
|
|
paddb mm4,[edi+ebp*1+8] ; B:<P0F+P1F P0E+P1E P0D+P1D P0C+P1C ...>
|
|
pand mm7,mm5 ; W:<P2E+P3E P2C+P3C P2A+P3A P28+P38>
|
|
pand mm6,mm4 ; W:<P0E+P1E P0C+P1C P0A+P1A P08+P18>
|
|
psrlw mm5,8 ; W:<P2F+P3F P2D+P3D P2B+P3B P29+P39>
|
|
pmaddwd mm5,[ebx] ; D:<P2F+P3F+P2D+P3D P2B+P3B+P29+P39>
|
|
psrlw mm4,8 ; W:<P0F+P1F P0D+P1D P0B+P1B P09+P19>
|
|
pmaddwd mm4,[ebx] ; D:<P0F+P1F+P0D+P1D P0B+P1B+P09+P19>
|
|
paddw mm1,mm7 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>
|
|
paddw mm1,mm6 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>
|
|
lea edi,[edi+ebp*4] ; Advance input cursor
|
|
paddw mm3,mm5 ; D:<P2F+P3F+P2D+P3D+P27+P37+P25+P35
|
|
; ; P2B+P3B+P29+P39+P23+P33+P21+P31>
|
|
pcmpeqb mm5,mm5 ; Next W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
|
|
paddw mm4,mm2 ; D:<P0F+P1F+P0D+P1D+P07+P17+P05+P15
|
|
; ; P0B+P1B+P09+P19+P03+P13+P01+P11>
|
|
punpckldq mm7,mm3 ; D:<P2B+P3B+P29+P39+P23+P33+P21+P31 junk>
|
|
paddw mm7,mm3 ; [32:47]:<sum of odd pels of lines 2 and 3>
|
|
punpckldq mm6,mm4 ; D:<P0B+P1B+P09+P19+P03+P13+P01+P11 junk>
|
|
movq mm2,[edi] ; Next B:<P07 P06 P05 P04 P03 P02 P01 P00>
|
|
paddw mm6,mm4 ; [32:47]:<sum of odd pels of lines 0 and 1>
|
|
paddb mm2,[edi+ebp*1] ; Next B:<P07+P17 P06+P16 P05+P15 ...>
|
|
punpckhwd mm6,mm7 ; [0:15] Line_0&1_odd; [16:31] Line_2&3_odd.
|
|
mov MBlockActionStream,edx
|
|
dec cl
|
|
movdf [esi],mm6 ; Save W:<Line_0&1_odd Line_2&3_odd>
|
|
psrlw mm5,8 ; Next W:<0x00FF 0x00FF 0x00FF 0x00FF>
|
|
lea esi,[esi+4] ; Advance output cursor
|
|
jne @b
|
|
|
|
; ebp -- Pitch
|
|
; edi -- Address of candidate reference MB's signature contribs.
|
|
; esi -- Address at which target MB's signature contribs were stored, plus 16.
|
|
; edx -- Scratch.
|
|
; ecx -- Count down number of lines of signatures to try.
|
|
; ebx -- Increment to get from end of one line of signatures to start of next.
|
|
; al -- Count down number of signatures to try in a line.
|
|
; ah -- Reinits counter of signatures to try in a line.
|
|
; mm0 -- Target MB's sig contrib for first four even columns.
|
|
; mm1 -- Target MB's sig contrib for last four even columns.
|
|
; mm2 -- Target MB's sig contrib for first four pairs of rows, odd columns.
|
|
; mm3 -- Amount and address of best signature seen so far.
|
|
|
|
IF PITCH-384
|
|
*** error: The magic here assumes a pitch of 384.
|
|
ENDIF
|
|
xor eax,eax
|
|
mov ecx,TargetToSig_Debiased
|
|
mov al,EMVLimitsForThisMB+4 ; Lower vert lim for sig srch (half pels)
|
|
xor ebx,ebx
|
|
add edi,ecx
|
|
mov bl,EMVLimitsForThisMB+0 ; Lower horz lim for sig srch (half pels)
|
|
shr ebx,1
|
|
lea ecx,[eax+eax*2]
|
|
shl ecx,6
|
|
add edi,ebx
|
|
add edi,ecx
|
|
xor ecx,ecx
|
|
add ebx,ebx
|
|
mov cl,EMVLimitsForThisMB+6 ; Upper vert lim for sig srch (half pels)
|
|
sub ecx,eax
|
|
mov al,EMVLimitsForThisMB+2 ; Upper horz lim for sig srch (half pels)
|
|
shr ecx,3 ; Number of lines of sigs to do, minus 1.
|
|
sub eax,ebx
|
|
shr eax,3 ; Number of columns of sigs to do.
|
|
; The 080000000H bias is a flag bit only. It is shifted out by the *4 scaling
; when edx (masked copy of ebx) is used in the LEA that advances edi, yet it
; lets the later SAR EDX,31 produce -1 when a line of signatures is finished.
lea ebx,[ebp-1+080000000H]
|
|
sub ebx,eax ; 1/4th amt to add to move to next line.
|
|
mov ah,al
|
|
inc ah ; To reinit cntr for line.
|
|
movq mm2,[esi-16]
|
|
pcmpeqd mm3,mm3 ; Set winning signature artificially high.
|
|
movdt mm4,[edi]
|
|
psrld mm3,2
|
|
punpckldq mm4,[edi+4] ; ref sig contribs of left even cols.
|
|
|
|
TryNextSignature:
|
|
|
|
movdt mm5,[edi+8]
|
|
psubw mm4,mm0 ; diffs for sums of left even columns.
|
|
punpckldq mm5,[edi+12] ; ref sig contribs of right even cols.
|
|
pmaddwd mm4,mm4 ; Squared differences.
|
|
movdt mm6,[edi+ebp*2] ; Sums for first two pairs of rows.
|
|
psubw mm5,mm1 ; diffs for sums of right even columns.
|
|
punpckldq mm6,[edi+PITCH*6] ; Sums for second two pairs of rows.
|
|
pmaddwd mm5,mm5 ; Squared differences.
|
|
movdt mm7,[edi+PITCH*10] ; Sums for third two pairs of rows.
|
|
psubw mm6,mm2 ; Words: diffs for sums of first 4 pairs rows.
|
|
punpckldq mm7,[edi+PITCH*14] ; Sums for last two pairs of rows.
|
|
pmaddwd mm6,mm6 ; Squared differences.
|
|
psubw mm7,[esi-8] ; Words: diffs for sums of last 4 pairs rows.
|
|
paddd mm4,mm5 ; Accumulate squared differences.
|
|
sub al,1 ; Decrement count of signatures left in this line.
|
|
pmaddwd mm7,mm7 ; Squared differences.
|
|
sbb edx,edx ; -1 if done with line, else 0.
|
|
paddd mm6,mm4 ; Accumulate squared differences.
|
|
and edx,ebx ; 1/4 Amt to add to goto next line, else 0.
|
|
paddd mm7,mm6 ; Accumulate squared differences.
|
|
movdt mm5,edi ; Address of this signature
|
|
punpckldq mm6,mm7 ; <low_order_accumulator junk>
|
|
paddd mm7,mm6 ; <full_signature_amt junk>
|
|
psllq mm5,32 ; <Addr_of_this_signature 0>
|
|
lea edi,[edi+edx*4+4] ; advance signature position to next cand.
|
|
punpckhdq mm5,mm7 ; <cand_signature_amt cand_signature_addr>
|
|
sar edx,31 ; -1 if done with line, else 0.
|
|
pcmpgtd mm7,mm3 ; <0xFFFFFFFF if cand not better junk>
|
|
movdt mm4,[edi]
|
|
punpckhdq mm7,mm7 ; <0xFFFFFFFFFFFFFFFF if cand not better>
|
|
punpckldq mm4,[edi+4]
|
|
pand mm3,mm7 ; 1st_best if cand not better, else 0.
|
|
and dl,ah ; Num cols in a line if done with line, else 0.
|
|
pandn mm7,mm5 ; cand if better than 1st_best, else 0.
|
|
add al,dl ; Reinit col count if finishing with line.
|
|
por mm3,mm7 ; Better of cand and 1st_best.
|
|
sbb ecx,0 ; Decrement line count if just finished line.
|
|
jge TryNextSignature
|
|
|
|
movdf ecx,mm3 ; Fetch address of best signature.
|
|
pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip.
|
|
mov edi,TargetMacroBlockBaseAddr
|
|
mov ebx,-4 ; Indicate trying MV of best signature.
|
|
sub ecx,edi
|
|
mov eax,SigToTarget
|
|
movdt mm7,BestMBFullPelSWD ; Reload SWD for best full pel MB MV.
|
|
lea esi,[ecx+eax] ; Linearized motion vector
|
|
add eax,ecx ; Linearized motion vector
|
|
sar esi,8 ; Full pel vert lin offset div 256.
|
|
mov edx,MBlockActionStream ; Reload pointer to MBA descriptor.
|
|
shl eax,25
|
|
punpckldq mm7,mm7
|
|
movsx ecx,UnlinearizedVertMV[esi] ; Get full pel vert MV component.
|
|
sar eax,24 ; Full pel HMV.
|
|
jmp ClampHeurMECandidateToRange
|
|
|
|
HeuristicME_CaseSigMVDone:
|
|
HeuristicME_SkipSigMV:
|
|
|
|
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
|
|
pcmpeqd mm0,mm0 ; Init previous best SWD to huge.
|
|
mov ecx,Addr0MVRef ; Start to calc linearized MV.
|
|
mov bh,EMVLimitsForThisMB+1 ; HMV lower limit.
|
|
mov BestOfFourStartingPoints[eax*4],esi
|
|
add bh,4
|
|
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
|
|
psrlq mm0,2
|
|
movq SWDURandLL[eax*8],mm5
|
|
psrld mm0,14
|
|
mov eax,BestOfFourStartingPoints
|
|
mov bl,EMVLimitsForThisMB+5 ; VMV lower limit.
|
|
mov esi,eax
|
|
sub eax,ecx ; Linearized motion vector
|
|
mov ecx,eax ; Linearized motion vector
|
|
add al,al ; Full pel HMV.
|
|
cmp al,bh
|
|
jl ClampHMV_2
|
|
|
|
mov bh,EMVLimitsForThisMB+3 ; HMV upper limit
|
|
sub bh,4
|
|
cmp al,bh
|
|
jle NoClampHMV_2
|
|
|
|
; ClampHMV_2 -- the HMV of the best starting point is outside the range
; [HMV lower limit + 4, HMV upper limit - 4], so restart the search from a
; clamped centre.
;
; On entry:
;   ecx -- Linearized MV of the best of the four starting points.
;   bh  -- Clamped HMV in half pels: lower limit + 4 if reached by the JL,
;          upper limit - 4 if reached by falling through.
;   bl  -- VMV lower limit (half pels); 4 is added here.
;   mm0 -- "Huge" SWD pair used to reinitialize mm7.
;
; On exit (to ComputeMBSWD):
;   esi -- Addr0MVRef + clamped linearized MV.
;   ebx -- FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR.
;   mm7 -- Reinitialized from mm0.
;
; The HMV limit is a signed byte (it is compared with signed JL/JLE and read
; with MOVSX elsewhere), so it must be sign extended before the arithmetic
; shift. Zero extending it turned a negative limit such as -26 into 230,
; giving a horizontal offset of +115 instead of -13 full pels.

ClampHMV_2:

        sar     ecx,8                   ; Full pel vert lin offset div 256.
        add     bl,4                    ; VMV lower limit + 4.
        movsx   eax,bh                  ; Clamped HMV (signed, half pels).
        movsx   ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
        cmp     cl,bl
        jl      @f                      ; VMV below range: clamp to lower.

        mov     bl,EMVLimitsForThisMB+7 ; VMV upper limit.
        movq    mm7,mm0                 ; Reinit best SWD to huge.
        sub     bl,4                    ; VMV upper limit - 4.
        cmp     cl,bl
        jle     NoClampVMV_2            ; VMV in range: keep it.

@@:

        movsx   ecx,bl                  ; Clamped VMV (signed, half pels).
        movq    mm7,mm0                 ; Reinit best SWD to huge.

NoClampVMV_2:

        sar     eax,1                   ; HMV in full pels.
        lea     ecx,[ecx+ecx*2]
        shl     ecx,6                   ; VMV*3*64 == (VMV/2)*384.
        mov     ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
        mov     esi,Addr0MVRef
        add     eax,ecx                 ; Linearized motion vector.
        add     esi,eax                 ; Address of new centre reference.
        jmp     ComputeMBSWD
|
|
|
|
; NoClampHMV_2 -- the HMV of the best starting point is within range; check
; the VMV against [VMV lower limit + 4, VMV upper limit - 4].
;
; On entry:
;   ecx -- Linearized MV of the best of the four starting points.
;   al  -- HMV of that starting point in half pels (signed byte).
;   bl  -- VMV lower limit (half pels); 4 is added here.
;   esi -- Address of the best starting point reference.
;   mm0 -- "Huge" SWD pair used to reinitialize mm7 when the centre moves.
;
; If the VMV is also in range, esi is advanced by PITCH+1, ebx is set to
; FIRST_HEURISTIC_EXHAUSTIVE and ComputeMBSWD is entered with mm7 unchanged.
; Otherwise the VMV is clamped, esi is recomputed from Addr0MVRef, ebx is set
; to FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR and mm7 is reinitialized from mm0.
;
; The HMV in al is a signed byte, so it must be sign extended before the
; arithmetic shift. Zero extending it turned e.g. -8 into 248, giving a
; horizontal offset of +124 instead of -4 full pels.

NoClampHMV_2:

        sar     ecx,8                   ; Full pel vert lin offset div 256.
        add     bl,4                    ; VMV lower limit + 4.
        mov     ah,bl
        movsx   ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
        cmp     cl,ah
        jl      @f                      ; VMV below range: clamp to lower.

        mov     ah,EMVLimitsForThisMB+7 ; VMV upper limit.
        lea     esi,[esi+ebp+1]         ; Speculate: no clamping needed.
        sub     ah,4                    ; VMV upper limit - 4.
        mov     ebx,FIRST_HEURISTIC_EXHAUSTIVE ; New state number.
        cmp     cl,ah
        jle     ComputeMBSWD            ; VMV in range: go.

@@:

        movsx   ecx,ah                  ; Clamped VMV (signed, half pels).
        movsx   eax,al                  ; Unclamped HMV (signed, half pels).
        sar     eax,1                   ; HMV in full pels.
        lea     ecx,[ecx+ecx*2]
        shl     ecx,6                   ; VMV*3*64 == (VMV/2)*384.
        mov     ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
        mov     esi,Addr0MVRef
        add     eax,ecx                 ; Linearized motion vector.
        add     esi,eax                 ; Address of new centre reference.
        movq    mm7,mm0                 ; Reinit best SWD to huge.
        jmp     ComputeMBSWD
|
|
|
|
|
|
ZeroMVDoneForNonHeuristicME:
|
|
|
|
movq SWDULandLR,mm6
|
|
movq SWDURandLL,mm5
|
|
cmp eax,ZEROVECTORTHRESHOLD ; Compare 0-MV against ZeroVectorThreshold.
|
|
jl BelowZeroThresh ; Jump if 0-MV is good enough.
|
|
|
|
xor ecx,ecx
|
|
sub eax,NONZEROMVDIFFERENTIAL
|
|
mov cl,StateEngineFirstRule[ebx] ; MV adjustment.
|
|
mov bl,StateEngineFirstRule[ebx+10] ; New state number.
|
|
shl ecx,11
|
|
mov SWDForNon0MVToBeat,eax
|
|
movq SWD0MVULandLR,mm6
|
|
movq SWD0MVURandLL,mm5
|
|
lea esi,[esi+ecx-PITCH*8]
|
|
jmp ComputeMBSWD
|
|
|
|
MEForNonZeroMVDone:
|
|
|
|
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
|
|
|
|
; MblkEst_EarlyOut -- common continuation once a candidate has been scored
; or abandoned.
;
; On entry:
;   eax -- 0 if the candidate beat the previous best, else -1. It is used as
;          an index: stores to SWDULandLR[eax*8] and SWDURandLL[eax*8] hit
;          the real slot when eax == 0 and the slot just before it when
;          eax == -1, which discards the value.
;   ebx -- Current state. Negative means heuristic ME, which is handled at
;          HeuristicME_EarlyOut. Otherwise StateEngine[eax+ebx*4+1] and
;          StateEngine[eax+ebx*4+3] give the MV adjustment index and the
;          next state.
MblkEst_EarlyOut:
|
|
|
|
xor ecx,ecx
|
|
test ebx,ebx
|
|
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
|
|
pcmpeqb mm2,mm2 ; Set cand as worse than 0MV.
|
|
mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.
|
|
js HeuristicME_EarlyOut
|
|
|
|
add esi,ecx ; Adjust ref addr for horz motion.
|
|
mov bl,StateEngine[eax+ebx*4+3] ; 0:239 -> New state number;
|
|
; ; 240:255 -> flags which 1/2 pel to do.
|
|
shr ecx,4
|
|
punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
|
|
movq SWDURandLL[eax*8],mm5
|
|
pxor mm6,mm6 ; Speculatively zero to prep for half pel ME.
|
|
add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
|
|
cmp bl,240 ; Terminal state?
|
|
jb ComputeMBSWD
|
|
|
|
mov eax,esi
|
|
mov ecx,Addr0MVRef ; Start to calc linearized MV.
|
|
sub eax,ecx ; Linearized Motion Vector
|
|
;
|
|
mov ecx,eax
|
|
;
|
|
sar eax,8 ; Full pel vert lin offset div 256.
|
|
and cl,07FH ; Full pel HMV
|
|
add cl,cl
|
|
;
|
|
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
|
|
IFDEF H261
|
|
ELSE
|
|
mov eax,DoHalfPelME ; 0 if not, -4 if so.
|
|
test eax,eax
|
|
je SkipHalfPelMBME
|
|
|
|
cmp cl,EMVLimitsForThisMB+1 ; Skip half pel ME if at edge of range
|
|
jle SkipHalfPelMBME
|
|
|
|
cmp cl,EMVLimitsForThisMB+3
|
|
jge SkipHalfPelMBME
|
|
|
|
cmp ch,EMVLimitsForThisMB+5
|
|
jle SkipHalfPelMBME
|
|
|
|
cmp ch,EMVLimitsForThisMB+7
|
|
jge SkipHalfPelMBME
|
|
|
|
|
|
; Registers:
|
|
; ebp -- PITCH
|
|
; esi -- Address of best full pel reference macroblock
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Nothing presently.
|
|
; edi -- Address of target macroblock.
|
|
; ebx -- 240 + Flags to indicate which half pel ME to do:
|
|
; 1 --> right; 2 --> left; 4 --> down; 8 --> up
|
|
; eax -- Count from -4 to -1 for blocks of macroblock.
|
|
; mm0:mm7 -- Scratch
|
|
|
|
movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
|
|
pxor mm7,mm7 ; Prep accumulator for half pel ME.
|
|
|
|
call HalfPelMotionEstimation
|
|
|
|
movdt mm7,InvalidateBadHalfPelMVs[eax*4] ; Need to inflate SWDs for
|
|
; ; MVs that go off frame edge.
|
|
mov eax,esi
|
|
mov ebx,Addr0MVRef ; Start to calc linearized MV.
|
|
sub eax,ebx ; Linearized Motion Vector
|
|
punpcklbw mm7,mm7 ; Expand adjustment to words.
|
|
mov ecx,eax ; Linearized Motion Vector
|
|
paddusw mm7,mm3 ; Now have SWDs for half pel MBME.
|
|
sar eax,8 ; Full pel vert lin offset div 256.
|
|
and cl,07FH ; Full pel HMV
|
|
add cl,cl
|
|
movq mm6,mm7
|
|
mov [edx].BestFullPelMBHMV,cl ; Save HMV
|
|
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
|
|
movdf eax,mm7 ; eax[ 0:15] -- SWD for rightward ref.
|
|
; ; eax[16:31] -- SWD for leftward ref.
|
|
psrlq mm6,32
|
|
mov [edx].BestFullPelMBVMV,ch ; Save VMV
|
|
mov ebx,eax
|
|
shr eax,16 ; eax -- SWD for leftward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for rightward ref.
|
|
cmp eax,ebx
|
|
jg MBME_RightBetterThanLeft
|
|
|
|
MBME_LeftBetterThanRight:
|
|
|
|
cmp eax,BestMBFullPelSWD
|
|
jge MBME_CtrIsBestHMV
|
|
|
|
MBME_LeftBestHMV:
|
|
|
|
movdf ebx,mm6 ; ebx[ 0:15] -- SWD for downward ref.
|
|
; ; ebx[16:31] -- SWD for upward ref.
|
|
mov BestHalfPelHorzSWD,eax
|
|
mov eax,ebx
|
|
shr eax,16 ; eax -- SWD for upward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
|
|
cmp eax,ebx
|
|
jg MBME_LeftBestHMV_DownBetterThanUp
|
|
|
|
MBME_LeftBestHMV_UpBetterThanDown:
|
|
|
|
cmp eax,BestMBFullPelSWD
|
|
jge MBME_LeftIsBest
|
|
|
|
MBME_LeftBestHMV_UpBestVMV:
|
|
|
|
sub esi,PITCH+1 ; Try ref 1/2 pel left and up
|
|
mov BestHalfPelVertSWD,eax
|
|
mov al,4
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
mov eax,BestHalfPelVertSWD
|
|
lea esi,[esi+ebp*1+1] ; Back to center.
|
|
cmp eax,ebx
|
|
jle MBME_UpBetterThanUpLeft
|
|
|
|
MBME_UpLeftBetterThanUp:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge MBME_LeftIsBest
|
|
|
|
MBME_UpLeftIsBest:
|
|
|
|
dec cl ; Back up the horz MV one to the left.
|
|
lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up
|
|
dec ch ; Back up the vert MV one up.
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_UpBetterThanUpLeft:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jg MBME_LeftIsBest
|
|
|
|
MBME_UpIsBest:
|
|
|
|
mov ebx,eax
|
|
dec ch ; Back up the vert MV one up.
|
|
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_LeftBestHMV_DownBetterThanUp:
|
|
|
|
cmp ebx,BestMBFullPelSWD
|
|
jge MBME_LeftIsBest
|
|
|
|
MBME_LeftBestHMV_DownBestVMV:
|
|
|
|
dec esi ; Try ref 1/2 pel left and down
|
|
mov BestHalfPelVertSWD,ebx
|
|
mov al,4
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
mov eax,BestHalfPelVertSWD
|
|
inc esi ; Back to center.
|
|
cmp eax,ebx
|
|
jle MBME_DownBetterThanDownLeft
|
|
|
|
MBME_DownLeftBetterThanDown:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge MBME_LeftIsBest
|
|
|
|
MBME_DownLeftIsBest:
|
|
|
|
dec cl ; Back up the horz MV one to the left.
|
|
lea eax,[esi-1] ; Best is ref 1/2 pel left and down
|
|
inc ch ; Advance the vert MV one down.
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_DownBetterThanDownLeft:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jle MBME_DownIsBest
|
|
|
|
MBME_LeftIsBest:
|
|
|
|
dec cl ; Back up the horz MV one to the left.
|
|
lea eax,[esi-1] ; Best is ref 1/2 pel left.
|
|
mov ebx,BestHalfPelHorzSWD
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_RightBetterThanLeft:
|
|
|
|
cmp ebx,BestMBFullPelSWD
|
|
jge MBME_CtrIsBestHMV
|
|
|
|
MBME_RightBestHMV:
|
|
|
|
movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref.
|
|
; ; eax[16:31] -- SWD for upward ref.
|
|
mov BestHalfPelHorzSWD,ebx
|
|
mov ebx,eax
|
|
shr eax,16 ; eax -- SWD for upward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
|
|
cmp eax,ebx
|
|
jg MBME_RightBestHMV_DownBetterThanUp
|
|
|
|
MBME_RightBestHMV_UpBetterThanDown:
|
|
|
|
cmp eax,BestMBFullPelSWD
|
|
jge MBME_RightIsBest
|
|
|
|
MBME_RightBestHMV_UpBestVMV:
|
|
|
|
sub esi,ebp ; Try ref 1/2 pel right and up
|
|
mov BestHalfPelVertSWD,eax
|
|
mov al,4
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
mov eax,BestHalfPelVertSWD
|
|
lea esi,[esi+ebp*1] ; Back to center.
|
|
cmp eax,ebx
|
|
jle MBME_UpBetterThanUpRight
|
|
|
|
MBME_UpRightBetterThanUp:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge MBME_RightIsBest
|
|
|
|
MBME_UpRightIsBest:
|
|
|
|
inc cl ; Advance the horz MV one to right.
|
|
lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up
|
|
dec ch ; Back up the vert MV one up.
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_UpBetterThanUpRight:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jle MBME_UpIsBest
|
|
|
|
MBME_RightIsBest:
|
|
|
|
mov ebx,BestHalfPelHorzSWD
|
|
inc cl ; Advance the horz MV one to right.
|
|
mov eax,esi
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_RightBestHMV_DownBetterThanUp:
|
|
|
|
cmp ebx,BestMBFullPelSWD
|
|
jge MBME_RightIsBest
|
|
|
|
MBME_RightBestHMV_DownBestVMV:
|
|
|
|
mov BestHalfPelVertSWD,ebx
|
|
mov al,4
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
mov eax,BestHalfPelVertSWD
|
|
cmp eax,ebx
|
|
jle MBME_DownBetterThanDownRight
|
|
|
|
MBME_DownRightBetterThanDown:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge MBME_RightIsBest
|
|
|
|
MBME_DownRightIsBest:
|
|
|
|
inc cl ; Advance the horz MV one to right.
|
|
mov eax,esi
|
|
inc ch ; Advance vert MV one down.
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_DownBetterThanDownRight:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jg MBME_RightIsBest
|
|
|
|
MBME_DownIsBest:
|
|
|
|
mov ebx,eax
|
|
inc ch ; Advance vert MV one down.
|
|
mov eax,esi
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_CtrIsBestHMV:
|
|
|
|
movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref.
|
|
; ; eax[16:31] -- SWD for upward ref.
|
|
mov ebx,eax
|
|
shr eax,16 ; eax -- SWD for upward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
|
|
cmp eax,ebx
|
|
jge MBME_CtrBestHMV_DownBetterThanUp
|
|
|
|
MBME_CtrBestHMV_UpBetterThanDown:
|
|
|
|
mov ebx,BestMBFullPelSWD
|
|
cmp eax,ebx
|
|
jge MBME_CenterIsBest
|
|
|
|
; Up is best.
|
|
|
|
mov ebx,eax
|
|
dec ch ; Back up the vert MV one up.
|
|
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
MBME_CtrBestHMV_DownBetterThanUp:
|
|
|
|
mov eax,ebx
|
|
mov ebx,BestMBFullPelSWD
|
|
cmp eax,ebx
|
|
jge MBME_CenterIsBest
|
|
|
|
; Down is best.
|
|
|
|
mov ebx,eax
|
|
inc ch ; Advance the vert MV one down.
|
|
mov eax,esi
|
|
jmp MBME_HalfPelSearchDone
|
|
|
|
ENDIF
|
|
|
|
SkipHalfPelMBME:
|
|
|
|
mov [edx].BestFullPelMBHMV,cl ; Save HMV
|
|
movdf ebx,mm7 ; SWD for best full pel MB MV.
|
|
mov [edx].BestFullPelMBVMV,ch ; Save VMV
|
|
|
|
MBME_CenterIsBest:
|
|
|
|
mov eax,esi
|
|
|
|
; MBME_HalfPelSearchDone -- record the winner of the macroblock level search.
;
; On entry:
;   eax -- Address of the best reference; stored to BestMBHalfPelRefAddr.
;   ebx -- SWD of that reference; stored to BestMBHalfPelSWD.
;   cl  -- Horizontal MV of the winner; stored to BestMBHalfPelMV.
;   ch  -- Vertical MV of the winner; stored to BestMBHalfPelMV+1.
MBME_HalfPelSearchDone:
|
|
|
|
mov BestMBHalfPelSWD,ebx
|
|
mov BestMBHalfPelMV,cl ; Save HMV
|
|
mov BestMBHalfPelRefAddr,eax
|
|
mov BestMBHalfPelMV+1,ch ; Save VMV
|
|
|
|
IFDEF H261
|
|
ELSE ; H263
|
|
mov bl,EMVLimitsForThisMB+1 ; Lower limit comparison.
|
|
mov al,DoBlockLevelVectors ; Are we doing block level MVs?
|
|
dec al
|
|
jne NoBlockMotionVectors
|
|
|
|
mov cl,[edx].CodedBlocks ; Fetch coded block pattern.
|
|
add bl,2
|
|
and cl,080H
|
|
jne NoBlockMotionVectors ; Skip Block ME if forced intra.
|
|
|
|
mov al,[edx].BestFullPelMBHMV ; Compare full pel HMV against limits.
|
|
mov cl,EMVLimitsForThisMB+3
|
|
cmp al,bl
|
|
jl NoBlockMotionVectors
|
|
|
|
mov bl,EMVLimitsForThisMB+5
|
|
sub cl,2
|
|
cmp al,cl ; Upper limit comparison.
|
|
jg NoBlockMotionVectors
|
|
|
|
mov al,[edx].BestFullPelMBVMV ; Compare full pel VMV against limits.
|
|
add bl,2
|
|
mov cl,EMVLimitsForThisMB+7
|
|
cmp al,bl
|
|
mov ebx,PD [edx].BestFullPelMBVMV-3
|
|
jl NoBlockMotionVectors
|
|
|
|
sar ebx,18
|
|
sub cl,2
|
|
cmp al,cl ; Upper limit comparison.
|
|
jg NoBlockMotionVectors
|
|
|
|
mov ecx,BestMBHalfPelSWD ; Jump if SWD for MB MV < thresh.
|
|
IF PITCH-384
|
|
*** error: The magic here assumes a pitch of 384.
|
|
ENDIF
|
|
and ebx,0FFFFFF80H ; VMV*128
|
|
cmp ecx,BLOCKMOTIONTHRESHOLD
|
|
jle NoBlockMotionVectors
|
|
|
|
;==========================================================================
|
|
; Starting from the best full pel macroblock motion vector calculated above, we
|
|
; search for the best block motion vectors.
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Address of ref block.
|
|
; edi -- Address of target block.
|
|
; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
|
|
; ecx -- Scratch
|
|
; ebx -- CurrSWDState
|
|
; eax -- Scratch
|
|
; mm7 -- Best SWD for current block
|
|
; mm6 -- unused.
|
|
; mm5 -- Best SWD for right block of pair worked on by inner loop.
|
|
; mm0-mm4 Scratch
|
|
;
|
|
|
|
movq mm0,HalfPelMBMESWDAccum+8
|
|
movq mm1,HalfPelMBMESWDAccum+16
|
|
psubusw mm7,mm0
|
|
movq mm2,HalfPelMBMESWDAccum+0
|
|
psubusw mm0,mm1
|
|
movq [edx].BlkY4.BlkLvlSWD+16,mm7
|
|
psubusw mm1,mm2
|
|
movq [edx].BlkY2.BlkLvlSWD+16,mm0
|
|
movq [edx].BlkY3.BlkLvlSWD+16,mm1
|
|
movq [edx].BlkY1.BlkLvlSWD+16,mm2
|
|
|
|
movsx eax,[edx].BestFullPelMBHMV
|
|
sar eax,1
|
|
lea ebx,[ebx+ebx*2]
|
|
mov esi,Addr0MVRef
|
|
add ebx,ebp
|
|
mov Addr0MVRefBlk,esi
|
|
add esi,eax
|
|
lea ecx,[ecx+ecx*2] ; Best MBMV SWD times 3.
|
|
add esi,ebx ; Try V+1 first
|
|
shr ecx,2 ; Best MBMV SWD * 3/4.
|
|
mov eax,SWDForNon0MVToBeat
|
|
mov BestBlockRefAddrVP1,esi ; Stash BestBlockRefAddr
|
|
sub ecx,BLOCKMVDIFFERENTIAL ; Best MBMV SWD * 3/4 - Differential.
|
|
lea eax,[eax+eax*2-BLOCKMVDIFFERENTIAL*4] ; Non0MBMVSWDToBeat*3-4*Diff.
|
|
mov LimitForSWDForBlkMV,ecx
|
|
shr eax,2 ; Non0MBMVSWDToBeat * 3/4.
|
|
mov ebx,FIRSTBLOCKMESTATE
|
|
cmp eax,ecx
|
|
jg @f
|
|
|
|
mov LimitForSWDForBlkMV,eax
|
|
mov ecx,eax
|
|
|
|
@@:
|
|
|
|
movdt mm5,SWDURandLL ; Get SWD for best MB level full pel MVs, blk 2.
|
|
test ecx,ecx
|
|
jle NoBlockMotionVectors
|
|
movdt mm7,SWDULandLR ; Get SWD for best MB level full pel MVs, blk 1.
|
|
movdf SWDForBlock2Or4,mm5
|
|
|
|
;============================================================================
|
|
; Compute SWD for block.
|
|
|
|
; Block-level full pel search step. Computes the SWD between the candidate
; reference block at esi and the target block at edi, keeps the smaller of the
; candidate and the best-so-far SWD in mm7, then indexes the StateEngine table
; with the (better / not better) result to get the next candidate and state.
; Loops back to ComputeBlkSWD until the new state number is 240 or above.
DoBlkMEForNextBlk:
|
|
ComputeBlkSWD:
|
|
|
|
movq mm0,[esi+ebp*1] ; Ref block, Line 1.
|
|
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
|
|
movq mm1,[esi+PITCH*3] ; Ref block, Line 3.
|
|
psllw mm0,8 ; Extract diffs for line 1 even pels.
|
|
psubw mm1,[edi+PITCH*3] ; Diff for line 3.
|
|
pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
|
|
movq mm2,[esi+PITCH*5] ; Ref block, Line 5.
|
|
psllw mm1,8 ; Extract diffs for line 3 even pels.
|
|
psubw mm2,[edi+PITCH*5] ; Diff for line 5.
|
|
pmaddwd mm1,mm1 ; Square of diffs for even pels of line 3.
|
|
movq mm3,[esi+PITCH*7] ; Ref block, Line 7.
|
|
psllw mm2,8 ; Extract diffs for line 5 even pels.
|
|
psubw mm3,[edi+PITCH*7] ; Diff for line 7.
|
|
pmaddwd mm2,mm2 ; Square of diffs for even pels of line 5.
|
|
movq mm4,[esi] ; Ref block, Line 0.
|
|
psllw mm3,8 ; Extract diffs for line 7 even pels.
|
|
psubw mm4,[edi] ; Diff for line 0.
|
|
paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3).
|
|
movq mm1,[esi+ebp*2] ; Ref block, Line 2.
|
|
pmaddwd mm3,mm3 ; Square of diffs for even pels of line 7.
|
|
psubw mm1,[edi+ebp*2] ; Diff for line 2.
|
|
paddusw mm0,mm2 ; Accumulate SWD (line 5).
|
|
movq mm2,[esi+ebp*4] ; Ref block, Line 4.
|
|
pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
|
|
psubw mm2,[edi+ebp*4] ; Diff for line 4.
|
|
paddusw mm0,mm3 ; Accumulate SWD (line 7).
|
|
movq mm3,[esi+PITCH*6] ; Ref block, Line 6.
|
|
pmaddwd mm1,mm1 ; Square of diffs for line 2 (no even pel extraction).
|
|
psubw mm3,[edi+PITCH*6] ; Diff for line 6.
|
|
pmaddwd mm2,mm2 ; Square of diffs for line 4 (no even pel extraction).
|
|
paddusw mm0,mm4 ; Accumulate SWD (line 0).
|
|
pmaddwd mm3,mm3 ; Square of diffs for line 6 (no even pel extraction).
|
|
paddusw mm0,mm1 ; Accumulate SWD (line 2).
|
|
;
|
|
paddusw mm0,mm2 ; Accumulate SWD (line 4).
|
|
;
|
|
paddusw mm0,mm3 ; Accumulate SWD (line 6).
|
|
;
|
|
punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
|
|
movq mm4,mm7 ; Get original Best SWD for block
|
|
|
|
paddusw mm1,mm0 ; mm1[48:63] is SWD for block.
|
|
pxor mm2,mm2 ; Zero, for the compare against mm4 below.
|
|
psrlq mm1,48 ; mm1 is SWD for block.
|
|
;
|
|
psubusw mm4,mm1 ; BestSWD dim CandSWD (saturating; 0 if cand not better).
|
|
xor ecx,ecx ; Clear ecx so cl can be loaded as an index below.
|
|
pcmpeqd mm2,mm4 ; mm2[0:31] == 0 iff cand better, else -1.
|
|
psubusw mm7,mm4 ; BestSWD dim (BestSWD dim CandSWD) --> new best (the min, low word).
|
|
;
|
|
;
|
|
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
|
|
;
|
|
|
|
; Registers at this point:
|
|
; ebp -- PITCH
|
|
; esi -- Address of block of candidate ref area.
|
|
; edi -- Address of target block.
|
|
; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
|
|
; ecx -- Scratch
|
|
; ebx -- CurrSWDState.
|
|
; eax -- 0 iff candidate SWD better, else -1.
|
|
; mm7 -- New best SWD for current block
|
|
; mm6 -- Unused.
|
|
|
|
movq [edx].BlkY1.BlkLvlSWD,mm7 ; Save best blk level SWD.
|
|
pxor mm6,mm6 ; Spec zero to prep for half pel ME.
|
|
mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.
|
|
mov bl,StateEngine[eax+ebx*4+3] ; New state number; 255 means done.
|
|
add esi,ecx ; Adjust ref addr for horz motion.
|
|
mov eax,DoHalfPelME ; 0 if not, -4 if so.
|
|
shr ecx,4 ; Index into FullPelMotionVectorAdjustment.
|
|
cmp bl,240 ; Terminal state?
|
|
jae @f ; State numbers 240 and above end the search.
|
|
|
|
add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
|
|
jmp ComputeBlkSWD
|
|
|
|
@@:
|
|
add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
|
|
add eax,4
|
|
mov ecx,esi
|
|
jne SkipHalfPelBlkME
|
|
|
|
; Registers:
|
|
; ebp -- PITCH
|
|
; esi -- Address of best full pel reference macroblock
|
|
; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
|
|
; ecx -- Copy of esi.
|
|
; edi -- Address of target block.
|
|
; ebx -- Scratch
|
|
; eax -- Set to 0 to cause HalfPelMotionEstimation to quit after one block.
|
|
; mm0:mm7 -- Scratch
|
|
|
|
mov ebx,BestBlockRefAddrVP1
|
|
add ecx,ebp
|
|
cmp ebx,ecx
|
|
jne FullPelBlkMEMovedFromCenter
|
|
|
|
movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
|
|
movq mm3,[edx].BlkY1.BlkLvlSWD+16 ; SWDs: H+1, H-1, V+1, V-1.
|
|
jmp FullPelBlkMEDidNotMoveFromCenter
|
|
|
|
FullPelBlkMEMovedFromCenter:
|
|
|
|
movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
|
|
pxor mm7,mm7 ; Prep accumulator for half pel ME.
|
|
|
|
call HalfPelMotionEstimation
|
|
|
|
lea esi,[esi+ebp*8+8] ; Fix reference pointer.
|
|
lea edi,[edi+ebp*8+8] ; Fix target pointer.
|
|
|
|
; Derive the full pel (HMV in cl, VMV in ch) from the best reference address,
; then compare the leftward and rightward half pel SWDs held in the low half
; of mm3. The high half of mm3 is shifted down for the later vertical test.
FullPelBlkMEDidNotMoveFromCenter:
|
|
|
|
mov eax,esi
|
|
mov ebx,Addr0MVRefBlk ; Start to calc linearized MV.
|
|
sub ecx,ebx ; Linearized Motion Vector
|
|
sub eax,ebx ; Linearized Motion Vector
|
|
sar eax,8 ; Full pel vert lin offset div 256.
|
|
and cl,07FH ; Full pel HMV
|
|
movdf ebx,mm3 ; ebx[ 0:15] -- SWD for rightward ref.
|
|
; ; ebx[16:31] -- SWD for leftward ref.
|
|
psrlq mm3,32 ; Bring the vertical SWD pair down to mm3[0:31].
|
|
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
|
|
mov eax,ebx
|
|
shr eax,16 ; eax -- SWD for leftward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for rightward ref.
|
|
cmp eax,ebx
|
|
jg BlkME_RightBetterThanLeft ; Larger leftward SWD means right is better.
|
|
|
|
BlkME_LeftBetterThanRight:
|
|
|
|
add cl,cl
|
|
mov ebx,BestBlkFullPelSWD
|
|
cmp eax,ebx
|
|
jge BlkME_CtrIsBestHMV
|
|
|
|
BlkME_LeftBestHMV:
|
|
|
|
movdf ebx,mm3 ; ebx[ 0:15] -- SWD for downward ref.
|
|
; ; ebx[16:31] -- SWD for upward ref.
|
|
mov BestHalfPelHorzSWD,eax
|
|
mov eax,ebx
|
|
shr eax,16 ; eax -- SWD for upward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
|
|
cmp eax,ebx
|
|
jg BlkME_LeftBestHMV_DownBetterThanUp
|
|
|
|
BlkME_LeftBestHMV_UpBetterThanDown:
|
|
|
|
cmp eax,BestBlkFullPelSWD
|
|
jge BlkME_LeftIsBest
|
|
|
|
BlkME_LeftBestHMV_UpBestVMV:
|
|
|
|
sub esi,PITCH+1 ; Try ref 1/2 pel left and up
|
|
mov BestHalfPelVertSWD,eax
|
|
mov al,1
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
lea edi,[edi+ebp*8+8]
|
|
mov eax,BestHalfPelVertSWD
|
|
lea esi,[esi+PITCH*9+9] ; Back to center.
|
|
cmp eax,ebx
|
|
jle BlkME_UpBetterThanUpLeft
|
|
|
|
BlkME_UpLeftBetterThanUp:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge BlkME_LeftIsBest
|
|
|
|
BlkME_UpLeftIsBest:
|
|
|
|
dec cl ; Back up the horz MV one to the left.
|
|
lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up
|
|
dec ch ; Back up the vert MV one up.
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_UpBetterThanUpLeft:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jg BlkME_LeftIsBest
|
|
|
|
BlkME_UpIsBest:
|
|
|
|
dec ch ; Back up the vert MV one up.
|
|
mov ebx,eax
|
|
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_LeftBestHMV_DownBetterThanUp:
|
|
|
|
cmp ebx,BestBlkFullPelSWD
|
|
jge BlkME_LeftIsBest
|
|
|
|
BlkME_LeftBestHMV_DownBestVMV:
|
|
|
|
dec esi ; Try ref 1/2 pel left and down
|
|
mov BestHalfPelVertSWD,ebx
|
|
mov al,1
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
lea edi,[edi+ebp*8+8]
|
|
mov eax,BestHalfPelVertSWD
|
|
lea esi,[esi+ebp*8+9] ; Back to center.
|
|
cmp eax,ebx
|
|
jle BlkME_DownBetterThanDownLeft
|
|
|
|
BlkME_DownLeftBetterThanDown:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge BlkME_LeftIsBest
|
|
|
|
BlkME_DownLeftIsBest:
|
|
|
|
dec cl ; Back up the horz MV one to the left.
|
|
lea eax,[esi-1] ; Best is ref 1/2 pel left and down
|
|
inc ch ; Advance the vert MV one down.
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_DownBetterThanDownLeft:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jle BlkME_DownIsBest
|
|
|
|
BlkME_LeftIsBest:
|
|
|
|
dec cl ; Back up the horz MV one to the left.
|
|
lea eax,[esi-1] ; Best is ref 1/2 pel left.
|
|
mov ebx,BestHalfPelHorzSWD
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_RightBetterThanLeft:
|
|
|
|
add cl,cl
|
|
mov eax,BestBlkFullPelSWD
|
|
cmp eax,ebx
|
|
jle BlkME_CtrIsBestHMV
|
|
|
|
BlkME_RightBestHMV:
|
|
|
|
movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref.
|
|
; ; eax[16:31] -- SWD for upward ref.
|
|
mov BestHalfPelHorzSWD,ebx
|
|
mov ebx,eax
|
|
shr eax,16 ; eax -- SWD for upward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
|
|
cmp eax,ebx
|
|
jg BlkME_RightBestHMV_DownBetterThanUp
|
|
|
|
BlkME_RightBestHMV_UpBetterThanDown:
|
|
|
|
cmp eax,BestBlkFullPelSWD
|
|
jge BlkME_RightIsBest
|
|
|
|
BlkME_RightBestHMV_UpBestVMV:
|
|
|
|
sub esi,ebp ; Try ref 1/2 pel right and up
|
|
mov BestHalfPelVertSWD,eax
|
|
mov al,1
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
lea edi,[edi+ebp*8+8]
|
|
mov eax,BestHalfPelVertSWD
|
|
lea esi,[esi+PITCH*9+8] ; Back to center.
|
|
cmp eax,ebx
|
|
jle BlkME_UpBetterThanUpRight
|
|
|
|
BlkME_UpRightBetterThanUp:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge BlkME_RightIsBest
|
|
|
|
BlkME_UpRightIsBest:
|
|
|
|
inc cl ; Advance the horz MV one to right.
|
|
lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up
|
|
dec ch ; Back up the vert MV one up.
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_UpBetterThanUpRight:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jle BlkME_UpIsBest
|
|
|
|
BlkME_RightIsBest:
|
|
|
|
mov ebx,BestHalfPelHorzSWD
|
|
inc cl ; Advance the horz MV one to right.
|
|
mov eax,esi
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_RightBestHMV_DownBetterThanUp:
|
|
|
|
cmp ebx,BestBlkFullPelSWD
|
|
jge BlkME_RightIsBest
|
|
|
|
BlkME_RightBestHMV_DownBestVMV:
|
|
|
|
mov BestHalfPelVertSWD,ebx
|
|
mov al,1
|
|
|
|
call HalfPelMotionEstimationBothWays
|
|
|
|
lea edi,[edi+ebp*8+8]
|
|
mov eax,BestHalfPelVertSWD
|
|
lea esi,[esi+ebp*8+8] ; Back to center.
|
|
cmp eax,ebx
|
|
jle BlkME_DownBetterThanDownRight
|
|
|
|
BlkME_DownRightBetterThanDown:
|
|
|
|
cmp ebx,BestHalfPelHorzSWD
|
|
jge BlkME_RightIsBest
|
|
|
|
BlkME_DownRightIsBest:
|
|
|
|
inc cl ; Advance the horz MV one to right.
|
|
mov eax,esi
|
|
inc ch ; Advance vert MV one down.
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_DownBetterThanDownRight:
|
|
|
|
cmp eax,BestHalfPelHorzSWD
|
|
jg BlkME_RightIsBest
|
|
|
|
BlkME_DownIsBest:
|
|
|
|
inc ch ; Advance vert MV one down.
|
|
mov ebx,eax
|
|
mov eax,esi
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
BlkME_CtrIsBestHMV:
|
|
|
|
movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref.
|
|
; ; eax[16:31] -- SWD for upward ref.
|
|
mov ebx,eax
|
|
shr eax,16 ; eax -- SWD for upward ref.
|
|
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
|
|
cmp eax,ebx
|
|
jge BlkME_CtrBestHMV_DownBetterThanUp
|
|
|
|
BlkME_CtrBestHMV_UpBetterThanDown:
|
|
|
|
mov ebx,BestBlkFullPelSWD
|
|
cmp eax,ebx
|
|
jge BlkME_CenterIsBest
|
|
|
|
; Up is best.
|
|
|
|
mov ebx,eax
|
|
dec ch ; Back up the vert MV one up.
|
|
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
; Center is the best horizontal position and the downward SWD is no worse
; than the upward one. Take the downward half pel ref only if its SWD is
; strictly below the best full pel SWD; otherwise the center stays best.
BlkME_CtrBestHMV_DownBetterThanUp:
|
|
|
|
mov eax,ebx ; eax -- SWD for downward ref.
|
|
mov ebx,BestBlkFullPelSWD
|
|
cmp eax,ebx
|
|
jge BlkME_CenterIsBest
|
|
|
|
; Down is best.
|
|
|
|
mov ebx,eax ; ebx -- SWD stored at BlkME_HalfPelSearchDone.
|
|
inc ch ; Advance the vert MV one down.
|
|
mov eax,esi ; eax -- ref address stored at BlkME_HalfPelSearchDone.
|
|
jmp BlkME_HalfPelSearchDone
|
|
|
|
SkipHalfPelBlkME:
|
|
|
|
mov eax,esi
|
|
mov ebx,Addr0MVRefBlk ; Start to calc linearized MV.
|
|
sub ecx,ebx ; Linearized Motion Vector
|
|
sub eax,ebx ; Linearized Motion Vector
|
|
sar eax,8 ; Full pel vert lin offset div 256.
|
|
and cl,07FH ; Full pel HMV
|
|
add cl,cl
|
|
;
|
|
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
|
|
;
|
|
movdf ebx,mm7 ; SWD for best full pel block MV.
|
|
|
|
BlkME_CenterIsBest:
|
|
|
|
mov eax,esi
|
|
|
|
BlkME_HalfPelSearchDone:
|
|
|
|
mov [edx].BlkY1.BlkLvlSWD,ebx
|
|
mov [edx].BlkY1.PastRef,eax
|
|
mov [edx].BlkY1.PHMV,cl ; Save HMV
|
|
mov eax,LimitForSWDForBlkMV ; Does block's SWD put us over limit?
|
|
mov [edx].BlkY1.PVMV,ch ; Save VMV
|
|
sub eax,ebx
|
|
jl BlkEst_EarlyOut
|
|
|
|
mov LimitForSWDForBlkMV,eax ; Remember how much is left for other blks.
|
|
mov esi,BestBlockRefAddrVP1
|
|
add edi,8 ; Move to blk 2 or 4, V+4.
|
|
mov ecx,Addr0MVRefBlk ; Calc addr of 0MV ref for this blk.
|
|
add esi,8 ; Move to blk 2 or 4, V+4.
|
|
add ecx,8
|
|
mov Addr0MVRefBlk,ecx
|
|
add edx,SIZEOF T_Blk ; Increment to next block.
|
|
test dl,SIZEOF T_Blk
|
|
movdt mm7,SWDForBlock2Or4
|
|
mov ebx,FIRSTBLOCKMESTATE
|
|
jne DoBlkMEForNextBlk ; If so, go do blk 2 or 4.
|
|
|
|
lea esi,[esi+ebp*8-8] ; Move to blk 3
|
|
lea ecx,[ecx+ebp*8-16]
|
|
mov BestBlockRefAddrVP1,esi
|
|
lea edi,[edi+ebp*8-16]
|
|
movdt mm5,SWDULandLR+4 ; Get SWD for best MB level MVs, blk 4.
|
|
movdt mm7,SWDURandLL+4 ; Get SWD for best MB level MVs, blk 3.
|
|
movdf SWDForBlock2Or4,mm5
|
|
test dl,2*SIZEOF T_Blk ; Just finishing blk 2?
|
|
mov Addr0MVRefBlk,ecx
|
|
jne DoBlkMEForNextBlk ; If so, go do blk 3.
|
|
|
|
;==============================================================================
|
|
; Block motion vectors are best.
|
|
|
|
mov esi,[edx-4*SIZEOF T_Blk].BlkY1.BlkLvlSWD
|
|
mov edi,[edx-4*SIZEOF T_Blk].BlkY4.BlkLvlSWD
|
|
mov SWDULandLR,esi
|
|
mov SWDULandLR+4,edi
|
|
mov esi,[edx-4*SIZEOF T_Blk].BlkY3.BlkLvlSWD
|
|
mov edi,[edx-4*SIZEOF T_Blk].BlkY2.BlkLvlSWD
|
|
mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs
|
|
mov ebx,[edx-4*SIZEOF T_Blk].BlkY2.MVs
|
|
mov ecx,eax
|
|
xor eax,ebx
|
|
xor ecx,[edx-4*SIZEOF T_Blk].BlkY3.MVs
|
|
xor ebx,[edx-4*SIZEOF T_Blk].BlkY4.MVs
|
|
mov SWDURandLL,edi
|
|
or eax,ebx
|
|
sub edx,4*SIZEOF T_Blk ; Restore MacroBlockActionStream ptr.
|
|
or eax,ecx
|
|
test eax,0FFFFH
|
|
mov SWDURandLL+4,esi
|
|
je MotionVectorSettled
|
|
|
|
mov al,INTER4MV ; Set type for MB to INTER-coded, 4 MVs.
|
|
mov [edx].BlockType,al
|
|
jmp MotionVectorSettled
|
|
|
|
BlkEst_EarlyOut:
|
|
|
|
and edx,-1-3*SIZEOF T_Blk
|
|
mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV.
|
|
|
|
BlockMVNotBigEnoughGain: ; Try MB-level motion vector.
|
|
|
|
cmp ecx,SWDForNon0MVToBeat
|
|
jge NonZeroMVNotBigEnoughGain
|
|
|
|
ENDIF ; H263
|
|
|
|
mov ebx,BestMBHalfPelMV
|
|
mov esi,BestMBHalfPelRefAddr ; Reload BestMBHalfPelRefAddr
|
|
|
|
; On entry: ebx -- macroblock level motion vector, esi -- its reference
; address, ebp -- PITCH, edx -- MBlockActionStream.
NonZeroMBLevelMVBest:
|
|
|
|
; Non-zero macroblock level motion vector is best.
; Give all four luma blocks the same MV, and reference addresses laid out as
; a 2x2 arrangement of 8x8 blocks starting at esi.
|
|
|
|
mov [edx].BlkY1.MVs,ebx
|
|
mov [edx].BlkY2.MVs,ebx
|
|
mov [edx].BlkY3.MVs,ebx
|
|
mov [edx].BlkY4.MVs,ebx
|
|
mov [edx].BlkY1.PastRef,esi ; Block 1: ref
|
|
lea ecx,[esi+ebp*8]
|
|
mov [edx].BlkY3.PastRef,ecx ; Block 3: ref + 8 lines
|
|
add esi,8
|
|
mov [edx].BlkY2.PastRef,esi ; Block 2: ref + 8 pels
|
|
add ecx,8
|
|
mov [edx].BlkY4.PastRef,ecx ; Block 4: ref + 8 lines + 8 pels
|
|
jmp MotionVectorSettled
|
|
|
|
; Block motion vectors are not being used. Choose between the non-zero
; macroblock level MV and the zero MV by comparing BestMBHalfPelSWD against
; SWDForNon0MVToBeat.
; NOTE(review): when the two are equal this path takes the non-zero MV
; (jge below), whereas the test at BlockMVNotBigEnoughGain takes the zero MV
; on equality. Confirm whether the differing tie-break is intended.
NoBlockMotionVectors:
|
|
|
|
mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV.
|
|
mov eax,SWDForNon0MVToBeat
|
|
cmp eax,ecx
|
|
mov ebx,BestMBHalfPelMV ; Preload MV for NonZeroMBLevelMVBest (flags kept).
|
|
mov esi,BestMBHalfPelRefAddr ; Preload ref addr for NonZeroMBLevelMVBest.
|
|
jge NonZeroMBLevelMVBest ; SWD to beat >= MB MV SWD: use non-zero MV.
|
|
|
|
NonZeroMVNotBigEnoughGain:
|
|
|
|
mov esi,Addr0MVRef ; 0-MV ref block.
|
|
movq mm6,SWD0MVULandLR
|
|
movq mm5,SWD0MVURandLL
|
|
movq SWDULandLR,mm6
|
|
movq SWDURandLL,mm5
|
|
|
|
; Zero motion vector is used. On entry esi is the reference address to record
; for block 1; ebp -- PITCH, edx -- MBlockActionStream.
BelowZeroThresh:
|
|
|
|
mov [edx].BlkY1.PastRef,esi ; Save address of ref block, all blks.
|
|
lea eax,[esi+8]
|
|
mov [edx].BlkY2.PastRef,eax ; Block 2: ref + 8 pels
|
|
lea eax,[esi+ebp*8]
|
|
mov [edx].BlkY3.PastRef,eax ; Block 3: ref + 8 lines
|
|
add eax,8
|
|
mov [edx].BlkY4.PastRef,eax ; Block 4: ref + 8 lines + 8 pels
|
|
xor eax,eax
|
|
mov [edx].BlkY1.MVs,eax ; Set horz and vert MVs to 0 in all blks.
|
|
mov [edx].BlkY2.MVs,eax
|
|
mov [edx].BlkY3.MVs,eax
|
|
mov [edx].BestFullPelMBHMV,al ; Zero the full pel MB level HMV too.
|
|
mov [edx].BlkY4.MVs,eax
|
|
mov [edx].BestFullPelMBVMV,al ; Zero the full pel MB level VMV too.
|
|
mov BestMBHalfPelMV,eax ; And the half pel MB level MV.
|
|
|
|
|
|
MotionVectorSettled:
|
|
|
|
IFDEF H261
|
|
|
|
;===============================================================================
|
|
; For H261, we've settled on the best motion vector. Now we need to determine
|
|
; if spatial filtering should be done.
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Address of block of ref area.
|
|
; edi -- Address of spatially filtered block.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Loop counter.
|
|
; ebx -- Address of constant 0x7F in all 8 bytes.
|
|
; eax -- Scratch
|
|
; mm7 -- Mask to extract bytes 0 and 7. (High bit of bytes 1:6 must be off).
|
|
; mm6 -- All bytes -1.
|
|
; mm5 -- Mask to extract bytes 1:6 and clear bit 8 thereof.
|
|
|
|
movdf esi,mm7 ; Restore non-SLF SWD for macroblock.
|
|
cmp esi,SpatialFiltThreshold
|
|
jle SkipSpatialFiltering
|
|
|
|
mov ecx,DoSpatialFiltering ; Are we doing spatial filtering?
|
|
mov esi,[edx].BlkY1.PastRef
|
|
test cl,cl
|
|
je SkipSpatialFiltering
|
|
|
|
DoSpatialFilterForChroma:
|
|
DoSpatialFilterForLuma:
|
|
|
|
movq mm5,C7F7F7F7F7F7F7F7F ; Mask to extract bytes 1:6.
|
|
movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
|
|
psllq mm5,16
|
|
psrlq mm5,8
|
|
pcmpeqb mm7,mm7
|
|
pxor mm7,mm5 ; Mask to extract bytes 0 and 7.
|
|
mov edi,SpatiallyFilteredMB
|
|
lea eax,[esi+ebp*4]
|
|
lea ebx,C7F7F7F7F7F7F7F7F ; Address of this useful constant.
|
|
|
|
SpatialFilterLoop:
|
|
|
|
movq mm0,[esi] ; 0a: <P7 P6 P5 P4 P3 P2 P1 P0>
|
|
pcmpeqb mm6,mm6 ; To add one to all bytes.
|
|
movq mm4,mm0 ; 0b: <P7 P6 P5 P4 P3 P2 P1 P0>
|
|
psllq mm0,16 ; 0c: <P5 P4 P3 P2 P1 P0 0 0>
|
|
movq mm3,[esi+ebp*1]; 1a
|
|
paddb mm0,mm4 ; 0d: <P7+P5 P6+P4 ... P3+P1 P2+P0 jnk jnk >
|
|
movq mm1,mm3 ; 1b
|
|
psrlq mm0,9 ; 0e: <0 (P7+P5)/2 ... (P2+P0)/2 jnk> (dirty)
|
|
|
|
SpatialFilterLoop_BlockToRight:
|
|
|
|
pand mm0,mm5 ; 0f: <0 (P7+P5)/2 ... (P2+P0)/2 0> (clean)
|
|
psllq mm1,16 ; 1c
|
|
paddb mm0,mm4 ; 0g: <jnk (P7+2P6+P5)/2 ... (P2+2P1+P0)/2 jnk>
|
|
paddb mm1,mm3 ; 1d
|
|
psubb mm0,mm6 ; 0h: <jnk (P7+2P6+P5+2)/2 ... (P2+2P1+P0+2)/2 jnk>
|
|
psrlq mm1,9 ; 1e
|
|
psrlq mm0,1 ; 0i: <jnk (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2 jnk>
|
|
pand mm4,mm7 ; 0j: <P7 0 0 0 0 0 0 P0>
|
|
pand mm0,mm5 ; 0k: < 0 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2 0>
|
|
pand mm1,mm5 ; 1f
|
|
por mm0,mm4 ; 0l: <P7 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/4 P0>
|
|
paddb mm1,mm3 ; 1g
|
|
movq mm2,[esi+ebp*2]; 2a
|
|
psubb mm1,mm6 ; 1h
|
|
movq [edi],mm0 ; 0m: Store line 0 of filtered block. This is R0.
|
|
movq mm4,mm2 ; 2b
|
|
psrlq mm1,1 ; 1i
|
|
pand mm3,mm7 ; 1j
|
|
pand mm1,mm5 ; 1k
|
|
psllq mm2,16 ; 2c
|
|
por mm1,mm3 ; 1l: This is R1
|
|
paddb mm2,mm4 ; 2d
|
|
psubb mm1,mm6 ; 1A: R1+1
|
|
psrlq mm2,9 ; 2e
|
|
pand mm2,mm5 ; 2f
|
|
paddb mm0,mm1 ; 1B: R0+R1+1
|
|
paddb mm2,mm4 ; 2g
|
|
psrlq mm0,1 ; 1C: (R0+R1+1)/2 (dirty)
|
|
pand mm0,[ebx] ; 1D: (R0+R1+1)/2 (clean)
|
|
psubb mm2,mm6 ; 2h
|
|
psrlq mm2,1 ; 2i
|
|
pand mm4,mm7 ; 2j
|
|
movq mm3,[esi+PITCH*3] ; 3a
|
|
pand mm2,mm5 ; 2k
|
|
por mm2,mm4 ; 2l: This is R2.
|
|
movq mm4,mm3 ; 3b
|
|
paddb mm1,mm2 ; 1E & 2B: R1+R2+1
|
|
psllq mm3,16 ; 3c
|
|
psrlq mm1,1 ; 1F & 2C: (R1+R2+1)/2 (dirty)
|
|
paddb mm3,mm4 ; 3d
|
|
pand mm1,[ebx] ; 1G & 2D: (R1+R2+1)/2 (clean)
|
|
psrlq mm3,9 ; 3e
|
|
paddb mm0,mm1 ; 1H: (R0+2R1+R2+2)/2
|
|
pand mm3,mm5 ; 3f
|
|
psrlq mm0,1 ; 1I: (R0+2R1+R2+2)/4 (dirty)
|
|
paddb mm3,mm4 ; 3g
|
|
pand mm0,[ebx] ; 1J: (R0+2R1+R2+2)/4 (clean)
|
|
psubb mm3,mm6 ; 3h
|
|
psrlq mm3,1 ; 3i
|
|
pand mm4,mm7 ; 3j
|
|
movq [edi+ebp*1],mm0 ; 1K: Store line 1 of filtered block.
|
|
pand mm3,mm5 ; 3k
|
|
movq mm0,[eax] ; 4a
|
|
por mm3,mm4 ; 3l
|
|
psubb mm3,mm6 ; 3A: R3+1
|
|
movq mm4,mm0 ; 4b
|
|
paddb mm2,mm3 ; 2E & 3B: R2+R3+1
|
|
psllq mm0,16 ; 4c
|
|
psrlq mm2,1 ; 2F & 3C: (R2+R3+1)/2 (dirty)
|
|
paddb mm0,mm4 ; 4d
|
|
pand mm2,[ebx] ; 2G & 3D: (R2+R3+1)/2 (clean)
|
|
psrlq mm0,9 ; 4e
|
|
paddb mm1,mm2 ; 2H: (R1+2R2+R3+2)/2
|
|
pand mm0,mm5 ; 4f
|
|
psrlq mm1,1 ; 2I: (R1+2R2+R3+2)/4 (dirty)
|
|
paddb mm0,mm4 ; 4g
|
|
pand mm1,[ebx] ; 2J: (R1+2R2+R3+2)/4 (clean)
|
|
psubb mm0,mm6 ; 4h
|
|
psrlq mm0,1 ; 4i
|
|
pand mm4,mm7 ; 4j
|
|
movq [edi+ebp*2],mm1 ; 2K: Store line 2 of filtered block.
|
|
pand mm0,mm5 ; 4k
|
|
movq mm1,[eax+ebp*1] ; 5a
|
|
por mm0,mm4 ; 4l
|
|
movq mm4,mm1 ; 5b
|
|
psllq mm1,16 ; 5c
|
|
paddb mm3,mm0 ; 3E & 4B: R3+R4+1
|
|
paddb mm1,mm4 ; 5d
|
|
add esi,8
|
|
psrlq mm3,1 ; 3F & 4C: (R3+R4+1)/2 (dirty)
|
|
pand mm3,[ebx] ; 3G & 4D: (R3+R4+1)/2 (clean)
|
|
psrlq mm1,9 ; 5e
|
|
paddb mm2,mm3 ; 3H: (R2+2R3+R4+2)/2
|
|
pand mm1,mm5 ; 5f
|
|
psrlq mm2,1 ; 3I: (R2+2R3+R4+2)/4 (dirty)
|
|
paddb mm1,mm4 ; 5g
|
|
pand mm2,[ebx] ; 3J: (R2+2R3+R4+2)/4 (clean)
|
|
psubb mm1,mm6 ; 5h
|
|
psrlq mm1,1 ; 5i
|
|
pand mm4,mm7 ; 5j
|
|
movq [edi+PITCH*3],mm2 ; 3K: Store line 3 of filtered block.
|
|
pand mm1,mm5 ; 5k
|
|
movq mm2,[eax+ebp*2] ; 6a
|
|
por mm1,mm4 ; 5l
|
|
psubb mm1,mm6 ; 5A: R5+1
|
|
movq mm4,mm2 ; 6b
|
|
paddb mm0,mm1 ; 4E & 5B: R4+R5+1
|
|
psllq mm2,16 ; 6c
|
|
psrlq mm0,1 ; 4F & 5C: (R4+R5+1)/2 (dirty)
|
|
paddb mm2,mm4 ; 6d
|
|
pand mm0,[ebx] ; 4G & 5D: (R4+R5+1)/2 (clean)
|
|
psrlq mm2,9 ; 6e
|
|
paddb mm3,mm0 ; 4H: (R3+2R4+R5+2)/2
|
|
pand mm2,mm5 ; 6f
|
|
psrlq mm3,1 ; 4I: (R3+2R4+R5+2)/4 (dirty)
|
|
paddb mm2,mm4 ; 6g
|
|
pand mm3,[ebx] ; 4J: (R3+2R4+R5+2)/4 (clean)
|
|
psubb mm2,mm6 ; 6h
|
|
psrlq mm2,1 ; 6i
|
|
sub cl,2 ; Loop control
|
|
movq [edi+ebp*4],mm3 ; 4K: Store line 4 of filtered block.
|
|
pand mm4,mm7 ; 6j
|
|
movq mm3,[eax+PITCH*3] ; 7a
|
|
pand mm2,mm5 ; 6k
|
|
por mm2,mm4 ; 6l
|
|
movq mm4,mm3 ; 7b
|
|
paddb mm1,mm2 ; 5E & 6B: R5+R6+1
|
|
psllq mm3,16 ; 7c
|
|
psrlq mm1,1 ; 5F & 6C: (R5+R6+1)/2 (dirty)
|
|
paddb mm3,mm4 ; 7d
|
|
pand mm1,[ebx] ; 5G & 6D: (R5+R6+1)/2 (clean)
|
|
psrlq mm3,9 ; 7e
|
|
paddb mm0,mm1 ; 5H: (R4+2R5+R6+2)/2
|
|
pand mm3,mm5 ; 7f
|
|
psrlq mm0,1 ; 5I: (R4+2R5+R6+2)/4 (dirty)
|
|
paddb mm3,mm4 ; 7g
|
|
pand mm0,[ebx] ; 5J: (R4+2R5+R6+2)/4 (clean)
|
|
psubb mm3,mm6 ; 7h
|
|
psrlq mm3,1 ; 7i
|
|
pand mm4,mm7 ; 7j
|
|
movq [edi+PITCH*5],mm0 ; 5K: Store line 5 of filtered block.
|
|
pand mm3,mm5 ; 7k
|
|
psubb mm2,mm6 ; 7A: R6+1
|
|
por mm3,mm4 ; 7l
|
|
paddb mm2,mm3 ; 6E: R6+R7+1
|
|
lea eax,[esi+ebp*4]
|
|
movq mm0,[esi] ; 0a: for next iteration
|
|
psrlq mm2,1 ; 6F: (R6+R7+1)/2 (dirty)
|
|
pand mm2,[ebx] ; 6G: (R6+R7+1)/2 (clean)
|
|
movq mm4,mm0 ; 0b: for next iteration
|
|
movq [edi+PITCH*7],mm3 ; 7m: Store line 7 of filtered block.
|
|
paddb mm1,mm2 ; 6H: (R5+2R6+R7+2)/2
|
|
lea edi,[edi+8] ; Advance output cursor.
|
|
psrlq mm1,1 ; 6I: (R5+2R6+R7+2)/4 (dirty)
|
|
pand mm1,[ebx] ; 6J: (R5+2R6+R7+2)/4 (clean)
|
|
psllq mm0,16 ; 0c: for next iteration
|
|
movq mm3,[esi+ebp*1] ; 1a: for next iteration
|
|
paddb mm0,mm4 ; 0d: for next iteration
|
|
movq [edi+PITCH*6-8],mm1 ; 6K: Store line 6 of filtered block.
|
|
movq mm1,mm3 ; 1b: for next iteration
|
|
psrlq mm0,9 ; 0e: for next iteration
|
|
jg SpatialFilterLoop_BlockToRight
|
|
|
|
lea esi,[esi+ebp*8-16]
|
|
lea eax,[eax+ebp*8-16]
|
|
lea edi,[edi+ebp*8-16]
|
|
mov cl,4
|
|
jl SpatialFilterLoop
|
|
|
|
SpatialFilterDone:
|
|
|
|
mov edi,TargetMacroBlockBaseAddr
|
|
mov esi,SpatiallyFilteredMB
|
|
test ch,ch
|
|
jg ReturnFromSpatialFilterForU
|
|
|
|
; Registers at this point:
|
|
; ebp -- PITCH
|
|
; esi -- Address of upper left block of spatially filtered candidate ref area.
|
|
; edi -- Address of upper left block of target.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Scratch
|
|
; ebx -- Scratch
|
|
; eax -- Loop control
|
|
; mm0-mm4 -- Scratch
|
|
; mm5,mm6 -- SWD for each block
|
|
; mm7 -- SWD for macroblock
|
|
;
|
|
|
|
movq mm0,[esi+ebp*1]
|
|
pxor mm7,mm7
|
|
mov al,3
|
|
jl ReturnFromSpatialFilterForV
|
|
|
|
ComputeSWDforSLFBlock:
|
|
|
|
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
|
|
|
|
ComputeSWDforSLFBlock_BlkToRight:
|
|
|
|
movq mm1,[esi+PITCH*3] ; Ref MB, Line 3.
|
|
psllw mm0,8 ; Extract diffs for line 1 even pels.
|
|
psubw mm1,[edi+PITCH*3] ; Diff for line 3.
|
|
pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
|
|
movq mm2,[esi+PITCH*5]
|
|
psllw mm1,8
|
|
psubw mm2,[edi+PITCH*5]
|
|
pmaddwd mm1,mm1
|
|
movq mm3,[esi+PITCH*7]
|
|
psllw mm2,8
|
|
psubw mm3,[edi+PITCH*7]
|
|
pmaddwd mm2,mm2
|
|
movq mm4,[esi] ; Ref MB, upper left blk, Line 0.
|
|
psllw mm3,8
|
|
psubw mm4,[edi] ; Diff for line 0.
|
|
paddusw mm0,mm1 ; Accumulate SWD (lines 0 and 2).
|
|
movq mm1,[esi+ebp*2]
|
|
pmaddwd mm3,mm3
|
|
psubw mm1,[edi+ebp*2]
|
|
paddusw mm0,mm2
|
|
movq mm2,[esi+ebp*4]
|
|
pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
|
|
psubw mm2,[edi+ebp*4]
|
|
paddusw mm0,mm3
|
|
movq mm3,[esi+PITCH*6]
|
|
pmaddwd mm1,mm1
|
|
psubw mm3,[edi+PITCH*6]
|
|
pmaddwd mm2,mm2
|
|
paddusw mm4,mm0
|
|
pmaddwd mm3,mm3
|
|
paddusw mm4,mm1
|
|
add esi,8
|
|
paddusw mm4,mm2
|
|
add edi,8
|
|
movq mm0,[esi+ebp*1]
|
|
paddusw mm4,mm3
|
|
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
|
|
punpckldq mm1,mm4 ; Get low order SWD accum to high order of mm1.
|
|
paddusw mm1,mm4 ; mm1[48:63] is SWD for block.
|
|
psllq mm6,32 ; Shift previous block's SWD left.
|
|
psrlq mm1,48 ; mm1 is SWD for block.
|
|
sub al,2 ; Loop control.
|
|
paddusw mm7,mm1
|
|
por mm6,mm1 ; Save current block's SWD.
|
|
movq mm4,mm5
|
|
jg ComputeSWDforSLFBlock_BlkToRight
|
|
|
|
movq mm0,[esi+PITCH*9-16]
|
|
movq mm5,mm6
|
|
lea edi,[edi+ebp*8-16]
|
|
lea esi,[esi+ebp*8-16]
|
|
mov al,4
|
|
jl ComputeSWDforSLFBlock
|
|
|
|
mov ebx,BestMBFullPelSWD ; Restore non-SLF SWD for macroblock.
|
|
mov eax,SpatialFiltDifferential
|
|
sub ebx,eax
|
|
sub edi,PITCH*16+16
|
|
movdf eax,mm7 ; SLF SWD for macroblock.
|
|
cmp eax,ebx
|
|
jge SpatialFilterNotAsGood
|
|
|
|
movdf SWDULandLR+4,mm5
|
|
psrlq mm5,32
|
|
movdf SWDURandLL+4,mm5
|
|
movdf SWDURandLL,mm6
|
|
psrlq mm6,32
|
|
movdf SWDULandLR,mm6
|
|
mov al,INTERSLF
|
|
mov ebx,SpatiallyFilteredMB
|
|
mov [edx].BlockType,al
|
|
sub esi,PITCH*8-8
|
|
mov [edx].BlkY4.PastRef,esi
|
|
mov [edx].BlkY1.PastRef,ebx
|
|
sub esi,8
|
|
add ebx,8
|
|
mov [edx].BlkY3.PastRef,esi
|
|
mov [edx].BlkY2.PastRef,ebx
|
|
|
|
SkipSpatialFiltering:
|
|
SpatialFilterNotAsGood:
|
|
ENDIF ; H261
|
|
|
|
;===============================================================================
|
|
; We've settled on the motion vector that will be used if we do indeed code the
|
|
; macroblock with inter-coding. We need to determine if some or all of the
|
|
; blocks can be forced as empty (copy). If all the blocks can be forced
|
|
; empty, we force the whole macroblock to be empty.
|
|
|
|
; For each luma block, clear its bit in the coded block pattern when its SWD
; is not above EMPTYTHRESHOLD. ebx accumulates the SWDs of the blocks that
; stay coded; a block forced empty contributes 0.
mov esi,EMPTYTHRESHOLD ; Get threshold for forcing block empty.
|
|
mov ebx,SWDULandLR ; Get SWD for block 1.
|
|
mov al,[edx].CodedBlocks
|
|
cmp ebx,esi ; Is SWD > threshold?
|
|
jg @f
|
|
|
|
and al,0FEH ; If not, indicate block 1 is NOT coded.
|
|
xor ebx,ebx ; Empty block adds nothing to the SWD sum.
|
|
|
|
@@:
|
|
|
|
mov ecx,SWDURandLL ; Get SWD for block 2.
|
|
cmp ecx,esi ; Is SWD > threshold?
|
|
jg @f
|
|
|
|
and al,0FDH ; If not, indicate block 2 is NOT coded.
|
|
xor ecx,ecx ; Empty block adds nothing to the SWD sum.
|
|
|
|
@@:
|
|
|
|
add ebx,ecx ; Sum now covers blocks 1 and 2.
|
|
mov ecx,SWDURandLL+4 ; Get SWD for block 3.
|
|
cmp ecx,esi ; Is SWD > threshold?
|
|
jg @f
|
|
|
|
and al,0FBH ; If not, indicate block 3 is NOT coded.
|
|
xor ecx,ecx ; Empty block adds nothing to the SWD sum.
|
|
|
|
@@:
|
|
|
|
add ebx,ecx ; Sum now covers blocks 1 through 3.
|
|
mov ecx,SWDULandLR+4 ; Get SWD for block 4.
|
|
cmp ecx,esi ; Is SWD > threshold?
|
|
jg @f
|
|
|
|
and al,0F7H ; If not, indicate block 4 is NOT coded.
|
|
xor ecx,ecx ; Empty block adds nothing to the SWD sum.
|
|
|
|
@@:
|
|
|
|
mov [edx].CodedBlocks,al ; Store coded block pattern.
|
|
and al,00FH ; Keep only the four luma block bits.
|
|
add ebx,ecx ; Sum now covers all four luma blocks.
|
|
cmp al,00FH ; Are any blks marked empty?
|
|
jne InterBest ; If some blks are empty, can't code as Intra
|
|
|
|
mov edi,TargetMacroBlockBaseAddr
|
|
mov [edx].SWD,ebx ; Record inter SWD for the macroblock.
|
|
cmp ebx,INTERCODINGTHRESHOLD ; Is InterSWD below inter-coding thresh?
|
|
jae CalculateIntraSWD ; If not (unsigned compare), go to CalculateIntraSWD.
|
|
|
|
InterBestX:
|
|
|
|
mov ebx,[edx].SWD
|
|
|
|
InterBest:
|
|
|
|
mov ecx,SWDTotal ; Add to total for this macroblock class.
|
|
add ecx,ebx
|
|
IFDEF H261
|
|
mov SWDTotal,ecx
|
|
ELSE ;H263
|
|
mov bl,DoAdvancedPrediction
|
|
mov SWDTotal,ecx
|
|
test bl,bl
|
|
jne OBMCDifferencing
|
|
ENDIF
|
|
|
|
;============================================================================
|
|
; Perform differencing for the non-empty luma blocks of an Inter-coded
|
|
; macroblock. This is the non-OBMC case; i.e. Advanced Prediction is
|
|
; not selected.
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Address of reference block.
|
|
; edi -- Address of target block.
|
|
; edx -- MBlockActionStream. Used as cursor over luma blocks.
|
|
; ecx -- Not in use.
|
|
; ebx -- Scratch. Used to test half pel MV resolution.
|
|
; eax[0:3] -- Coded block pattern for luma blocks.
|
|
|
|
mov cl,INTER1MV
|
|
mov ebx,TargetMacroBlockBaseAddr
|
|
mov StashBlockType,cl
|
|
test al,1 ; Don't diff block 1 if marked empty.
|
|
mov edi,ebx
|
|
je @f
|
|
|
|
mov ebx,[edx].BlkY1.MVs
|
|
mov esi,[edx].BlkY1.PastRef
|
|
|
|
call DoNonOBMCDifferencing
|
|
|
|
; (Finish differencing the last four lines.)
|
|
movq mm4,[edi+ebp*4] ; T4
|
|
psrlq mm1,1
|
|
movq mm5,[edi+PITCH*5]
|
|
psubb mm4,mm0 ; D4 = T4 - P4
|
|
movq mm0,[edi+PITCH*6]
|
|
psubb mm5,mm1
|
|
movq mm1,[edi+PITCH*7]
|
|
pand mm2,mm6
|
|
pand mm3,mm6
|
|
psrlq mm2,1
|
|
movq PelDiffsLine4,mm4 ; Store D4.
|
|
psubb mm0,mm2
|
|
movq PelDiffsLine5,mm5
|
|
psrlq mm3,1
|
|
movq PelDiffsLine6,mm0
|
|
psubb mm1,mm3
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
|
|
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
mov ebx,TargetMacroBlockBaseAddr
|
|
mov [edx].CodedBlocks,al
|
|
pop edi ; Adjust stack pointer
|
|
StackOffset TEXTEQU <0>
|
|
|
|
@@:
|
|
|
|
lea edi,[ebx+8] ; Get address of next macroblock to do.
|
|
test al,2 ; Don't diff block 2 if marked empty.
|
|
je @f
|
|
|
|
mov ebx,[edx].BlkY2.MVs
|
|
mov esi,[edx].BlkY2.PastRef
|
|
|
|
call DoNonOBMCDifferencing
|
|
|
|
; (Finish differencing the last four lines.)
|
|
movq mm4,[edi+ebp*4] ; T4
|
|
psrlq mm1,1
|
|
movq mm5,[edi+PITCH*5]
|
|
psubb mm4,mm0 ; D4 = T4 - P4
|
|
movq mm0,[edi+PITCH*6]
|
|
psubb mm5,mm1
|
|
movq mm1,[edi+PITCH*7]
|
|
pand mm2,mm6
|
|
pand mm3,mm6
|
|
psrlq mm2,1
|
|
movq PelDiffsLine4,mm4 ; Store D4.
|
|
psubb mm0,mm2
|
|
movq PelDiffsLine5,mm5
|
|
psrlq mm3,1
|
|
movq PelDiffsLine6,mm0
|
|
psubb mm1,mm3
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
|
|
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
shl bl,1
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
mov ebx,TargetMacroBlockBaseAddr
|
|
mov [edx].CodedBlocks,al
|
|
pop edi ; Adjust stack pointer
|
|
StackOffset TEXTEQU <0>
|
|
|
|
@@:
|
|
|
|
lea edi,[ebx+ebp*8] ; Get address of next macroblock to do.
|
|
test al,4 ; Don't diff block 3 if marked empty.
|
|
je @f
|
|
|
|
mov ebx,[edx].BlkY3.MVs
|
|
mov esi,[edx].BlkY3.PastRef
|
|
|
|
call DoNonOBMCDifferencing
|
|
|
|
; (Finish differencing the last four lines.)
|
|
movq mm4,[edi+ebp*4] ; T4
|
|
psrlq mm1,1
|
|
movq mm5,[edi+PITCH*5]
|
|
psubb mm4,mm0 ; D4 = T4 - P4
|
|
movq mm0,[edi+PITCH*6]
|
|
psubb mm5,mm1
|
|
movq mm1,[edi+PITCH*7]
|
|
pand mm2,mm6
|
|
pand mm3,mm6
|
|
psrlq mm2,1
|
|
movq PelDiffsLine4,mm4 ; Store D4.
|
|
psubb mm0,mm2
|
|
movq PelDiffsLine5,mm5
|
|
psrlq mm3,1
|
|
movq PelDiffsLine6,mm0
|
|
psubb mm1,mm3
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
|
|
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
shl bl,2
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
mov ebx,TargetMacroBlockBaseAddr
|
|
mov [edx].CodedBlocks,al
|
|
pop edi ; Adjust stack pointer
|
|
StackOffset TEXTEQU <0>
|
|
|
|
@@:
|
|
|
|
lea edi,[ebx+ebp*8+8] ; Get address of next macroblock to do.
|
|
test al,8 ; Don't diff block 4 if marked empty.
|
|
je NonOBMCDifferencingDone
|
|
|
|
mov ebx,[edx].BlkY4.MVs
|
|
mov esi,[edx].BlkY4.PastRef
|
|
|
|
call DoNonOBMCDifferencing
|
|
|
|
; (Finish differencing the last four lines.)
|
|
movq mm4,[edi+ebp*4] ; T4
|
|
psrlq mm1,1
|
|
movq mm5,[edi+PITCH*5]
|
|
psubb mm4,mm0 ; D4 = T4 - P4
|
|
movq mm0,[edi+PITCH*6]
|
|
psubb mm5,mm1
|
|
movq mm1,[edi+PITCH*7]
|
|
pand mm2,mm6
|
|
pand mm3,mm6
|
|
psrlq mm2,1
|
|
movq PelDiffsLine4,mm4 ; Store D4.
|
|
psubb mm0,mm2
|
|
movq PelDiffsLine5,mm5
|
|
psrlq mm3,1
|
|
movq PelDiffsLine6,mm0
|
|
psubb mm1,mm3
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
|
|
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
shl bl,3
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
pop edi ; Adjust stack pointer
|
|
mov [edx].CodedBlocks,al
|
|
|
|
StackOffset TEXTEQU <0>
|
|
NonOBMCDifferencingDone:
|
|
|
|
IFDEF H261
|
|
ELSE
|
|
mov al,IsPlainPFrame
|
|
test al,al
|
|
jne NextMacroBlock
|
|
|
|
movq mm6,C0101010101010101
|
|
pxor mm7,mm7 ; Initialize SWD accumulator
|
|
|
|
call MMxDoBFrameLumaBlocks
|
|
|
|
ENDIF
|
|
jmp NextMacroBlock
|
|
|
|
;============================================================================
|
|
; Register usage in the following internal function. This function does
|
|
; half pel motion estimation for whole macroblocks, or individual blocks.
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Address of best full pel reference macroblock. For MBME unchanged
|
|
; at exit. For BlkME, adjusted by -8-8*PITCH.
|
|
; edi -- Address of target macroblock. For MBME unchanged at exit. For BlkME,
|
|
; adjusted by -8-8*PITCH.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Reserved.
|
|
; ebx -- For MBME: 240 + Flags to indicate which half pel ME to do:
|
|
; 1 --> right; 2 --> left; 4 --> down; 8 --> up
|
|
; For BlkME: Garbage
|
|
; eax -- Count from -4 to -1 for blocks of macroblock. 0 for single block.
|
|
; mm7 -- Initialized to zero.
|
|
; mm6 -- Initialized to zero.
|
|
; mm0:mm7 -- Scratch
|
|
; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
|
|
; mm3[16:31] -- SWD for ref 1/2 pel leftward
|
|
; mm3[32:47] -- SWD for ref 1/2 pel downward
|
|
; mm3[48:63] -- SWD for ref 1/2 pel upward
|
|
|
|
StackOffset TEXTEQU <4>
|
|
HalfPelMotionEstimation:
; Scores the four half-pel candidates (right, left, down, up) around the
; full-pel reference at esi against the target at edi. Each pass of the
; inner loop handles two lines, squaring differences on alternating (odd /
; even) pels as noted in the per-instruction comments.
;   mm7 accumulates the rightward and leftward SSDs.
;   mm6 accumulates the downward and upward SSDs (spilled to StashMM6 while
;       mm6 is borrowed as scratch).
; Loop control lives in the upper bits of bl:
;   add bl,080H / jnc  -- loops once (two passes = 4 lines),
;   add bl,040H / jns  -- loops once more (four passes = 8 lines = a block).
; MMX instructions do not modify EFLAGS, so the conditional jumps below test
; the flags of the integer add that precedes them by several instructions.
|
|
|
|
and bl,15 ; Keep only the half-pel flags; clears the pass-counter bits.
|
|
|
|
HalfPelMBMEForUpperBlock:
|
|
HalfPelMEForFirst2LinesOfBlock:
|
|
|
|
movq mm0,[esi-PITCH] ; <P^7 P^6 P^5 P^4 P^3 P^2 P^1 P^0>
|
|
movq mm1,[esi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
movq mm4,[edi+ebp*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
|
|
paddb mm0,mm1 ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>
|
|
|
|
HalfPelMEForNext2LinesOfBlock:
|
|
|
|
movq mm2,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
|
|
psrlw mm0,1 ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
|
|
movq mm5,mm1 ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
psllw mm4,8 ; <T16 0 T14 0 T12 0 T10 0>
|
|
|
|
HalfPelMBMEForLowerBlock:
|
|
|
|
psubw mm0,[edi] ; <(P^7+P07)/2-T07 junk (P^5+P05)/2-T05 junk ...>
|
|
paddb mm5,mm2 ; <P07+P17 P06+P16 P05+P15 P04+P14 ...>
|
|
pmullw mm1,C0101010101010101 ; <(P07+P06)*256+P06 ...>
|
|
psllw mm5,8 ; <(P06+P16) 0 (P04+P14) 0 ...>
|
|
pmaddwd mm0,mm0 ; Square diff for line 0 odd pels, upward ref.
|
|
psrlw mm5,1 ; <(P06+P16)/2 0 (P04+P14)/2 0 ...>
|
|
movq mm3,[edi] ; <T07 T06 T05 T04 T03 T02 T01 T00>
|
|
psubw mm4,mm5 ; <T16-(P06+P16)/2 junk ...>
|
|
pmaddwd mm4,mm4 ; Square diff for line 1 even pels, upward ref.
|
|
psrlw mm1,1 ; <(P07+P06)*128+P06/2 ...>
|
|
psllw mm3,8 ; <T06 0 T04 0 T02 0 T00 0>
|
|
lea edi,[edi+ebp*2] ; Advance Target cursor
|
|
psubw mm3,mm1 ; <T06-(P07+P06)/2 junk T04-(P05+P04)/2 junk ...>
|
|
lea esi,[esi+ebp*2] ; Advance Reference cursor
|
|
psubw mm1,[edi-PITCH*2] ; <(P07+P06)/2-T07 junk (P05+P04)/2-T05 junk ...>
|
|
pmaddwd mm3,mm3 ; Square diff for line 0 even pels, rightward ref.
|
|
pmaddwd mm1,mm1 ; Square diff for line 0 odd pels, leftward ref.
|
|
paddusw mm0,mm4 ; SSD for line 0 and 1, upward ref.
|
|
pand mm0,CFFFF0000FFFF0000 ; Extract SSD for line 0 and 1, upward ref.
|
|
movq mm4,mm2 ; <P17 P16 P15 P14 P13 P12 P11 P10>
|
|
paddusw mm6,mm0 ; Accumulate SSD for line 0 and 1, upward ref.
|
|
psrlq mm4,8 ; < 0 P17 P16 P15 P14 P13 P12 P11>
|
|
pand mm1,CFFFF0000FFFF0000 ; Extract SSD for line 0, leftward ref.
|
|
psrld mm3,16 ; Extract SSD for line 0, rightward ref.
|
|
pmullw mm4,C0200010101010101 ; <P17*256*2 (P16+P15)*256+P15 ...>
|
|
paddw mm3,mm1 ; SSD for line 0, leftward and rightward refs.
|
|
movq mm1,[esi] ; <P27 P26 P25 P24 P23 P22 P21 P20>
|
|
movq mm0,mm2 ; <P17 P16 P15 P14 P13 P12 P11 P10>
|
|
paddusw mm7,mm3 ; Accumulate SSD for line 0, left and right refs.
|
|
paddb mm2,mm1 ; <P17+P27 P16+P26 P15+P25 P14+P24 ...>
|
|
movq mm3,mm0 ; <P17 P16 P15 P14 P13 P12 P11 P10>
|
|
psrlw mm4,1 ; <P17 (P16+P15)*128+P15/2 ...>
|
|
psubw mm4,[edi-PITCH*1] ; <P17-T17 junk (P16+P15)/2-T15 junk ...>
|
|
psllq mm3,8 ; <P16 P15 P14 P13 P12 P11 P10 0>
|
|
pmullw mm3,C0101010101010002 ; <(P16+P15)*256+P15 ... P10*256*2>
|
|
psrlw mm2,1 ; <(P17+P27)/2 junk (P15+P25)/2 junk ...>
|
|
movq StashMM6,mm6 ; Spill up/down accumulator; mm6 is used as scratch next.
|
|
pmaddwd mm4,mm4 ; Square diff for line 1 odd pels, rightward ref.
|
|
movq mm6,[edi-PITCH*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
|
|
psrlw mm3,1 ; <(P16+P15)*128+P15/2 ... P10*256>
|
|
psubw mm2,[edi-PITCH*1] ; <(P17+P27)/2-T17 junk (P15+P25)/2-T15 junk ...>
|
|
psllw mm6,8 ; <T16 0 T14 0 T12 0 T10 0>
|
|
psubw mm3,mm6 ; <(P16+P15)/2-T16 junk ... P10-T10>
|
|
psrld mm4,16 ; Extract SSD for line 1, rightward ref.
|
|
movq mm6,[edi-PITCH*2] ; <T07 T06 T05 T04 T03 T02 T01 T00>
|
|
pmaddwd mm3,mm3 ; Square diff for line 1 even pels, leftward ref.
|
|
pmaddwd mm2,mm2 ; Square diff for line 1 odd pels, downward ref.
|
|
psllw mm6,8 ; <T06 0 T04 0 T02 0 T00 0>
|
|
paddusw mm7,mm4 ; Accumulate SSD for line 1, rightward ref.
|
|
psubw mm6,mm5 ; <T06-(P06+P16)/2 junk ...>
|
|
pand mm3,CFFFF0000FFFF0000 ; Extract SSD for line 1, leftward ref.
|
|
pmaddwd mm6,mm6 ; Square diff for line 0 even pels, downward ref.
|
|
add bl,080H ; Pass counter: carry is set on every second pass.
|
|
psrld mm2,16 ; Extract SSD for line 1, downward ref.
|
|
paddusw mm2,StashMM6 ; Accumulate SSD for line 1, downward ref.
|
|
paddusw mm7,mm3 ; Accumulate SSD for line 1, leftward ref.
|
|
movq mm4,[edi+ebp*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
|
|
psrld mm6,16 ; Extract SSD for line 0, downward ref.
|
|
paddusw mm6,mm2 ; Accumulate SSD for line 0, downward ref.
|
|
paddb mm0,mm1 ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>
|
|
punpckldq mm5,mm6 ; Speculatively start to accum partial SWDs.
|
|
jnc HalfPelMEForNext2LinesOfBlock ; Iterate twice, for half a block.
|
|
|
|
punpckldq mm3,mm7
|
|
add bl,040H ; Sign bit of bl becomes set only after the second half block.
|
|
paddusw mm5,mm6
|
|
jns HalfPelMEForNext2LinesOfBlock ; Iterate twice, for a whole block.
|
|
|
|
paddusw mm3,mm7
|
|
psrlw mm0,1 ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
|
|
movq mm2,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
|
|
punpckhdq mm3,mm5 ; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
|
|
; ; mm3[16:31] -- SWD for ref 1/2 pel leftward
|
|
; ; mm3[32:47] -- SWD for ref 1/2 pel downward
|
|
; ; mm3[48:63] -- SWD for ref 1/2 pel upward
|
|
movq mm5,mm1 ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
sub bl,080H ; Clear the pass-counter bits again; bl is back to the flag nibble.
|
|
movq HalfPelMBMESWDAccum[eax*8+32],mm3 ; Slot chosen by block counter eax (-4..-1, or 0).
|
|
psllw mm4,8 ; <T16 0 T14 0 T12 0 T10 0>
|
|
add eax,2 ; eax visits -4,-2 (left column) then -3,-1 (right column).
|
|
jl HalfPelMBMEForLowerBlock ; Iterate twice for 2 blocks.
|
|
|
|
; lea does not alter flags, so the je below still tests the result of
; "add eax,2": it is taken only when eax reached exactly zero.
lea edi,[edi-PITCH*16+8]
|
|
lea esi,[esi-PITCH*16+8]
|
|
lea eax,[eax-3]
|
|
je HalfPelMBMEForUpperBlock ; Iterate twice for macroblock.
|
|
|
|
; Undo the two +8 column steps (see the header for the net esi/edi
; adjustment in the single-block case) and return the flag nibble in eax.
sub edi,16
|
|
xor eax,eax
|
|
sub esi,16
|
|
mov al,bl
|
|
ret
|
|
|
|
StackOffset TEXTEQU <0>
|
|
|
|
;============================================================================
|
|
; Register usage in the following internal function. This function does
|
|
; half pel motion estimation in both directions for whole macroblocks, or
|
|
; individual blocks.
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Address of best full pel reference macroblock. For MBME unchanged
|
|
; at exit. For BlkME, adjusted by -8-8*PITCH.
|
|
; edi -- Address of target macroblock. For MBME unchanged at exit. For BlkME,
|
|
; adjusted by -8-8*PITCH.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Reserved. Contains motion vectors.
|
|
; ebx -- Returns SWD for this reference block or macroblock.
|
|
; al -- Count from 4 to 1 for blocks of macroblock. 1 for blk only.
|
|
; mm0:mm6 -- Scratch
|
|
; mm7 -- Reserved. Contains SWDs for four 1/2 pel refs at main compass points.
|
|
; mm4 -- Returns SWD for this reference block or macroblock.
|
|
|
|
StackOffset TEXTEQU <4>
|
|
HalfPelMotionEstimationBothWays:
; Scores a single reference that is interpolated both horizontally and
; vertically (four-pel average) against the target at edi. Two lines are
; handled per pass; mm6 accumulates the squared differences and is zeroed
; only once, here at entry.
; Loop control lives in al:
;   add al,080H / jnc  -- loops once (two passes = 4 lines),
;   add al,040H / jns  -- loops once more (four passes = 8 lines = a block),
;   sub al,082H        -- removes the counter bits and steps the block count
;                         by -2; "mov al,3" then selects the second column.
; MMX instructions do not modify EFLAGS, so each conditional jump tests the
; integer add/sub that precedes it.
|
|
|
|
movq mm3,C0101010101010101
|
|
pxor mm6,mm6 ; Zero out SSD accumulator.
|
|
|
|
HalfPelMBMEForUpperBlockBothWays:
|
|
HalfPelMEForFirst2LinesOfBlockBothWays:
|
|
|
|
movq mm0,[esi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
|
|
HalfPelMEForNext2LinesOfBlockBothWays:
|
|
HalfPelMBMEForLowerBlockBothWays:
|
|
|
|
movq mm1,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
|
|
pmullw mm0,mm3 ; <(P07+P06)*256+P06 ...>
|
|
movq mm2,[esi+ebp*2] ; <P27 P26 P25 P24 P23 P22 P21 P20>
|
|
pmullw mm3,mm1 ; <(P17+P16)*256+P16 ...>
|
|
movq mm4,mm2 ; <P27 P26 P25 P24 P23 P22 P21 P20>
|
|
psrlq mm2,8 ; < 0 P27 P26 P25 P24 P23 P22 P21>
|
|
pmullw mm2,C0200010101010101 ; <P27*256*2 (P26+P25)*256+P25 ...>
|
|
psrlq mm1,8 ; < 0 P17 P16 P15 P14 P13 P12 P11>
|
|
pmullw mm1,C0200010101010101 ; <P17*256*2 (P16+P15)*256+P15 ...>
|
|
psrlw mm3,2 ; <(P17+P16)/4 junk ...> (w/ 2 frac bits)
|
|
movq mm5,[edi] ; <T07 T06 T05 T04 T03 T02 T01 T00>
|
|
psrlw mm0,2 ; <(P07+P06)/4 junk ...> (w/ 2 frac bits)
|
|
paddw mm3,mm0 ; <(P07+P06+P17+P16)/4 junk ...>
|
|
psrlw mm2,2 ; <P27/2 junk (P26+P25)/4 junk ...>
|
|
psubw mm2,[edi+ebp*1] ; <P27/2-T17 junk (P26+P25)/4-T15 junk ...>
|
|
psrlw mm1,2 ; <P17/2 junk (P16+P15)/4 junk ...>
|
|
paddw mm2,mm1 ; <(P17+P27)/2-T17 junk (P16+P15+P26+P25)/4-T15 junk ...>
|
|
psllw mm5,8 ; <T06 0 T04 0 T02 0 T00 0>
|
|
psubw mm5,mm3 ; <T06-(P07+P06+P17+P16)/4 junk ...>
|
|
pmaddwd mm2,mm2 ; Square diffs for odd pels of line 1.
|
|
pmaddwd mm5,mm5 ; Square diffs for even pels of line 0.
|
|
movq mm0,mm4 ; <P27 P26 P25 P24 P23 P22 P21 P20>
|
|
lea edi,[edi+ebp*2] ; Advance target cursor.
|
|
lea esi,[esi+ebp*2] ; Advance reference cursor.
|
|
paddusw mm6,mm2 ; Accumulate SSD for odd pels of line 1.
|
|
add al,080H ; Pass counter: carry is set on every second pass.
|
|
movq mm3,C0101010101010101 ; Reload multiplier (mm3 was overwritten above).
|
|
paddusw mm6,mm5 ; Accumulate SSD for even pels of line 0.
|
|
punpckldq mm4,mm6 ; Speculatively start to accum partial SWDs.
|
|
jnc HalfPelMEForNext2LinesOfBlockBothWays ; Twice, for half a block.
|
|
|
|
add al,040H ; Sign bit of al becomes set only after the second half block.
|
|
paddusw mm4,mm6 ; After whole block, SSD is in mm4[48:63].
|
|
psrlq mm4,48
|
|
jns HalfPelMEForNext2LinesOfBlockBothWays ; Twice, for a whole block.
|
|
|
|
; movdf is a macro defined elsewhere; presumably it moves the low dword of
; mm4 into ebx (the header documents ebx as the returned SWD) -- confirm.
movdf ebx,mm4
|
|
sub al,082H ; Strip counter bits and step block count: 4->2, 2->0, 3->1, 1->-1.
|
|
jg HalfPelMBMEForLowerBlockBothWays ; Iterate twice for 2 blocks.
|
|
|
|
; lea and mov do not alter flags, so the je below still tests the result of
; "sub al,082H": it is taken only when the count reached exactly zero.
lea edi,[edi-PITCH*16+8]
|
|
lea esi,[esi-PITCH*16+8]
|
|
mov al,3
|
|
je HalfPelMBMEForUpperBlockBothWays ; Iterate twice for macroblock.
|
|
|
|
sub edi,16
|
|
sub esi,16
|
|
ret
|
|
|
|
StackOffset TEXTEQU <0>
|
|
|
|
;============================================================================
|
|
; Register usage in the following internal function. This function is also
|
|
; called to do frame differencing for chroma blocks.
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Address of reference block.
|
|
; edi -- Address of target block.
|
|
; edx -- Unavailable. In use by caller.
|
|
; ecx -- Not in use.
|
|
; ebx -- Motion vectors for the block. bl[0] indicates whether half-pel
|
|
; horizontal interpolation is required; bh[0] same for vertical.
|
|
; This register is then used for scratch purposes.
|
|
; eax -- Unavailable. In use by caller.
|
|
; mm0-mm5 -- Scratch
|
|
; mm6 -- 8 bytes of 0xFE
|
|
; mm7 -- 8 bytes of -1
|
|
|
|
StackOffset TEXTEQU <4>
|
|
|
|
DoNonOBMCDifferencing: ; Internal Function
; Computes target-minus-prediction differences for lines 0-3 of an 8x8
; block, stores them to PelDiffsLine0..3, and returns with the predictions
; for lines 4-7 in mm0..mm3 in the partially-finished form the callers'
; common tail expects (psrlq on mm1, pand mm6 + psrlq on mm2 and mm3).
; In the H263 build, bit 0 of bl selects horizontal half-pel interpolation
; and bit 0 of bh selects vertical; the code below this point is the
; full-pel (no interpolation) path.
|
|
|
|
pcmpeqb mm7,mm7 ; mm7 = 8 bytes of -1.
|
|
pcmpeqb mm6,mm6 ; mm6 = 8 bytes of -1 (doubled to 0xFE below).
|
|
IFDEF H261
|
|
ELSE ;H263
|
|
shr bl,1 ; Horizontal half-pel bit -> carry.
|
|
|
|
jc NonOBMCDiff_Horz
|
|
ENDIF
|
|
|
|
|
|
|
|
movq mm1,[esi+ebp*1] ; BC . . . R0Dn
|
|
paddb mm6,mm6 ; mm6 = 8 bytes of 0xFE.
|
|
IFDEF H261
|
|
ELSE ;H263
|
|
shr bh,1 ; Vertical half-pel bit -> carry.
|
|
jc NonOBMCDiff_Vert
|
|
ENDIF
|
|
|
|
psubb mm1,[edi+ebp*1] ; P1 - T1
|
|
pxor mm4,mm4
|
|
movq mm0,[edi] ; T0
|
|
psubb mm4,mm1 ; D1 = T1 - P1
|
|
psubb mm0,[esi] ; D0 = T0 - P0
|
|
movq mm2,[edi+ebp*2] ; T2
|
|
movq mm3,[edi+PITCH*3] ; T3
|
|
psubb mm2,[esi+ebp*2] ; D2 = T2 - P2
|
|
psubb mm3,[esi+PITCH*3] ; D3 = T3 - P3
|
|
movq PelDiffsLine0,mm0 ; Store D0.
|
|
movq PelDiffsLine1,mm4 ; Store D1.
|
|
movq PelDiffsLine2,mm2 ; Store D2.
|
|
movq PelDiffsLine3,mm3 ; Store D3.
|
|
; Lines 5-7 are doubled so that the halving the caller applies on return
; (shared with the interpolating paths) yields the plain reference pels.
; NOTE(review): paddb followed by the caller's psrlq only round-trips if
; every pel fits in 7 bits -- confirm against the frame format.
movq mm3,[esi+PITCH*7] ; P7
|
|
movq mm2,[esi+PITCH*6] ; P6
|
|
paddb mm3,mm3 ; Double so that return will fix it.
|
|
movq mm1,[esi+PITCH*5] ; P5
|
|
paddb mm2,mm2 ; Double so that return will fix it.
|
|
movq mm0,[esi+ebp*4] ; P4
|
|
paddb mm1,mm1 ; Double so that return will fix it.
|
|
ret
|
|
|
|
IFDEF H261
|
|
ELSE ;H263
|
|
NonOBMCDiff_Vert: ; 0123 Detail for 0
; Vertical half-pel path of DoNonOBMCDifferencing. Entered with mm1 already
; holding reference line 1, mm6 = 0xFE bytes and mm7 = -1 bytes.
; Gets predictions for lines 0-3, stores D0..D3, then sets up mm0/mm1 for
; lines 4-7 and falls through into Get4MoreLinesOfPred_InterpVert, whose
; ret returns directly to the caller of DoNonOBMCDifferencing.
|
|
|
|
movq mm0,[esi] ; C. . R0Up
|
|
psubb mm1,mm7 ; DD . R0Dn+1
|
|
|
|
call Get4LinesOfPred_InterpVert
|
|
|
|
; The psrlq / pand steps interleaved below finish the predictions that the
; helper left half-done for lines 1-3.
movq mm5,[edi] ; T0
|
|
psrlq mm1,1 ; O .
|
|
movq mm7,[edi+ebp*1] ; T1 (mm7 is borrowed; restored to -1 below).
|
|
psubb mm5,mm0 ; D0 = T0 - P0
|
|
movq mm0,mm4 ; Reference line 4 becomes the "upper" line for the next call.
|
|
psubb mm7,mm1
|
|
movq mm1,[edi+ebp*2]
|
|
pand mm2,mm6 ; .N.
|
|
movq mm4,[edi+PITCH*3]
|
|
pand mm3,mm6 ; . N
|
|
psrlq mm2,1 ; .O.
|
|
movq PelDiffsLine0,mm5 ; Store D0.
|
|
psubb mm1,mm2
|
|
movq PelDiffsLine1,mm7 ; Store D1.
|
|
psrlq mm3,1 ; . O
|
|
movq PelDiffsLine2,mm1 ; Store D2.
|
|
psubb mm4,mm3
|
|
movq mm1,[esi+ebp*1] ; BC . . . R0Dn
|
|
pcmpeqb mm7,mm7 ; Restore mm7 = 8 bytes of -1.
|
|
movq PelDiffsLine3,mm4 ; Store D3.
|
|
psubb mm1,mm7 ; DD . . . R0Dn+1
|
|
; jmp Get4MoreLinesOfPred_InterpVert
|
|
|
|
;===========================================================================
|
|
; Internal function to get 4 lines of prediction, interpolating in the
|
|
; vertical direction. The first 3 lines of the function are scheduled into
|
|
; the caller's space, and so are commented out here. For 8 lines of prediction,
|
|
; a second call, to the second entry point, is called after consuming the
|
|
; outputs of the first function call. Certain registers must remain intact
|
|
; to convey information from the first call to the second.
|
|
;
|
|
; ebp -- PITCH
|
|
; edi -- Points to target block.
|
|
; esi -- Points to Upper left corner of 8 column, 9 row block that will be
|
|
; interpolated vertically to generate prediction.
|
|
; edx -- Reserved (MBlockActionStream)
|
|
; ecx -- Not in use.
|
|
; ebx -- Will be used.
|
|
; eax -- Reserved.
|
|
; mm6 -- 8 bytes of 0xFE.
|
|
; mm7 -- 8 bytes of -1.
|
|
; mm0-mm5 -- Scratch.
|
|
|
|
StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
|
|
Get4LinesOfPred_InterpVert: ; 0123 Details for line 0
; Both entry points expect mm0 = upper reference line and mm1 = next
; reference line plus 1 (the three commented-out instructions below are
; performed by the caller). On return:
;   mm0 -- finished prediction for line 0,
;   mm1 -- line 1 sum, masked with mm6; caller still shifts right by 1,
;   mm2, mm3 -- line 2 and 3 sums; caller still masks and shifts,
;   mm4 -- raw reference line 4,
;   esi -- advanced by four lines.
|
|
; movq mm1,[esi+ebp*1] ; BC . R0Dn
|
|
; movq mm0,[esi] ; C. . R0Up
|
|
; psubb mm1,mm7 ; DD . R0Dn+1
|
|
Get4MoreLinesOfPred_InterpVert:
|
|
movq mm2,[esi+ebp*2] ; BC.
|
|
paddb mm0,mm1 ; E. . R0Up+R0Dn+1
|
|
movq mm3,[esi+PITCH*3] ; .BC
|
|
paddb mm1,mm2 ; E .
|
|
movq mm4,[esi+ebp*4] ; . BC
|
|
psubb mm3,mm7 ; .DD
|
|
paddb mm2,mm3 ; .E.
|
|
pand mm0,mm6 ; F. . Pre-clean
|
|
paddb mm3,mm4 ; E
|
|
pand mm1,mm6 ; F .
|
|
lea esi,[esi+ebp*4] ; Advance to next four lines.
|
|
psrlq mm0,1 ; G. . P0 = (R0Up + R0Dn + 1) / 2
|
|
; pand mm2,mm6 ; G.
|
|
; psrlq mm1,1 ; H .
|
|
; pand mm3,mm6 ; G
|
|
; psrlq mm2,1 ; H.
|
|
; psrlq mm3,1 ; H
|
|
ret
|
|
StackOffset TEXTEQU <4>
|
|
|
|
;===========================================================================
|
|
|
|
NonOBMCDiff_Horz:
; Horizontal half-pel path of DoNonOBMCDifferencing. Entered with mm6 = -1
; bytes (doubled to 0xFE here). If bit 0 of bh is also set, control moves
; on to NonOBMCDiff_Both. Otherwise this gets predictions for lines 0-3,
; stores D0..D3, preloads mm5 for line 4 and falls through into
; Get4MoreLinesOfPred_InterpHorz, whose ret returns directly to the caller
; of DoNonOBMCDifferencing.
|
|
|
|
movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
|
|
paddb mm6,mm6 ; . . 8 bytes of 0xFE
|
|
shr bh,1 ; Vertical half-pel bit -> carry.
|
|
jc NonOBMCDiff_Both
|
|
|
|
; NOTE(review): mm7 is loaded here but not read again in this path; it may
; be a deliberate early touch of the T3 line -- confirm.
movq mm7,[edi+PITCH*3] ; T3
|
|
|
|
call Get4LinesOfPred_InterpHorz
|
|
|
|
; The psrlq / pand steps interleaved below finish the predictions that the
; helper left half-done for lines 1-3.
movq mm4,[edi] ; T0
|
|
psrlq mm1,1 ; O .
|
|
movq mm5,[edi+ebp*1]
|
|
psubb mm4,mm0 ; D0 = T0 - P0
|
|
movq mm0,[edi+ebp*2]
|
|
psubb mm5,mm1
|
|
movq mm1,[edi+PITCH*3]
|
|
pand mm2,mm6 ; .N.
|
|
pand mm3,mm6 ; . N
|
|
psrlq mm2,1 ; .O.
|
|
movq PelDiffsLine0,mm4 ; Store D0.
|
|
psubb mm0,mm2
|
|
movq PelDiffsLine1,mm5 ; Store D1.
|
|
psrlq mm3,1 ; . O
|
|
movq PelDiffsLine2,mm0 ; Store D2.
|
|
psubb mm1,mm3
|
|
movq mm5,[esi+1] ; <R48 R47 R46 R45 R44 R43 R42 R41>
|
|
;
|
|
movq PelDiffsLine3,mm1 ; Store D3.
|
|
;
|
|
|
|
;===========================================================================
|
|
; Internal function to get 4 lines of prediction, interpolating in the
|
|
; horizontal direction. The first line of the function is scheduled into
|
|
; the caller's space, and so are commented out here. For 8 lines of prediction,
|
|
; a second call, to the second entry point, is called after consuming the
|
|
; outputs of the first function call. Certain registers must remain intact
|
|
; to convey information from the first call to the second.
|
|
;
|
|
; ebp -- PITCH
|
|
; edi -- Points to target block.
|
|
; esi -- Points to Upper left corner of 9 column, 8 row block that will be
|
|
; interpolated horizontally to generate prediction.
|
|
; edx -- Reserved (MBlockActionStream)
|
|
; ecx -- Not in use.
|
|
; ebx -- Will be used.
|
|
; eax -- Reserved.
|
|
; mm6 -- 8 bytes of 0xFE.
|
|
; mm0-mm5 -- Will be used.
|
|
|
|
StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
|
|
Get4LinesOfPred_InterpHorz:
|
|
Get4MoreLinesOfPred_InterpHorz:
; Both entry points expect mm5 = the eight reference pels starting at
; [esi+1] (loaded by the caller, see the commented-out movq below).
; For each of four lines the pels at offsets 1..8 are added to the pels at
; offsets 0..7: the shifted copy supplies offsets 1..7, and the Pel_Rnd
; entry indexed by the pel at offset 0 supplies, per the existing comments,
; that pel together with the +1 rounding term.
; On return:
;   mm0 -- finished prediction for line 0,
;   mm1 -- line 1 sum, masked with mm6; caller still shifts right by 1,
;   mm2, mm3 -- line 2 and 3 sums; caller still masks and shifts,
;   ebx -- clobbered (table index), esi -- advanced by four lines.
|
|
|
|
; movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
|
|
xor ebx,ebx ; . .
|
|
movq mm0,mm5 ; B. . <R08 R07 R06 R05 R04 R03 R02 R01>
|
|
mov bl,[esi] ; C. . R00
|
|
psllq mm5,8 ; D. . <R07 R06 R05 R04 R03 R02 R01 0>
|
|
movq mm1,[esi+ebp*1+1] ; A .
|
|
paddb mm0,mm5 ; E. . <R08+R07 ... R02+R01 R01 >
|
|
paddb mm0,Pel_Rnd[ebx*8] ; F. . <R08+R07+1 ... R02+R01+1 R01+R00+1>
|
|
movq mm4,mm1 ; B .
|
|
mov bl,[esi+ebp*1] ; C .
|
|
psllq mm4,8 ; D .
|
|
movq mm2,[esi+ebp*2+1] ; A.
|
|
paddb mm1,mm4 ; E .
|
|
paddb mm1,Pel_Rnd[ebx*8] ; F .
|
|
movq mm5,mm2 ; B.
|
|
mov bl,[esi+ebp*2] ; C.
|
|
psllq mm5,8 ; D.
|
|
movq mm3,[esi+PITCH*3+1] ; A
|
|
paddb mm2,mm5 ; E.
|
|
paddb mm2,Pel_Rnd[ebx*8] ; F.
|
|
movq mm4,mm3 ; B
|
|
mov bl,[esi+PITCH*3] ; C
|
|
psllq mm4,8 ; D
|
|
paddb mm3,mm4 ; E
|
|
pand mm0,mm6 ; G. . pre-cleaned
|
|
paddb mm3,Pel_Rnd[ebx*8] ; F
|
|
psrlq mm0,1 ; H. . P0=<(R08+R07+1)/2 ... (R01+R00+1)/2>
|
|
lea esi,[esi+ebp*4] ; Advance to next four lines.
|
|
pand mm1,mm6 ; G .
|
|
; pand mm2,mm6 ; G.
|
|
; psrlq mm1,1 ; H .
|
|
; pand mm3,mm6 ; G
|
|
; psrlq mm2,1 ; H.
|
|
; psrlq mm3,1 ; H
|
|
ret
|
|
StackOffset TEXTEQU <4>
|
|
|
|
; The steps commented out above are scheduled into the mem-ops the caller has
|
|
; to do at the point of return. As though these ops were done, the registers
|
|
; look as follows:
|
|
; mm0 -- Prediction for line 0.
|
|
; mm1 -- Prediction for line 1.
|
|
; mm2 -- Prediction for line 2.
|
|
; mm3 -- Prediction for line 3.
|
|
; mm6 -- 8 bytes of 0xFE. Must be this when computing pred for next 4 lines.
|
|
;=============================================================================
|
|
|
|
NonOBMCDiff_Both:
; Path of DoNonOBMCDifferencing taken when both the horizontal and the
; vertical half-pel bits are set. Entered from NonOBMCDiff_Horz with
; mm5 = pels at [esi+1] and mm6 = 0xFE bytes.
; Gets predictions for lines 0-3, stores D0..D3, carries mm4/mm5 over for
; line 4 and jumps to Get4MoreLinesOfPred_InterpBoth, whose ret returns
; directly to the caller of DoNonOBMCDifferencing.
|
|
|
|
call Get4LinesOfPred_InterpBoth
|
|
|
|
; The psrlq / pand steps interleaved below finish the predictions that the
; helper left half-done for lines 1-3.
movq mm7,[edi] ; T0
|
|
psrlq mm1,1 ; O .
|
|
psubb mm7,mm0 ; D0 = T0 - P0
|
|
pand mm2,mm6 ; .N.
|
|
movq mm0,[edi+ebp*1]
|
|
psrlq mm2,1 ; .O.
|
|
movq PelDiffsLine0,mm7 ; Store D0.
|
|
psubb mm0,mm1
|
|
movq mm7,[edi+ebp*2]
|
|
pand mm3,mm6 ; . N
|
|
movq PelDiffsLine1,mm0 ; Store D1.
|
|
psrlq mm3,1 ; . O
|
|
movq mm1,[edi+PITCH*3]
|
|
psubb mm7,mm2
|
|
psubb mm1,mm3
|
|
movq mm0,mm4 ; Line 4's low-bit mask becomes the "previous line" mask.
|
|
movq PelDiffsLine2,mm7 ; Store D2.
|
|
paddb mm5,mm5 ; . . Prepare for use for next 4 lines.
|
|
movq PelDiffsLine3,mm1 ; Store D3.
|
|
pcmpeqb mm7,mm7 ; Restore mm7 = 8 bytes of -1.
|
|
jmp Get4MoreLinesOfPred_InterpBoth
|
|
|
|
;===========================================================================
|
|
; Internal function to get 4 lines of prediction, interpolating in both
|
|
; directions. The first line of the function is scheduled into the
|
|
; caller's space, and so are commented out here. For 8 lines of prediction,
|
|
; a second call, to the second entry point, is called after consuming the
|
|
; outputs of the first function call. Certain registers must remain intact
|
|
; to convey information from the first call to the second.
|
|
;
|
|
; ebp -- PITCH
|
|
; edi -- Points to target block.
|
|
; esi -- Points to Upper left corner of 9*9 block that will be interpolated
|
|
; horizontally and vertically to generate prediction.
|
|
; edx -- Reserved (MBlockActionStream)
|
|
; ecx -- Not in use
|
|
; ebx -- Will be used.
|
|
; eax -- Reserved.
|
|
; mm6 -- 8 bytes of 0xFE.
|
|
; mm7 -- 8 bytes of -1.
|
|
; mm0-mm5 -- Scratch
|
|
|
|
StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
|
|
Get4LinesOfPred_InterpBoth: ; 01234 Details for line 0
; First entry point expects mm5 = the eight reference pels at [esi+1]
; (loaded by the caller) and mm6 = 8 bytes of 0xFE.
; For every reference line a horizontal sum H = R[x] + R[x+1] + 1 is formed
; (steps A-F). H is then split with mm6 into its low bit (pandn, step H)
; and its upper bits (pand, step I, shifted right in step J). A prediction
; line is built from the halved sums of two adjacent lines plus the AND of
; their low bits (steps K, L, M), then masked and halved once more.
; On return:
;   mm0 -- finished prediction for line 0,
;   mm1 -- line 1, masked with mm6; caller still shifts right by 1,
;   mm2, mm3 -- lines 2 and 3; caller still masks and shifts,
;   mm4 -- low bits of line 4's horizontal sum,
;   mm5 -- halved horizontal sum of line 4,
;   ebx -- clobbered (table index), esi -- advanced by four lines.
|
|
|
|
; movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
|
|
movq mm1,mm5 ; B. . <R08 R07 R06 R05 R04 R03 R02 R01>
|
|
xor ebx,ebx ; . .
|
|
mov bl,[esi] ; C. . R00
|
|
psllq mm5,8 ; D. . <R07 R06 R05 R04 R03 R02 R01 0>
|
|
paddb mm5,mm1 ; E. . <R08+R07 ... R02+R01 R01>
|
|
paddb mm5,Pel_Rnd[ebx*8] ; F. . <R08+R07+1 ... R02+R01+1 R01+R00+1>
|
|
movq mm0,mm6 ; G. . Mask to extract each pel's frac bit.
|
|
pandn mm0,mm5 ; H. . <(R08+R07+1)&1 ...>
|
|
pand mm5,mm6 ; I. . Pre-clean
|
|
; Second entry point: expects mm0 = low bits and mm5 = pre-cleaned (doubled)
; horizontal sum of the line at [esi], carried over from the previous call.
Get4MoreLinesOfPred_InterpBoth: ; . .
|
|
movq mm2,[esi+ebp*1+1] ; A .
|
|
psrlq mm5,1 ; J. . <(R08+R07+1)/2 ... (R01+R00+1)/2)>
|
|
xor ebx,ebx ; . .
|
|
movq mm1,mm2 ; B .
|
|
mov bl,[esi+ebp*1] ; C .
|
|
psllq mm2,8 ; D .
|
|
movq mm3,[esi+ebp*2+1] ; .A.
|
|
paddb mm2,mm1 ; E .
|
|
paddb mm2,Pel_Rnd[ebx*8] ; F .
|
|
movq mm1,mm3 ; .B.
|
|
mov bl,[esi+ebp*2] ; .C.
|
|
psllq mm3,8 ; .D.
|
|
movq mm4,[esi+PITCH*3+1] ; . A
|
|
paddb mm3,mm1 ; .E.
|
|
paddb mm3,Pel_Rnd[ebx*8] ; .F.
|
|
movq mm1,mm4 ; . B
|
|
mov bl,[esi+PITCH*3] ; . C
|
|
pand mm0,mm2 ; K. . <(R08+R07+1)&(R18+R17+1)&1 ...>
|
|
paddb mm0,mm5 ; L. . <(R08+R07+1+((R18+R17+1)&1))/2 ...>
|
|
psllq mm4,8 ; . D
|
|
movq mm5,[esi+ebp*4+1] ; . .A
|
|
paddb mm4,mm1 ; . E
|
|
paddb mm4,Pel_Rnd[ebx*8] ; . F
|
|
movq mm1,mm5 ; . .B
|
|
mov bl,[esi+ebp*4] ; . .C
|
|
psllq mm5,8 ; . .D
|
|
paddb mm5,mm1 ; . .E
|
|
movq mm1,mm6 ; G .
|
|
pandn mm1,mm2 ; H .
|
|
pand mm2,mm6 ; I .
|
|
paddb mm5,Pel_Rnd[ebx*8] ; . .F
|
|
psrlq mm2,1 ; J .
|
|
paddb mm0,mm2 ; M. . <(R08+R07+R18+R17+2)/2 ...>
|
|
pand mm1,mm3 ; K .
|
|
paddb mm1,mm2 ; L .
|
|
movq mm2,mm6 ; .G.
|
|
pandn mm2,mm3 ; .H.
|
|
pand mm3,mm6 ; .I.
|
|
pand mm0,mm6 ; N. . Pre-clean
|
|
psrlq mm3,1 ; .J.
|
|
paddb mm1,mm3 ; M .
|
|
pand mm2,mm4 ; .K.
|
|
paddb mm2,mm3 ; .L.
|
|
movq mm3,mm6 ; . G
|
|
pandn mm3,mm4 ; . H
|
|
pand mm4,mm6 ; . I
|
|
pand mm3,mm5 ; . K
|
|
psrlq mm4,1 ; . J
|
|
paddb mm2,mm4 ; .M.
|
|
paddb mm3,mm4 ; . L
|
|
movq mm4,mm6 ; . .G
|
|
psrlq mm0,1 ; O. . P0 = <(R08+R07+R18+R17+2)/4 ...>
|
|
pandn mm4,mm5 ; . .H
|
|
pand mm5,mm6 ; . .I
|
|
pand mm1,mm6 ; N .
|
|
psrlq mm5,1 ; . .J
|
|
paddb mm3,mm5 ; . M
|
|
lea esi,[esi+ebp*4] ; Advance to next four lines.
|
|
; pand mm2,mm6 ; .N.
|
|
; psrlq mm1,1 ; O .
|
|
; pand mm3,mm6 ; . N
|
|
; psrlq mm2,1 ; .O.
|
|
; paddb mm5,mm5 ; . . Prepare for use for next 4 lines.
|
|
; psrlq mm3,1 ; . O
|
|
ret
|
|
StackOffset TEXTEQU <4>
|
|
|
|
; The steps commented out above are scheduled into the mem-ops the caller has
|
|
; to do at the point of return. As though these ops were done, the registers
|
|
; look as follows:
|
|
; mm0 -- Prediction for line 0.
|
|
; mm1 -- Prediction for line 1.
|
|
; mm2 -- Prediction for line 2.
|
|
; mm3 -- Prediction for line 3.
|
|
; mm4 -- Must be moved to mm0 before computing prediction for next 4 lines.
|
|
; mm5 -- Must be doubled before computing prediction for next 4 lines.
|
|
; mm6 -- 8 bytes of 0xFE. Must be this when computing pred for next 4 lines.
|
|
; mm7 -- 8 bytes of -1. The caller restores this before the next 4 lines.
|
|
;=============================================================================
|
|
ENDIF
|
|
|
|
StackOffset TEXTEQU <0>
|
|
|
|
IFDEF H261
|
|
ELSE ;H263
|
|
OBMCDifferencing:
; H263-only path. Marks the current macroblock as having OBMC pending and,
; if the previous macroblock was already pending, finishes that one now via
; DoPendingOBMCDiff. Unless IsPlainPFrame is non-zero, the B-frame luma
; blocks of the previous macroblock are then processed with edx temporarily
; stepped back by one T_MacroBlockActionDescr.
|
|
|
|
mov al,PendingOBMC ; Do OBMC for previous block, if needed..
|
|
mov bl,1
|
|
test al,al ; Flags set here survive the mov instructions below.
|
|
mov PendingOBMC,bl ; Current macroblock is now the pending one.
|
|
mov cl,INTER1MV
|
|
je NextMacroBlock ; Nothing was pending: go on.
|
|
|
|
mov StashBlockType,cl
|
|
|
|
call DoPendingOBMCDiff
|
|
|
|
mov al,IsPlainPFrame
|
|
test al,al
|
|
jne NextMacroBlock
|
|
|
|
add edx,-SIZEOF T_MacroBlockActionDescr ; Point edx at the previous macroblock.
|
|
movq mm6,C0101010101010101
|
|
pxor mm7,mm7 ; Initialize SWD accumulator
|
|
|
|
call MMxDoBFrameLumaBlocks
|
|
|
|
sub edx,-SIZEOF T_MacroBlockActionDescr ; Restore edx to the current macroblock.
|
|
jmp NextMacroBlock
|
|
|
|
ENDIF
|
|
|
|
;============================================================================
|
|
; Calculate the IntraSWD
|
|
;
|
|
; ebp -- PITCH
|
|
; esi -- Accumulation for IntraSWD
|
|
; edi -- Address of target macroblock.
|
|
; edx -- MBlockActionStream
|
|
; ecx -- Scratch
|
|
; ebx -- Amount IntraSWD has to be less than to be the winner.
|
|
; eax -- Reserved. Holds coded blk pattern, (except undef when IntraByDecree).
|
|
; mm7 -- SWD total for macroblock.
|
|
; mm6 -- Average pel value for block 1.
|
|
; mm5 -- Average pel value for block 2.
|
|
; mm4 -- Average pel value for block 3.
|
|
; mm3 -- Average pel value for block 4.
|
|
; mm0-mm2 Scratch
|
|
;
|
|
|
|
IntraByDecree:
; Entry used when the macroblock must be intra coded: the threshold in ebx
; is preset so high that the early exits to InterBestX below are not
; expected to trigger.
|
|
|
|
mov ebx,000080000H ; Set Inter SWD artificially high.
|
|
|
|
CalculateIntraSWD:
; Computes an intra SWD for the macroblock at edi, one 8x8 block at a time.
; cl identifies the block being processed; blocks are visited in the order
; 1, 4, 2, 3 and edi is back at the macroblock base after block 3.
; After every block its SWD is subtracted from ebx; as soon as ebx drops to
; zero or below, control leaves for InterBestX. The running total is kept
; in esi and is consumed at IntraBest, which this code falls into.
|
|
|
|
sub ebx,INTRACODINGDIFFERENTIAL
|
|
mov cl,1
|
|
movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
pcmpeqb mm5,mm5
|
|
|
|
ComputeIntraSWDForNextBlock:
; Expects mm0 = line 0 of the block and mm5 = all ones.
; First half: sum a checkerboard of 32 pels (low bytes of the even lines,
; high bytes of the odd lines) into the four words of mm0.
|
|
|
|
movq mm2,[edi+ebp*2] ; <P27 P26 P25 P24 P23 P22 P21 P20>
|
|
psrlw mm5,8 ; Four words of 0x00FF: mask for the low byte of each word.
|
|
movq mm4,[edi+ebp*4]
|
|
paddw mm0,mm2 ; <junk P06+P26 junk P04+P24 ...>
|
|
movq mm6,[edi+PITCH*6]
|
|
pand mm0,mm5 ; <P06+P26 P04+P24 P02+P22 P00+P20>
|
|
movq mm1,[edi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
|
|
paddw mm4,mm6
|
|
movq mm3,[edi+PITCH*3] ; <P37 P36 P35 P34 P33 P32 P31 P30>
|
|
pand mm4,mm5
|
|
movq mm5,[edi+PITCH*5]
|
|
paddw mm1,mm3 ; <P17+P37 junk P15+P35 junk ...>
|
|
movq mm7,[edi+PITCH*7]
|
|
psrlw mm1,8 ; <P17+P37 P15+P35 P13+P33 P11+P31>
|
|
paddw mm0,mm1
|
|
paddw mm5,mm7
|
|
paddw mm0,mm4
|
|
psrlw mm5,8
|
|
paddw mm0,mm5
|
|
; Second half: the two pmaddwd steps multiply by -1 and then by -8, so each
; word of mm0 ends up as 8 * (sum of the 32 pels), i.e. the average in the
; high byte with a fraction in the low byte.
; NOTE(review): packssdw saturates at 32767, which 8*Sum only stays under
; if the pels fit in 7 bits -- confirm against the frame format.
pcmpeqw mm5,mm5 ; Get words of -1
|
|
movq mm4,[edi+ebp*4]
|
|
pmaddwd mm0,mm5 ; <SumHi = Sum3+Sum2 | SumLo = Sum1+Sum0>
|
|
pcmpeqw mm1,mm1
|
|
psllw mm3,8 ; <P36 0 P34 0 P32 0 P30 0>
|
|
movq mm5,[edi+PITCH*5]
|
|
psllw mm1,3 ; 4 words of 0xFFF8
|
|
packssdw mm0,mm0 ; <SumHi | SumLo | SumHi | SumLo>
|
|
mov al,[edx].CodedBlocks ; Fetch coded block pattern.
|
|
pmaddwd mm0,mm1 ; <Sum = SumHi+SumLo | Sum = SumHi+SumLo>
|
|
psllw mm5,8
|
|
movq mm1,[edi+ebp*1]
|
|
psllw mm7,8
|
|
;
|
|
psllw mm1,8
|
|
;
|
|
packssdw mm0,mm0 ; <Sum | Sum | Sum | Sum>
|
|
; Subtract the average from one pel per word of every line, square, and
; accumulate with saturation into mm1, then mm0.
psubw mm1,mm0 ; <P16-Avg frac P14-Avg frac ...>
|
|
psubw mm2,mm0 ; <P27-Avg frac P25-Avg frac ...>
|
|
pmaddwd mm1,mm1 ; Square of diff
|
|
psubw mm3,mm0
|
|
pmaddwd mm2,mm2
|
|
psubw mm4,mm0
|
|
pmaddwd mm3,mm3
|
|
psubw mm5,mm0
|
|
pmaddwd mm4,mm4
|
|
psubw mm6,mm0
|
|
psubw mm7,mm0
|
|
paddusw mm1,mm2
|
|
psubw mm0,[edi]
|
|
pmaddwd mm5,mm5
|
|
pmaddwd mm6,mm6
|
|
paddusw mm1,mm3
|
|
pmaddwd mm7,mm7
|
|
paddusw mm1,mm4
|
|
pmaddwd mm0,mm0
|
|
paddusw mm1,mm5
|
|
paddusw mm1,mm6
|
|
cmp cl,2 ; Flags are consumed by the jg and je below; MMX ops leave them intact.
|
|
paddusw mm1,mm7
|
|
;
|
|
paddusw mm0,mm1
|
|
;
|
|
punpckldq mm1,mm0
|
|
;
|
|
paddusw mm0,mm1
|
|
jg LowerBlkIntraDone ; cl is 3 or 4: a lower block was just finished.
|
|
|
|
psrlq mm0,48 ; Block SWD into the low word of mm0.
|
|
lea edi,[edi+ebp*8+8] ; Speculate going from blk 1 to blk 4
|
|
mov cl,4
|
|
je Blk2IntraDone ; cl was 2 at the cmp above (mov and lea keep the flags).
|
|
|
|
Blk1IntraDone:
|
|
|
|
; movdf is a macro defined elsewhere; presumably it moves the low dword of
; the MMX register into the integer register -- confirm.
movdf esi,mm0 ; First block: starts the IntraSWD total.
|
|
sub ebx,esi
|
|
jle InterBestX
|
|
|
|
movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
pcmpeqb mm5,mm5
|
|
jmp ComputeIntraSWDForNextBlock
|
|
|
|
LowerBlkIntraDone:
|
|
|
|
|
|
psrlq mm0,48 ; Block SWD into the low word of mm0.
|
|
sub edi,PITCH*8 ; Speculate going from blk 4 to blk 2
|
|
cmp cl,3
|
|
je Blk3IntraDone
|
|
|
|
Blk4IntraDone:
|
|
|
|
movdf ecx,mm0
|
|
add esi,ecx ; Accumulate IntraSWD
|
|
sub ebx,ecx
|
|
jle InterBestX
|
|
|
|
movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
pcmpeqb mm5,mm5
|
|
mov cl,2
|
|
jmp ComputeIntraSWDForNextBlock
|
|
|
|
Blk2IntraDone:
|
|
|
|
movdf ecx,mm0
|
|
add esi,ecx ; Accumulate IntraSWD
|
|
sub edi,16 ; Get to blk 3.
|
|
sub ebx,ecx
|
|
jle InterBestX
|
|
|
|
movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
|
|
pcmpeqb mm5,mm5
|
|
mov cl,3
|
|
jmp ComputeIntraSWDForNextBlock
|
|
|
|
Blk3IntraDone:
|
|
|
|
movdf ecx,mm0
|
|
add esi,ecx ; Accumulate IntraSWD
|
|
sub ebx,ecx
|
|
jle InterBestX
|
|
|
|
IntraBest:
; Reached when intra coding wins. Expects esi = IntraSWD for the macroblock
; and al = its CodedBlocks value.
; Records the decision in the action descriptor at edx (SWD, BlockType,
; zeroed motion vectors, CodedBlocks with bit 7 cleared), adds the SWD to
; SWDTotal, finishes a pending OBMC macroblock in the H263 build, and then
; runs the forward DCT on the four luma blocks of the target macroblock.
|
|
|
|
mov ecx,SWDTotal
|
|
and al,07FH ; Turn off FORCE-INTRA bit.
|
|
mov [edx].SWD,esi
|
|
add ecx,esi ; Add to total.
|
|
mov SWDTotal,ecx
|
|
mov cl,INTRA
|
|
mov [edx].BlockType,cl ; Indicate macroblock handling decision.
|
|
xor ecx,ecx
|
|
mov [edx].BlkY1.MVs,ecx
|
|
mov [edx].BlkY2.MVs,ecx
|
|
mov [edx].BlkY3.MVs,ecx
|
|
mov [edx].BlkY4.MVs,ecx
|
|
mov [edx].CodedBlocks,al
|
|
|
|
IFDEF H261
|
|
ELSE ;H263
|
|
mov al,PendingOBMC ; Do Prev MB if it needs to be OBMC'ed.
|
|
mov [edx].BestFullPelMBHMV,cl ; Kill MVs so extended EMV of other
|
|
; ; blocks will work right.
|
|
dec al ; Zero only if PendingOBMC was 1; the mov below keeps the flags.
|
|
mov [edx].BestFullPelMBVMV,cl
|
|
jne @f
|
|
|
|
mov PendingOBMC,al ; Go on to next MB, unless the prev MB
|
|
; ; needs to be finished (OBMC).
|
|
mov cl,INTER1MV
|
|
mov StashBlockType,cl
|
|
|
|
call DoPendingOBMCDiff
|
|
|
|
mov al,IsPlainPFrame
|
|
test al,al
|
|
jne @f
|
|
|
|
add edx,-SIZEOF T_MacroBlockActionDescr ; Point edx at the previous macroblock.
|
|
movq mm6,C0101010101010101
|
|
pxor mm7,mm7 ; Initialize SWD accumulator
|
|
|
|
call MMxDoBFrameLumaBlocks
|
|
|
|
sub edx,-SIZEOF T_MacroBlockActionDescr ; Restore edx to the current macroblock.
|
|
|
|
@@:
|
|
|
|
ENDIF
|
|
|
|
; Forward DCT of luma blocks 1-4 (esi = base, +8, +PITCH*8, +PITCH*8+8).
; After each call bl, shifted to the block's bit position, is subtracted
; from CodedBlocks. Presumably MMxDoForwardDCT returns bl = 1 when the
; block turns out to be empty -- confirm against that routine.
mov cl,INTRA
|
|
mov esi,TargetMacroBlockBaseAddr
|
|
mov StashBlockType,cl
|
|
push eax ; Adjust stack pointer
|
|
StackOffset TEXTEQU <4>
|
|
call MMxDoForwardDCT
|
|
mov al,[edx].CodedBlocks
|
|
mov esi,TargetMacroBlockBaseAddr
|
|
sub al,bl
|
|
add esi,8
|
|
mov [edx].CodedBlocks,al
|
|
call MMxDoForwardDCT
|
|
shl bl,1
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
mov esi,TargetMacroBlockBaseAddr
|
|
mov [edx].CodedBlocks,al
|
|
add esi,PITCH*8
|
|
call MMxDoForwardDCT
|
|
shl bl,2
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
mov esi,TargetMacroBlockBaseAddr
|
|
mov [edx].CodedBlocks,al
|
|
add esi,PITCH*8+8
|
|
call MMxDoForwardDCT
|
|
shl bl,3
|
|
mov al,[edx].CodedBlocks
|
|
sub al,bl
|
|
pop edi ; Adjust stack pointer
|
|
StackOffset TEXTEQU <0>
|
|
mov [edx].CodedBlocks,al
|
|
IFDEF H261
|
|
ELSE
|
|
; H263 build: unless this is a plain P frame, also process the B-frame luma
; blocks for this macroblock.
mov al,IsPlainPFrame
|
|
test al,al
|
|
jne NextMacroBlock
|
|
|
|
movq mm6,C0101010101010101
|
|
pxor mm7,mm7 ; Initialize SWD accumulator
|
|
|
|
call MMxDoBFrameLumaBlocks
|
|
ENDIF
|
|
|
|
jmp NextMacroBlock
|
|
|
|
|
|
IFDEF H261
|
|
ELSE; H263
|
|
StackOffset TEXTEQU <4>
|
|
DoPendingOBMCDiff: ; Internal function
|
|
|
|
;============================================================================
|
|
; Perform differencing for the non-empty luma blocks of an Inter-coded
|
|
; macroblock. This is the OBMC case; i.e. Advanced Prediction is selected.
|
|
;
; The macroblock processed is the one BEFORE the descriptor addressed by edx
; (see PrevMBAD below).  For each of its four luma blocks whose bit is set in
; CodedBlocks this routine sets up
;   esi -- the block's own motion vectors,
;   ecx -- distance to the block action descriptor (BAD) supplying the left
;          remote MV,
;   eax -- distance to the BAD supplying the right remote MV,
;   DistToBADforBlockAbove / DistToBADforBlockBelow -- same, for above/below,
; and calls DoOBMCForBlock.  Afterwards the value left in bl (shifted to the
; block's bit position) is subtracted from CodedBlocks.
; NOTE(review): bl is presumably 1 when the block came out empty -- confirm
; in MMxDoForwardDCTy, which DoOBMCForBlock tail-jumps to.
;
; edx is advanced by SIZEOF T_Blk after each block so that PrevMBAD.BlkY1
; addresses blocks 1, 2, 3, 4 in turn; macroblock-level fields are then
; reached with a compensating negative offset.  edx is restored on exit.
; mm6 and mm7 are loaded with the constants DoOBMCForBlock expects.
|
|
PrevMBAD EQU [edx-SIZEOF T_MacroBlockActionDescr]
|
|
|
|
pcmpeqb mm6,mm6
|
|
pcmpeqb mm7,mm7 ; 8 bytes of -1
|
|
paddb mm6,mm6 ; 8 bytes of 0xFE
|
|
mov al,PrevMBAD.CodedBlocks ; Bits 0- 3 set for non-empty Y blks.
|
|
test al,1 ; Check if block 1 empty.
|
|
je OBMCDoneForBlock1
|
|
|
|
|
|
; ---- Block 1 (upper left) ----
xor ebx,ebx
|
|
mov eax,SIZEOF T_Blk ; Blk to right is blk 2 of this MB.
|
|
mov bl,PrevMBAD.MBEdgeType
|
|
mov ecx,1 ; Mask to extract left edge indicator.
|
|
and ecx,ebx ; Extract left edge indicator.
|
|
and ebx,4 ; Extract top edge indicator.
|
|
mov esi,PrevMBAD.BlkY1.MVs
|
|
lea edi,[eax*2] ; Blk below is blk 3 of this MB.
|
|
mov DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.
|
|
mov edi,BlockAbove[ebx] ; Blk above is blk 3 of mb above, or off
|
|
; ; upper edge.
|
|
mov ecx,BlockToLeft[ecx*4] ; Blk to left is blk 2 of mb to the
|
|
; ; left, or off left edge.
|
|
mov DistToBADforBlockAbove,edi
|
|
call DoOBMCForBlock
|
|
mov al,PrevMBAD.CodedBlocks ; Bits 0- 3 set for non-empty Y blks.
|
|
sub al,bl ; Adjust bit 0 (block 1).
|
|
mov PrevMBAD.CodedBlocks,al
|
|
|
|
|
|
OBMCDoneForBlock1:
|
|
|
|
|
|
; ---- Block 2 (upper right) ----
add edx,SIZEOF T_Blk ; PrevMBAD.BlkY1 now addresses block 2.
|
|
test al,2 ; Check if block 2 empty.
|
|
je OBMCDoneForBlock2
|
|
|
|
|
|
xor ebx,ebx
|
|
mov eax,2 ; Mask to extract right edge indicator.
|
|
mov bl,PrevMBAD[-SIZEOF T_Blk].MBEdgeType
|
|
mov edi,2*SIZEOF T_Blk ; Blk below is blk 4 of this MB.
|
|
and eax,ebx ; Extract right edge indicator.
|
|
and ebx,4 ; Extract top edge indicator.
|
|
mov DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.
|
|
lea ecx,[edi-3*SIZEOF T_Blk] ; Blk to left is blk 1 of this MB.
|
|
mov eax,BlockToRight[eax*2] ; Blk to right is blk 1 of mb to the
|
|
; ; right, or off right edge.
|
|
mov edi,BlockAbove[ebx] ; Blk above is blk 4 of mb above, or off
|
|
; ; upper edge.
|
|
mov esi,PrevMBAD.BlkY1.MVs ; MVs of block 2 (edx was advanced).
|
|
mov DistToBADforBlockAbove,edi
|
|
call DoOBMCForBlock
|
|
shl bl,1 ; Move the result to bit 1 (block 2).
|
|
mov al,PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks
|
|
sub al,bl
|
|
mov PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks,al
|
|
|
|
|
|
OBMCDoneForBlock2:
|
|
|
|
|
|
; ---- Block 3 (lower left) ----
add edx,SIZEOF T_Blk ; PrevMBAD.BlkY1 now addresses block 3.
|
|
test al,4 ; Check if block 3 empty.
|
|
je OBMCDoneForBlock3
|
|
|
|
|
|
xor ecx,ecx
|
|
xor ebx,ebx ; Blk below is this block.
|
|
mov cl,PrevMBAD[-2*SIZEOF T_Blk].MBEdgeType
|
|
mov eax,SIZEOF T_Blk ; Blk to right is blk 4 of this MB.
|
|
and ecx,1 ; Extract left edge indicator.
|
|
mov DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.
|
|
lea edi,[eax-3*SIZEOF T_Blk] ; Blk above is blk 1 of this MB.
|
|
mov esi,PrevMBAD.BlkY1.MVs ; MVs of block 3 (edx was advanced).
|
|
mov DistToBADforBlockAbove,edi
|
|
mov ecx,BlockToLeft[ecx*4] ; Blk to left is blk 4 of mb to the
|
|
; ; left, or off left edge.
|
|
call DoOBMCForBlock
|
|
shl bl,2 ; Move the result to bit 2 (block 3).
|
|
mov al,PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks
|
|
sub al,bl
|
|
mov PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks,al
|
|
|
|
|
|
OBMCDoneForBlock3:
|
|
|
|
|
|
; ---- Block 4 (lower right) ----
add edx,SIZEOF T_Blk ; PrevMBAD.BlkY1 now addresses block 4.
|
|
test al,8 ; Check if block 4 empty.
|
|
je OBMCDoneForBlock4
|
|
|
|
|
|
xor eax,eax
|
|
xor ebx,ebx ; Blk below is this block.
|
|
mov al,PrevMBAD[-3*SIZEOF T_Blk].MBEdgeType
|
|
mov ecx,-SIZEOF T_Blk ; Blk to left is blk 3 of this MB.
|
|
and eax,2 ; Extract right edge indicator.
|
|
mov DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.
|
|
lea edi,[ecx*2] ; Blk above is blk 2 of this MB.
|
|
mov esi,PrevMBAD.BlkY1.MVs ; MVs of block 4 (edx was advanced).
|
|
mov DistToBADforBlockAbove,edi
|
|
mov eax,BlockToRight[eax*2] ; Blk to right is blk 3 of mb to the
|
|
; ; right, or off right edge.
|
|
call DoOBMCForBlock
|
|
shl bl,3 ; Move the result to bit 3 (block 4).
|
|
mov al,PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks
|
|
sub al,bl
|
|
mov PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks,al
|
|
|
|
|
|
OBMCDoneForBlock4:
|
|
|
|
|
|
sub edx,3*SIZEOF T_Blk ; Get back to MacroBlock Action Descriptor
|
|
ret
|
|
|
|
StackOffset TEXTEQU <8>
|
|
DoOBMCForBlock: ; Internal Function
|
|
|
|
; Builds the OBMC prediction for one 8x8 luma block and stores the
; target-minus-prediction differences in PelDiffs; the routine finishes by
; jumping to MMxDoForwardDCTy.  This first section decides whether the left
; and right neighbours need prediction blocks of their own (LeftPred /
; RightPred) or can share CentralPred.
;
; Register contents on entry.
|
|
; ebp -- PITCH
|
|
; esi -- Motion vectors for current block.
|
|
; ecx -- Distance from BAD of blk we're doing to BAD for block that provides
|
|
; remote MV from left.
|
|
; eax -- Distance from BAD of blk we're doing to BAD for block that provides
|
|
; remote MV from right.
|
|
; edx -- MBlockActionStream pointer, adjusted so that PrevMBAD (edx minus one
|
|
; descriptor) reaches the BAD of the blk we are doing OBMC to.
|
|
; mm7 -- 8 bytes of -1.
|
|
; mm6 -- 8 bytes of 0xFE.
|
|
; (The distances for the blocks above and below are passed in the variables
; DistToBADforBlockAbove and DistToBADforBlockBelow.)
|
|
; In the body of this code:
|
|
;
|
|
; edx -- Unchanged.
|
|
; edi -- Saved to memory. Then used for address of destination for storing
|
|
; remote prediction blocks.
|
|
; ebp -- PITCH.
|
|
; esi -- Pointer to 8*8, 8*9, 9*8, or 9*9 remote reference areas, which are
|
|
; then interpolated and stored at edi.
|
|
; ecx, eax -- Inputs are used, then these are scratch.
|
|
; ebx -- Scratch
|
|
; mm7 -- 8 bytes of -1
|
|
; mm6 -- 8 bytes of 0xFE
|
|
; mm0-mm5 -- Scratch
|
|
|
|
; Compute left remote prediction block.
|
|
|
|
; NOTE(review): masking an address with -SIZEOF T_MacroBlockActionDescr to
; find the enclosing macroblock descriptor assumes descriptors are aligned to
; their (power-of-two) size -- confirm against the structure definition.
lea edi,PrevMBAD[ecx]
|
|
and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to left.
|
|
lea ebx,CentralPred
|
|
mov AddrOfLeftPred,ebx ; Speculate that left remote MV == center MV.
|
|
mov AddrOfRightPred,ebx ; Speculate that right remote MV == center MV.
|
|
mov bl,[edi].BlockType
|
|
cmp bl,INTRA
|
|
je LeftEqCtr ; Jump if INTRA. (Use central)
|
|
|
|
|
|
mov ebx,PrevMBAD[ecx].BlkY1.MVs
|
|
and ebx,00000FFFFH ; Blk to left may have B MVs set. Clear them.
|
|
cmp esi,ebx
|
|
je LeftEqCtr ; Same MV as center: CentralPred serves as the left prediction.
|
|
|
|
|
|
; Reference address = left blk's PastRef - left blk's BlkOffset + this
; blk's BlkOffset, i.e. the left neighbour's displacement applied at this
; block's position.
mov edi,PrevMBAD[ecx].BlkY1.BlkOffset
|
|
mov esi,PrevMBAD[ecx].BlkY1.PastRef ; Get ref addr using left remote.
|
|
sub esi,edi
|
|
mov edi,PrevMBAD.BlkY1.BlkOffset
|
|
add esi,edi
|
|
lea edi,LeftPred
|
|
|
|
|
|
call GetPredForCenterLeftOrRight
|
|
|
|
|
|
; On return lines 0-3 have been stored at [edi+0..24]; lines 4-7 are in
; mm0-mm3.  mm1 is only shifted here, mm2 and mm3 are masked and shifted,
; before being stored at [edi+32..56].  The integer instructions interleaved
; below already start on the right neighbour.
pand mm2,mm6
|
|
psrlq mm1,1
|
|
movq [edi+32],mm0
|
|
psrlq mm2,1
|
|
movq [edi+40],mm1
|
|
pand mm3,mm6
|
|
movq [edi+48],mm2
|
|
psrlq mm3,1
|
|
lea ecx,PrevMBAD[eax]
|
|
and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.
|
|
mov esi,PrevMBAD.BlkY1.MVs ; Reload this block's MVs (esi was used as a pointer).
|
|
movq [edi+56],mm3
|
|
pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
|
|
|
|
|
|
; Compute right remote prediction block.
|
|
|
|
mov AddrOfLeftPred,edi ; Left prediction now lives in LeftPred.
|
|
mov bl,[ecx].BlockType
|
|
cmp bl,INTRA
|
|
je RightEqCtrButLeftNeCtr ; Jump if INTRA.(Use central)
|
|
|
|
|
|
mov ebx,PrevMBAD[eax].BlkY1.MVs
|
|
cmp esi,ebx
|
|
je RightEqCtrButLeftNeCtr ; Same MV as center: CentralPred serves as the right prediction.
|
|
|
|
|
|
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote.
|
|
mov edi,PrevMBAD[eax].BlkY1.BlkOffset
|
|
|
|
|
|
; Also entered from LeftEqCtr with esi = right blk's PastRef, edi = right
; blk's BlkOffset and ebx = right blk's MVs.
RightNeCtr:
|
|
|
|
|
|
sub esi,edi
|
|
mov edi,PrevMBAD.BlkY1.BlkOffset
|
|
add esi,edi
|
|
lea edi,RightPred
|
|
|
|
|
|
call GetPredForCenterLeftOrRight
|
|
|
|
|
|
; Store lines 4-7 of the right prediction (same post-processing as above).
pand mm2,mm6
|
|
psrlq mm1,1
|
|
movq [edi+32],mm0
|
|
psrlq mm2,1
|
|
movq [edi+40],mm1
|
|
pand mm3,mm6
|
|
movq [edi+48],mm2
|
|
psrlq mm3,1
|
|
mov AddrOfRightPred,edi ; Right prediction now lives in RightPred.
|
|
;
|
|
movq [edi+56],mm3
|
|
pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
|
|
|
|
RightEqCtrButLeftNeCtr:
|
|
|
|
; Compute central prediction block.
|
|
|
|
mov ebx,PrevMBAD.BlkY1.MVs
|
|
mov esi,PrevMBAD.BlkY1.PastRef
|
|
lea edi,CentralPred
|
|
mov eax,DistToBADforBlockBelow
|
|
|
|
call GetPredForCenterLeftOrRight
|
|
|
|
pand mm2,mm6
|
|
psrlq mm1,1
|
|
movq [edi+32],mm0
|
|
psrlq mm2,1
|
|
movq [edi+40],mm1
|
|
pand mm3,mm6
|
|
movq [edi+48],mm2
|
|
psrlq mm3,1
|
|
lea ecx,PrevMBAD[eax]
|
|
and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.
|
|
mov esi,PrevMBAD.BlkY1.MVs
|
|
movq [edi+56],mm3
|
|
pcmpeqb mm7,mm7
|
|
mov bl,[ecx].BlockType
|
|
mov ecx,PrevMBAD.BlkY1.BlkOffset
|
|
cmp bl,INTRA
|
|
je BelowEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
|
|
|
|
; Compute bottom remote prediction block.
|
|
|
|
mov ebx,PrevMBAD[eax].BlkY1.MVs
|
|
mov edi,AddrOfLeftPred
|
|
cmp esi,ebx
|
|
jne BelowNeCtr
|
|
|
|
BelowEqCtrButSidesDiffer:
|
|
|
|
paddb mm1,mm1 ; Prep mm0-3, which have ctr, for reuse below.
|
|
paddb mm2,mm2
|
|
paddb mm3,mm3
|
|
mov edi,AddrOfLeftPred
|
|
jmp BelowEqCtr
|
|
|
|
; The block below has a motion vector different from the central one, so
; fetch lines 4-7 of the prediction using the below-neighbour's MV.
; Here ecx = this block's BlkOffset, ebx = the below block's MVs,
; eax = distance to the below block's BAD, ebp = PITCH.
BelowNeCtr:
|
|
|
|
|
|
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using below remote.
|
|
mov eax,PrevMBAD[eax].BlkY1.BlkOffset
|
|
sub esi,eax ; esi = below blk's PastRef minus its own BlkOffset.
|
|
lea eax,[ecx+ebp*4] ; eax = offset of line 4 of this block (added to esi by the callee).
|
|
|
|
|
|
call GetPredForAboveOrBelow
|
|
|
|
BelowEqCtr:
|
|
|
|
; Compute difference for lines 4 thru 7.
|
|
; Lines 4 and 5: Cols 0,1,6, and 7 treated same. Cols 2-5 treated same.
|
|
|
|
mov esi,AddrOfRightPred
|
|
mov ebx,TargetFrameBaseAddress
|
|
movdt mm5,[edi+48] ; 6B: < 0 0 0 0 R63 R62 R61 R60>
|
|
pand mm2,mm6
|
|
punpckldq mm5,[esi+48+4] ; 6C: <L67 L66 L65 L64 R63 R62 R61 R60>
|
|
pand mm3,mm6
|
|
movq mm4,CFFFF00000000FFFF ; 6D: < FF FF 00 00 00 00 FF FF>
|
|
psrlq mm2,1 ; 6A: <B67 B66 B65 B64 B63 B62 B61 B60>
|
|
pand mm4,mm5 ; 6E: <L67 L66 00 00 00 00 R61 R60>
|
|
paddb mm5,mm2 ; 6F: <B67+L67 ... B65+L65 ...>
|
|
|
|
pand mm2,C0000FFFFFFFF0000 ; 6G: < 00 00 B65 B64 B63 B62 00 00>
|
|
psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>
|
|
paddb mm2,mm4 ; 6H: <L67 L66 B65 B64 B63 B62 R61 R60>
|
|
add ecx,ebx ; Address of target block.
|
|
movdt mm4,[edi+56] ; 7B: < 0 0 0 0 R73 R72 R71 R70>
|
|
psubb mm5,mm2 ; 6I: <B67 B66 L65 L64 R63 R62 B61 B60>
|
|
paddb mm5,CentralPred+48 ; 6J: <C67+B67 ... C65+L65 ...>
|
|
psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>
|
|
punpckldq mm4,[esi+56+4] ; 7C: <L77 L76 L75 L74 R73 R72 R71 R70>
|
|
pand mm5,mm6 ; 6K: <C67+B67 ... C65+L65 ...> pre-cleaned
|
|
mov eax,DistToBADforBlockAbove
|
|
psrlq mm5,1 ; 6L: <(C67+B67)/2 ... (C65+L65)/2 ...>
|
|
paddb mm2,mm5 ; 6M: <(C67+B67+2L67)/2 ...
|
|
; ; (C65+2B65+L65)/2 ...>
|
|
lea ebx,PelDiffs
|
|
movq mm5,CFF000000000000FF ; 7D: < FF 00 00 00 00 00 00 FF>
|
|
pand mm2,mm6 ; 6N: pre-cleaned
|
|
pandn mm5,CentralPred+56 ; 7E: < 00 C76 C75 C74 C73 C72 C71 00>
|
|
psrlq mm2,1 ; 6O: <(C67+B67+2L67)/4 ...
|
|
; ; (C65+2B65+L65)/4 ...>
|
|
paddb mm2,CentralPred+48 ; 6P: <(5C67+B67+2L67)/4 ...
|
|
; ; (5C65+2B65+L65)/4 ...>
|
|
paddb mm5,mm4 ; 7F: <L77 C76+L76 ...>
|
|
pand mm4,CFF000000000000FF ; 7G: <L77 00 00 00 00 00 00 L70>
|
|
psubb mm2,mm7 ; 6Q: <(5C67+B67+2L67+4)/4 ...
|
|
; ; (5C65+2B65+L65+4)/4 ...>
|
|
paddb mm4,mm5 ; 7H: <2L77 C76+L76 ...>
|
|
pand mm2,mm6 ; 6R: pre-cleaned
|
|
movq mm5,[ecx+PITCH*6] ; 6T: T6
|
|
psrlq mm2,1 ; 6S: P6 = <(5C67+B67+2L67+4)/8 ...
|
|
; ; (5C65+2B65+L65+4)/8 ...>
|
|
psubb mm5,mm2 ; 6U: D6 = T6 - P6
|
|
;
|
|
; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0
|
|
movdt mm2,[edi+32] ; 4B: < 0 0 0 0 R43 R42 R41 R40>
|
|
pand mm4,mm6 ; 7I: <2L77 C76+L76 ...> pre-cleaned
|
|
movq [ebx+6*16],mm5 ; 6V: Store D6.
|
|
psrlq mm4,1 ; 7J: <2L77/2 (C76+L76)/2 ...>
|
|
punpckldq mm2,[esi+32+4] ; 4C: <L47 L46 L45 L44 R43 R42 R41 R40>
|
|
paddb mm3,mm4 ; 7K: <(2B77+2L77)/2 (C76+2B76+L76)/2 ...>
|
|
movq mm5,CFFFF00000000FFFF ; 4D: < FF FF 00 00 00 00 FF FF>
|
|
pand mm3,mm6 ; 7L: pre-cleaned
|
|
movq mm4,CentralPred+32 ; 4E: <C47 C46 C45 C44 C43 C42 C41 C40>
|
|
psrlq mm3,1 ; 7M: <(2B77+2L77)/4 (C76+2B76+L76)/4 ...>
|
|
paddb mm3,CentralPred+56 ; 7N: <(4C77+2B77+2L77)/4
|
|
; ; (5C76+2B76+L76)/4 ...>
|
|
pand mm5,mm4 ; 4F: <C47 C46 00 00 00 00 C41 C40>
|
|
psubb mm3,mm7 ; 7O: <(4C77+2B77+2L77+4)/4
|
|
; ; (5C76+2B76+L76+4)/4 ...>
|
|
paddb mm4,mm2 ; 4G: <C47+L47 ... C45+L45 ...>
|
|
pand mm2,C0000FFFFFFFF0000 ; 4H: < 00 00 L45 L44 R43 R42 00 00>
|
|
pand mm3,mm6 ; 7P: <(4C77+2B77+2L77+4)/4
|
|
; ; (5C76+2B76+L76+4)/4 ...> pre-cleaned
|
|
paddb mm2,mm5 ; 4I: <C47 C46 L45 L44 R43 R42 C41 C40>
|
|
psrlq mm3,1 ; 7Q: P7 = <(4C77+2B77+2L77+4)/8
|
|
; ; (5C76+2B76+L76+4)/8 ...>
|
|
movdt mm5,[edi+40] ; 5B: < 0 0 0 0 R53 R52 R51 R50>
|
|
psubb mm4,mm2 ; 4J: <L47 L46 C45 C44 C43 C42 R41 R40>
|
|
punpckldq mm5,[esi+40+4] ; 5C: <L57 L56 L55 L54 R53 R52 R51 R50>
|
|
paddb mm0,mm2 ; 4K: <C47+B47 ... B45+L45 ...>
|
|
movq mm2,[ecx+PITCH*7] ; 7R: T7
|
|
pand mm0,mm6 ; 4L: <C47+B47 ... B45+L45 ...> pre-cleaned
|
|
psubb mm2,mm3 ; 7S: D7 = T7 - P7
|
|
psrlq mm0,1 ; 4M: <(C47+B47)/2 ... (B45+L45)/2 ...>
|
|
movq mm3,CFFFF00000000FFFF ; 5D: < FF FF 00 00 00 00 FF FF>
|
|
paddb mm0,mm4 ; 4N: <(C47+B47+2L47)/2 ...
|
|
; ; (2C45+B45+L45)/2 ...>
|
|
movq mm4,CentralPred+40 ; 5E: <C57 C56 C55 C54 C53 C52 C51 C50>
|
|
pand mm0,mm6 ; 4O: pre-cleaned
|
|
pand mm3,mm4 ; 5F: <C57 C56 00 00 00 00 C51 C50>
|
|
paddb mm4,mm5 ; 5G: <C57+L57 ... C55+L55 ...>
|
|
pand mm5,C0000FFFFFFFF0000 ; 5H: < 00 00 L55 L54 R53 R52 00 00>
|
|
psrlq mm0,1 ; 4P: <(C47+B47+2L47)/4 ...
|
|
; ; (2C45+B45+L45)/4 ...>
|
|
paddb mm0,CentralPred+32 ; 4Q: <(5C47+B47+2L47)/4 ...
|
|
; ; (6C45+B45+L45)/4 ...>
|
|
paddb mm5,mm3 ; 5I: <C57 C56 L55 L54 R53 R52 C51 C50>
|
|
psubb mm4,mm5 ; 5J: <L57 L56 C55 C54 C53 C52 R51 R50>
|
|
paddb mm1,mm5 ; 5K: <C57+B57 ... B55+L55 ...>
|
|
pand mm1,mm6 ; 5L: <C57+B57 ... B55+L55 ...> pre-cleaned
|
|
psubb mm0,mm7 ; 4R: <(5C47+B47+2L47+4)/4 ...
|
|
; ; (6C45+B45+L45+4)/4 ...>
|
|
pand mm0,mm6 ; 4S: pre-cleaned
|
|
psrlq mm1,1 ; 5M: <(C57+B57)/2 ... (B55+L55)/2 ...>
|
|
paddb mm1,mm4 ; 5N: <(C57+B57+2L57)/2 ...
|
|
; ; (2C55+B55+L55)/2 ...>
|
|
psrlq mm0,1 ; 4T: P4 = <(5C47+B47+2L47+4)/8 ...
|
|
; ; (6C45+B45+L45+4)/8 ...>
|
|
movq mm3,[ecx+PITCH*5] ; 5U: T5
|
|
pand mm1,mm6 ; 5O: pre-cleaned
|
|
movq mm4,[ecx+ebp*4] ; 4U: T4
|
|
psrlq mm1,1 ; 5P: <(C57+B57+2L57)/4 ...
|
|
; ; (2C55+B55+L55)/4 ...>
|
|
paddb mm1,CentralPred+40 ; 5Q: <(5C57+B57+2L57)/4 ...
|
|
; ; (6C55+B55+L55)/4 ...>
|
|
psubb mm4,mm0 ; 4V: D4 = T4 - P4
|
|
lea esi,PrevMBAD[eax]
|
|
psubb mm1,mm7 ; 5R: <(5C57+B57+2L57+4)/4 ...
|
|
; ; (6C55+B55+L55+4)/4 ...>
|
|
and esi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.
|
|
pand mm1,mm6 ; 5S: pre-cleaned
|
|
movq [ebx+7*16],mm2 ; 7T
|
|
psrlq mm1,1 ; 5T: P5 = <(5C57+B57+2L57+4)/8 ...
|
|
; ; (6C55+B55+L55+4)/8 ...>
|
|
movq [ebx+4*16],mm4 ; 4W: Store D4.
|
|
psubb mm3,mm1 ; 5V: D5 = T5 - P5
|
|
mov cl,[esi].BlockType ; Bottom bit set if above neighbor is INTRA.
|
|
mov esi,PrevMBAD.BlkY1.MVs
|
|
movq [ebx+5*16],mm3 ; 5W: Store D5.
|
|
cmp cl,INTRA
|
|
je AboveEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
|
|
|
|
; Compute top remote prediction block.
|
|
|
|
mov ebx,PrevMBAD[eax].BlkY1.MVs
|
|
and ebx,00000FFFFH ; Blk above may have B MVs set. Clear them.
|
|
mov ecx,PrevMBAD.BlkY1.BlkOffset
|
|
cmp esi,ebx
|
|
jne AboveNeCtr
|
|
|
|
AboveEqCtrButSidesDiffer:
|
|
|
|
movq mm3,CentralPred+24 ; Prep mm0-3, which have ctr, for reuse below.
|
|
movq mm2,CentralPred+16
|
|
paddb mm3,mm3
|
|
movq mm1,CentralPred+8
|
|
paddb mm2,mm2
|
|
movq mm0,CentralPred
|
|
paddb mm1,mm1
|
|
mov ecx,PrevMBAD.BlkY1.BlkOffset
|
|
jmp AboveEqCtr
|
|
|
|
; The block above has a motion vector different from the central one, so
; fetch lines 0-3 of the prediction using the above-neighbour's MV.
; Here ecx = this block's BlkOffset, ebx = the above block's MVs (B MVs
; already masked off), eax = distance to the above block's BAD.
AboveNeCtr:
|
|
|
|
|
|
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
|
|
mov eax,PrevMBAD[eax].BlkY1.BlkOffset
|
|
sub esi,eax ; esi = above blk's PastRef minus its own BlkOffset.
|
|
mov eax,ecx ; eax = offset of line 0 of this block (added to esi by the callee).
|
|
|
|
|
|
call GetPredForAboveOrBelow
|
|
|
|
AboveEqCtr:
|
|
|
|
; Compute difference for lines 0 thru 3.
|
|
|
|
mov esi,AddrOfRightPred
|
|
mov ebx,TargetFrameBaseAddress
|
|
movdt mm5,[edi+8] ; 1B: < 0 0 0 0 R13 R12 R11 R10>
|
|
psrlq mm1,1 ; 1A: <A17 A16 A15 A14 A13 A12 A11 A10>
|
|
punpckldq mm5,[esi+8+4] ; 1C: <L17 L16 L15 L14 R13 R12 R11 R10>
|
|
pand mm3,mm6
|
|
movq mm4,CFFFF00000000FFFF ; 1D: < FF FF 00 00 00 00 FF FF>
|
|
psrlq mm3,1 ; 3A: <A37 A36 A35 A34 A33 A32 A31 A30>: mm0
|
|
pand mm4,mm5 ; 1E: <L17 L16 00 00 00 00 R11 R10>
|
|
paddb mm5,mm1 ; 1F: <A17+L17 ... A15+L15 ...>
|
|
pand mm1,C0000FFFFFFFF0000 ; 1G: < 00 00 A15 A14 A13 A12 00 00>
|
|
pand mm2,mm6
|
|
paddb mm5,CentralPred+8 ; 1H: <C17+A17+L17 ... C15+A15+L15 ...>
|
|
paddb mm1,mm4 ; 1I: <L17 L16 A15 A14 A13 A12 R11 R10>
|
|
; 0A: <A07 A06 A05 A04 A03 A02 A01 A00>:mm0
|
|
movdt mm4,[edi] ; 0B: < 0 0 0 0 R03 R02 R01 R00>
|
|
psubb mm5,mm1 ; 1J: <C17+A17 ... C15+L15 ...>
|
|
punpckldq mm4,[esi+4] ; 0C: <L07 L06 L05 L04 R03 R02 R01 R00>
|
|
pand mm5,mm6 ; 1K: <C17+A17 ... C15+L15 ...> pre-cleaned
|
|
add ecx,ebx ; Address of target block.
|
|
psrlq mm5,1 ; 1L: <(C17+A17)/2 ... (C15+L15)/2 ...>
|
|
paddb mm1,mm5 ; 1M: <(C17+A17+2L17)/2 ...
|
|
; ; (C15+2A15+L15)/2 ...>
|
|
psrlq mm2,1 ; 2A: <A27 A26 A25 A24 A23 A22 A21 A20>
|
|
movq mm5,CFF000000000000FF ; 0D: < FF 00 00 00 00 00 00 FF>
|
|
pand mm1,mm6 ; 1N: pre-cleaned
|
|
pandn mm5,CentralPred ; 0E: < 00 C06 C05 C04 C03 C02 C01 00>
|
|
psrlq mm1,1 ; 1O: <(C17+A17+2L17)/4 ...
|
|
; ; (C15+2A15+L15)/4 ...>
|
|
paddb mm1,CentralPred+8 ; 1P: <(5C17+A17+2L17)/4 ...
|
|
; ; (5C15+2A15+L15)/4 ...>
|
|
paddb mm5,mm4 ; 0F: <L07 C06+L06 ...>
|
|
pand mm4,CFF000000000000FF ; 0G: <L07 00 00 00 00 00 00 L00>
|
|
psubb mm1,mm7 ; 1Q: <(5C17+A17+2L17+4)/4 ...
|
|
; ; (5C15+2A15+L15+4)/4 ...>
|
|
paddb mm4,mm5 ; 0H: <2L07 C06+L06 ...>
|
|
pand mm1,mm6 ; 1R: pre-cleaned
|
|
movq mm5,[ecx+ebp*1] ; 1T: T1
|
|
psrlq mm1,1 ; 1S: P1 = <(5C17+A17+2L17+4)/8 ...
|
|
; ; (5C15+2A15+L15+4)/8 ...>
|
|
psubb mm5,mm1 ; 1U: D1 = T1 - P1
|
|
;
|
|
movdt mm1,[edi+24] ; 3B: < 0 0 0 0 R33 R32 R31 R30>
|
|
pand mm4,mm6 ; 0I: <2L07 C06+L06 ...> pre-cleaned
|
|
movq PelDiffsLine1,mm5 ; 1V: Store D1.
|
|
psrlq mm4,1 ; 0J: <2L07/2 (C06+L06)/2 ...>
|
|
punpckldq mm1,[esi+24+4] ; 3C: <L37 L36 L35 L34 R33 R32 R31 R30>
|
|
paddb mm0,mm4 ; 0K: <(2A07+2L07)/2 (C06+2A06+L06)/2 ...>
|
|
movq mm5,CFFFF00000000FFFF ; 3D: < FF FF 00 00 00 00 FF FF>
|
|
pand mm0,mm6 ; 0L: pre-cleaned
|
|
movq mm4,CentralPred+24 ; 3E: <C37 C36 C35 C34 C33 C32 C31 C30>
|
|
psrlq mm0,1 ; 0M: <(2A07+2L07)/4 (C06+2A06+L06)/4 ...>
|
|
paddb mm0,CentralPred ; 0N: <(4C07+2A07+2L07)/4
|
|
; ; (5C06+2A06+L06)/4 ...>
|
|
pand mm5,mm4 ; 3F: <C37 C36 00 00 00 00 C31 C30>
|
|
psubb mm0,mm7 ; 0O: <(4C07+2A07+2L07+4)/4
|
|
; ; (5C06+2A06+L06+4)/4 ...>
|
|
paddb mm4,mm1 ; 3G: <C37+L37 ... C35+L35 ...>
|
|
pand mm1,C0000FFFFFFFF0000 ; 3H: < 00 00 L35 L34 R33 R32 00 00>
|
|
pand mm0,mm6 ; 0P: <(4C07+2A07+2L07+4)/4
|
|
; ; (5C06+2A06+L06+4)/4 ...> pre-cleaned
|
|
paddb mm1,mm5 ; 3I: <C37 C36 L35 L34 R33 R32 C31 C30>
|
|
psrlq mm0,1 ; 0Q: P0 = <(4C07+2A07+2L07+4)/8
|
|
; ; (5C06+2A06+L06+4)/8 ...>
|
|
movdt mm5,[edi+16] ; 2B: < 0 0 0 0 R23 R22 R21 R20>
|
|
psubb mm4,mm1 ; 3J: <L37 L36 C35 C34 C33 C32 R31 R30>
|
|
punpckldq mm5,[esi+16+4] ; 2C: <L27 L26 L25 L24 R23 R22 R21 R20>
|
|
paddb mm3,mm1 ; 3K: <C37+A37 ... A35+L35 ...>
|
|
movq mm1,[ecx] ; 0R: T0
|
|
pand mm3,mm6 ; 3L: <C37+A37 ... A35+L35 ...> pre-cleaned
|
|
psubb mm1,mm0 ; 0S: D0 = T0 - P0
|
|
psrlq mm3,1 ; 3M: <(C37+A37)/2 ... (A35+L35)/2 ...>
|
|
movq mm0,CFFFF00000000FFFF ; 2D: < FF FF 00 00 00 00 FF FF>
|
|
paddb mm3,mm4 ; 3N: <(C37+A37+2L37)/2 ...
|
|
; ; (2C35+A35+L35)/2 ...>
|
|
movq mm4,CentralPred+16 ; 2E: <C27 C26 C25 C24 C23 C22 C21 C20>
|
|
pand mm3,mm6 ; 3O: pre-cleaned
|
|
pand mm0,mm4 ; 2F: <C27 C26 00 00 00 00 C21 C20>
|
|
paddb mm4,mm5 ; 2G: <C27+L27 ... C25+L25 ...>
|
|
pand mm5,C0000FFFFFFFF0000 ; 2H: < 00 00 L25 L24 R23 R22 00 00>
|
|
psrlq mm3,1 ; 3P: <(C37+A37+2L37)/4 ...
|
|
; ; (2C35+A35+L35)/4 ...>
|
|
paddb mm3,CentralPred+24 ; 3Q: <(5C37+A37+2L37)/4 ...
|
|
; ; (6C35+A35+L35)/4 ...>
|
|
paddb mm5,mm0 ; 2I: <C27 C26 L25 L24 R23 R22 C21 C20>
|
|
psubb mm4,mm5 ; 2J: <L27 L26 C25 C24 C23 C22 R21 R20>
|
|
paddb mm2,mm5 ; 2K: <C27+A27 ... A25+L25 ...>
|
|
pand mm2,mm6 ; 2L: <C27+A27 ... A25+L25 ...> pre-cleaned
|
|
psubb mm3,mm7 ; 3R: <(5C37+A37+2L37+4)/4 ...
|
|
; ; (6C35+A35+L35+4)/4 ...>
|
|
pand mm3,mm6 ; 3S: pre-cleaned
|
|
psrlq mm2,1 ; 2M: <(C27+A27)/2 ... (A25+L25)/2 ...>
|
|
paddb mm2,mm4 ; 2N: <(C27+A27+2L27)/2 ...
|
|
; ; (2C25+A25+L25)/2 ...>
|
|
psrlq mm3,1 ; 3T: P3 = <(5C37+A37+2L37+4)/8 ...
|
|
; ; (6C35+A35+L35+4)/8 ...>
|
|
movq mm0,[ecx+ebp*2] ; 2U: T2
|
|
pand mm2,mm6 ; 2O: pre-cleaned
|
|
movq mm4,[ecx+PITCH*3] ; 3U: T3
|
|
psrlq mm2,1 ; 2P: <(C27+A27+2L27)/4 ...
|
|
; ; (2C25+A25+L25)/4 ...>
|
|
paddb mm2,CentralPred+16 ; 2Q: <(5C27+A27+2L27)/4 ...
|
|
; ; (6C25+A25+L25)/4 ...>
|
|
psubb mm4,mm3 ; 3V: D3 = T3 - P3
|
|
movq PelDiffsLine0,mm1 ; 0T
|
|
psubb mm2,mm7 ; 2R: <(5C27+A27+2L27+4)/4 ...
|
|
; ; (6C25+A25+L25+4)/4 ...>
|
|
movq PelDiffsLine3,mm4 ; 3W: Store D3.
|
|
pand mm2,mm6 ; 2S: pre-cleaned
|
|
psrlq mm2,1 ; 2T: P2 = <(5C27+A27+2L27+4)/8 ...
|
|
; ; (6C25+A25+L25+4)/8 ...>
|
|
;
|
|
psubb mm0,mm2 ; 2V: D2 = T2 - P2
|
|
;
|
|
;
|
|
;
|
|
movq PelDiffsLine2,mm0 ; 2W: Store D2.
|
|
;
|
|
jmp MMxDoForwardDCTy ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
; Reached from DoOBMCForBlock when the left neighbour is INTRA or has the
; same motion vector as the block being processed.  eax still holds the
; distance to the BAD of the right neighbour.
LeftEqCtr:
|
|
|
|
|
|
; Left remote motion vector was same as center.
|
|
; Compute right remote prediction block.
|
|
|
|
lea edi,PrevMBAD[eax]
|
|
and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.
|
|
mov esi,PrevMBAD.BlkY1.MVs
|
|
;
|
|
;
|
|
mov cl,[edi].BlockType
|
|
mov ebx,PrevMBAD[eax].BlkY1.MVs
|
|
cmp cl,INTRA
|
|
je LeftEqCtrAndRightEqCtr ; Jump if INTRA. (Use central)
|
|
|
|
|
|
cmp esi,ebx ; Flags are consumed by the "jne" below; the two movs do not alter them.
|
|
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote.
|
|
mov edi,PrevMBAD[eax].BlkY1.BlkOffset
|
|
jne RightNeCtr ; Right MV differs: build RightPred in the general path.
|
|
|
|
|
|
; Left and right remote motion vectors were same as center.
|
|
; Compute central prediction block.
|
|
|
|
LeftEqCtrAndRightEqCtr:
|
|
|
|
|
|
mov ebx,PrevMBAD.BlkY1.MVs
|
|
mov esi,PrevMBAD.BlkY1.PastRef
|
|
lea edi,CentralPred
|
|
mov eax,DistToBADforBlockBelow
|
|
|
|
|
|
call GetPredForCenterLeftOrRight
|
|
|
|
|
|
; Lines 0-3 of CentralPred were stored by the helper; finish and store
; lines 4-7, leaving them in mm0-mm3 as well.  The interleaved integer code
; looks up the neighbour below.
pand mm2,mm6
|
|
psrlq mm1,1
|
|
movq [edi+32],mm0
|
|
psrlq mm2,1
|
|
movq [edi+40],mm1
|
|
pand mm3,mm6
|
|
movq [edi+48],mm2
|
|
psrlq mm3,1
|
|
lea ecx,PrevMBAD[eax]
|
|
and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.
|
|
mov esi,PrevMBAD.BlkY1.MVs
|
|
movq [edi+56],mm3
|
|
pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
|
|
mov bl,[ecx].BlockType
|
|
mov ecx,PrevMBAD.BlkY1.BlkOffset
|
|
cmp bl,INTRA ; Flags are consumed by the "je" below; the two movs do not alter them.
|
|
mov edi,AddrOfLeftPred
|
|
mov ebx,PrevMBAD[eax].BlkY1.MVs
|
|
je BottomHalfAllSame ; Jump if INTRA. (Use central)
|
|
|
|
|
|
; Compute bottom remote prediction block.
|
|
|
|
cmp esi,ebx ; Flags are consumed by the "je" below; the two movs do not alter them.
|
|
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using below remote.
|
|
mov eax,PrevMBAD[eax].BlkY1.BlkOffset
|
|
je BottomHalfAllSame ; Below MV equals center: mm0-mm3 already hold P4-P7.
|
|
|
|
|
|
sub esi,eax ; esi = below blk's PastRef minus its own BlkOffset.
|
|
lea eax,[ecx+ebp*4] ; eax = offset of line 4 of this block.
|
|
|
|
|
|
call GetPredForAboveOrBelow
|
|
|
|
|
|
; Compute difference for lines 4 thru 7. Only the remote motion vector below
|
|
; was different than the central motion vector.
|
|
|
|
; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0
|
|
movq mm5,CentralPred+48 ; 6b: <C67 C66 C65 C64 C63 C62 C61 C60>
|
|
pand mm2,mm6
|
|
movq mm4,CentralPred+32 ; 4B: <C47 C46 C45 C44 C43 C42 C41 C40>
|
|
psrlq mm2,1 ; 6a: <B67 B66 B65 B64 B63 B62 B61 B60>
|
|
paddb mm2,mm5 ; 6c: <C67+B67 ... C65+B65 ...>
|
|
paddb mm0,mm4 ; 4C: <C47+B47>
|
|
pand mm0,mm6 ; 4D: <C47+B47> pre-cleaned
|
|
psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>
|
|
pand mm2,mm6 ; 6d: <C67+B67 ... C65+B65 ...> pre-cleaned
|
|
psrlq mm0,1 ; 4E: <(C47+B47)/2 ...>
|
|
paddb mm0,mm4 ; 4F: <(3C47+B47)/2 ...>
|
|
psrlq mm2,1 ; 6e: <(C67+B67)/2 ... (C65+B65)/2 ...>
|
|
pmullw mm2,C0001000200020001 ; 6f: <(C67+B67)/2 ... (2C65+2B65)/2 ...>
|
|
pand mm0,mm6 ; 4G: <(3C47+B47)/2 ...> pre-cleaned
|
|
pand mm3,mm6
|
|
psrlq mm0,1 ; 4H: <(3C47+B47)/4 ...>
|
|
paddb mm0,mm4 ; 4I: <(7C47+B47)/4 ...>
|
|
psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>
|
|
movq mm4,C0000FFFFFFFF0000 ; 6g: < 00 00 FF FF FF FF 00 00>
|
|
psubb mm0,mm7 ; 4J: <(7C47+B47+4)/4 ...>
|
|
pandn mm4,mm5 ; 6h: <C67 C66 00 00 00 00 C61 C60>
|
|
psubb mm5,mm7 ; 6i: <C67+1 ... C65+1 ...>
|
|
paddb mm2,mm4 ; 6j: <(3C67+B67)/2 ... (2C65+2B65)/2 ...>
|
|
pand mm0,mm6 ; 4K: <(7C47+B47+4)/4 ...> pre-cleaned
|
|
movq mm4,CentralPred+40 ; 5B
|
|
pand mm2,mm6 ; 6k: pre-cleaned
|
|
paddb mm1,mm4 ; 5C
|
|
psrlq mm0,1 ; 4L: <(7C47+B47+4)/8 ...>
|
|
pand mm1,mm6 ; 5D
|
|
psrlq mm2,1 ; 6l: <(3C67+B67)/4 ... (2C65+2B65)/4 ...>
|
|
paddb mm2,mm5 ; 6m: <(7C67+B67+4)/4 ... (6C65+2B65+4)/4...>
|
|
psrlq mm1,1 ; 5E
|
|
movq mm5,CentralPred+56 ; 7B: <C77 C76 C75 C74 C73 C72 C71 C70>
|
|
paddb mm1,mm4 ; 5F
|
|
paddb mm3,mm5 ; 7C: <C77+B77>
|
|
pand mm1,mm6 ; 5G
|
|
pand mm3,mm6 ; 7D: <C77+B77> pre-cleaned
|
|
psrlq mm1,1 ; 5H
|
|
paddb mm1,mm4 ; 5I
|
|
psrlq mm3,1 ; 7E: <(C77+B77)/2 ...>
|
|
psubb mm1,mm7 ; 5J
|
|
paddb mm3,mm5 ; 7F: <(3C77+B77)/2 ...>
|
|
pand mm1,mm6 ; 5K
|
|
psubb mm3,mm7 ; 7G: <(3C77+B77+2)/2 ...>
|
|
pand mm2,mm6 ; 6n: pre-cleaned
|
|
psrlq mm1,1 ; 5L
|
|
pand mm3,mm6 ; 7H: <(3C77+B77+2)/2 ...> pre-cleaned
|
|
psrlq mm2,1 ; 6o: <(7C67+B67+4)/8 ... (6C65+2B65+4)/8...>
|
|
psrlq mm3,1 ; 7I: <(3C77+B77+2)/4 ...>
|
|
|
|
BottomHalfAllSame:
|
|
|
|
mov ebx,TargetFrameBaseAddress
|
|
mov eax,DistToBADforBlockAbove
|
|
mov esi,PrevMBAD.BlkY1.MVs
|
|
movq mm5,[ecx+ebx+PITCH*5] ; 5M
|
|
add ecx,ebx ; Address of target block.
|
|
lea ebx,PrevMBAD[eax]
|
|
|
|
and ebx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.
|
|
psubb mm5,mm1 ; 5N
|
|
movq mm4,[ecx+ebp*4] ; 4M: T4
|
|
movq mm1,[ecx+PITCH*7] ; 7J: T7
|
|
psubb mm4,mm0 ; 4N: D4 = T4 - P4
|
|
movq mm0,[ecx+PITCH*6] ; 6p: T6
|
|
psubb mm1,mm3 ; 7K: D7 = T7 - P7
|
|
movq PelDiffsLine4,mm4 ; 4O: Store D4.
|
|
psubb mm0,mm2 ; 6q: D6 = T6 - P6
|
|
movq PelDiffsLine5,mm5 ; 5O
|
|
movq PelDiffsLine6,mm0 ; 6r
|
|
movq PelDiffsLine7,mm1 ; 7L
|
|
mov cl,[ebx].BlockType
|
|
cmp cl,INTRA
|
|
mov ecx,PrevMBAD.BlkY1.BlkOffset
|
|
mov ebx,PrevMBAD[eax].BlkY1.MVs
|
|
je SidesEqCtrAndAboveEqCtr ; Jump if INTRA. (Use central)
|
|
|
|
; Compute top remote prediction block.
|
|
|
|
and ebx,00000FFFFH ; Blk above may have B MVs set. Clear them.
|
|
cmp esi,ebx
|
|
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
|
|
mov eax,PrevMBAD[eax].BlkY1.BlkOffset
|
|
jne SidesEqCtrButAboveNeCtr
|
|
|
|
SidesEqCtrAndAboveEqCtr:
|
|
|
|
movq mm0,CentralPred
|
|
movq mm1,CentralPred+8
|
|
paddb mm0,mm0
|
|
movq mm2,CentralPred+16
|
|
paddb mm1,mm1
|
|
movq mm3,CentralPred+24
|
|
paddb mm2,mm2
|
|
jmp TopHalfAllSame
|
|
|
|
SidesEqCtrButAboveNeCtr:
|
|
|
|
sub esi,eax
|
|
mov eax,ecx
|
|
|
|
call GetPredForAboveOrBelow
|
|
|
|
; Compute difference for lines 0 thru 3. Only the remote motion vector above
|
|
; was different than the central motion vector.
|
|
|
|
movq mm5,CentralPred+8 ; 1b
|
|
pand mm3,mm6
|
|
movq mm4,CentralPred+24 ; 3B
|
|
psrlq mm3,1 ; 3A
|
|
paddb mm3,mm4 ; 3C
|
|
psrlq mm1,1 ; 1A
|
|
paddb mm1,mm5 ; 1c
|
|
pand mm3,mm6 ; 3D
|
|
pand mm1,mm6 ; 1d
|
|
psrlq mm3,1 ; 3E
|
|
paddb mm3,mm4 ; 3F
|
|
psrlq mm1,1 ; 1e
|
|
pmullw mm1,C0001000200020001 ; 1f
|
|
pand mm3,mm6 ; 3G
|
|
pand mm2,mm6
|
|
psrlq mm3,1 ; 3H
|
|
paddb mm3,mm4 ; 3I
|
|
psrlq mm2,1 ; 2a
|
|
movq mm4,C0000FFFFFFFF0000 ; 1g
|
|
psubb mm3,mm7 ; 3J
|
|
pandn mm4,mm5 ; 1h
|
|
psubb mm5,mm7 ; 1i
|
|
paddb mm1,mm4 ; 1j
|
|
pand mm3,mm6 ; 3K
|
|
movq mm4,CentralPred+16 ; 2B
|
|
pand mm1,mm6 ; 1k
|
|
paddb mm2,mm4 ; 2C
|
|
psrlq mm3,1 ; 3L
|
|
pand mm2,mm6 ; 2D
|
|
psrlq mm1,1 ; 1l
|
|
paddb mm1,mm5 ; 1m
|
|
psrlq mm2,1 ; 2E
|
|
movq mm5,CentralPred ; 0B
|
|
paddb mm2,mm4 ; 2F
|
|
paddb mm0,mm5 ; 0C
|
|
pand mm2,mm6 ; 2G
|
|
pand mm0,mm6 ; 0D
|
|
psrlq mm2,1 ; 2H
|
|
paddb mm2,mm4 ; 2I
|
|
psrlq mm0,1 ; 0E
|
|
psubb mm2,mm7 ; 2J
|
|
paddb mm0,mm5 ; 0F
|
|
pand mm2,mm6 ; 2K
|
|
psubb mm0,mm7 ; 0G
|
|
|
|
TopHalfAllSame:
|
|
|
|
mov ebx,TargetFrameBaseAddress
|
|
lea edi,[ecx+ebx]
|
|
pand mm1,mm6 ; 1n
|
|
movq mm7,[ecx+ebx] ; 0J
|
|
pand mm0,mm6 ; 0H
|
|
movq mm5,[edi+PITCH*3] ; 3M
|
|
psrlq mm2,1 ; 2L
|
|
movq mm4,[edi+ebp*2] ; 2M
|
|
psubb mm5,mm3 ; 3N
|
|
psubb mm4,mm2 ; 2N
|
|
psrlq mm1,1 ; 1o
|
|
movq mm3,[edi+ebp*1] ; 1p
|
|
psubb mm3,mm1 ; 1q
|
|
movq PelDiffsLine3,mm5 ; 3O
|
|
psrlq mm0,1 ; 0I
|
|
movq PelDiffsLine2,mm4 ; 2O
|
|
psubb mm7,mm0 ; 0K
|
|
movq PelDiffsLine1,mm3 ; 1r
|
|
movq PelDiffsLine0,mm7 ; 0L
|
|
jmp MMxDoForwardDCTy ; Block is in PelDiffs block; Pitch is 16
|
|
|
|
;=============================================================================
|
|
; This internal function computes the OBMC contribution for the reference
|
|
; block that uses the left, central, or right remote motion vector.
|
|
;
|
|
; ebp -- PITCH
|
|
; edi -- Address of where to put the contribution.
|
|
; esi -- Address of reference block.
|
|
; edx -- Reserved. MBlockActionStream
|
|
; ecx -- Unavailable.
|
|
; ebx -- Scratch. Initially the horizontal and vertical motion vectors.
|
|
; eax -- Unavailable.
|
|
; mm7 -- 8 bytes of -1
|
|
; mm6 -- 8 bytes of 0xFE
|
|
; mm0-mm5 -- Scratch
|
|
|
|
StackOffset TEXTEQU <12_ButAccessToLocalVariablesShouldNotBeNeeded>
|
|
|
|
; Dispatch on the two bits of ebx tested below (named for horizontal and
; vertical interpolation by the labels).  "shr ebx,1" moves bit 0 into the
; carry flag (horizontal case) and bit 8 into bit 7 of bl (vertical case).
; In every case lines 0-3 of the prediction are stored at [edi+0..24] and
; lines 4-7 are handed back in mm0-mm3; callers mask and/or shift mm1-mm3
; before storing them.  The interpolating cases fetch the first four lines
; with a call to Get4LinesOfPred_Interp* and fetch the remaining four by
; jumping to Get4MoreLinesOfPred_Interp*, whose return goes straight back to
; this routine's caller.
; NOTE(review): the Get4... helpers are outside this section; they are
; assumed to leave esi pointing at line 4 and to return results in mm0-mm3
; (mm4 = carried-over line for the vertical cases) -- confirm there.
GetPredForCenterLeftOrRight:
|
|
|
|
|
|
shr ebx,1 ; CF = bit 0 of the MVs; bit 8 moves into bit 7 of bl.
|
|
jc HorzInterpInCLRPred
|
|
|
|
|
|
movq mm1,[esi+ebp*1] ; Line 1 of the reference block.
|
|
and bl,080H ; Test the bit that was bit 8 of the MVs.
|
|
je NoInterpInCLRPred
|
|
|
|
|
|
VertInterpInCLRPred:
|
|
|
|
|
|
movq mm0,[esi] ; Line 0 of the reference block.
|
|
psubb mm1,mm7 ; Line 1 plus 1 in every byte (mm7 is all -1).
|
|
|
|
|
|
call Get4LinesOfPred_InterpVert
|
|
|
|
|
|
; Store lines 0-3 and set up mm0/mm1 for the next four lines.
pand mm2,mm6
|
|
psrlq mm1,1
|
|
movq [edi+0],mm0
|
|
pand mm3,mm6
|
|
movq [edi+8],mm1
|
|
psrlq mm2,1
|
|
movq mm1,[esi+ebp*1]
|
|
psrlq mm3,1
|
|
movq [edi+16],mm2
|
|
movq mm0,mm4
|
|
movq [edi+24],mm3
|
|
psubb mm1,mm7
|
|
jmp Get4MoreLinesOfPred_InterpVert
|
|
|
|
|
|
HorzInterpInCLRPred:
|
|
|
|
|
|
movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
|
|
and bl,080H ; Test the bit that was bit 8 of the MVs.
|
|
jne BothInterpInCLRPred
|
|
|
|
|
|
call Get4LinesOfPred_InterpHorz
|
|
|
|
|
|
; Store lines 0-3 and set up mm5 for the next four lines.
pand mm2,mm6
|
|
psrlq mm1,1
|
|
movq [edi+0],mm0
|
|
pand mm3,mm6
|
|
movq [edi+8],mm1
|
|
psrlq mm2,1
|
|
movq mm5,[esi+1] ; <R48 R47 R46 R45 R44 R43 R42 R41>
|
|
psrlq mm3,1
|
|
movq [edi+16],mm2
|
|
;
|
|
movq [edi+24],mm3
|
|
;
|
|
jmp Get4MoreLinesOfPred_InterpHorz
|
|
|
|
|
|
BothInterpInCLRPred:
|
|
|
|
|
|
call Get4LinesOfPred_InterpBoth
|
|
|
|
|
|
; Store lines 0-3 and set up mm0/mm1/mm5 for the next four lines.
pand mm2,mm6
|
|
psrlq mm1,1
|
|
movq [edi+0],mm0
|
|
pand mm3,mm6
|
|
movq [edi+8],mm1
|
|
psrlq mm2,1
|
|
movq mm1,[esi+ebp*1]
|
|
psrlq mm3,1
|
|
movq [edi+16],mm2
|
|
movq mm0,mm4
|
|
movq [edi+24],mm3
|
|
psubb mm1,mm7
|
|
paddb mm5,mm5
|
|
jmp Get4MoreLinesOfPred_InterpBoth
|
|
|
|
|
|
; Neither bit set: the prediction is a straight copy of the reference block.
; mm1 already holds line 1 (loaded above).
NoInterpInCLRPred:
|
|
|
|
|
|
movq mm0,[esi]
|
|
movq mm2,[esi+ebp*2]
|
|
movq mm3,[esi+PITCH*3]
|
|
movq [edi+0],mm0
|
|
movq [edi+8],mm1
|
|
movq [edi+16],mm2
|
|
movq [edi+24],mm3
|
|
; Return lines 4-7.  mm1-mm3 are doubled so the caller's shared
; "mask and shift right by 1" post-processing restores them.
movq mm3,[esi+PITCH*7]
|
|
movq mm2,[esi+PITCH*6]
|
|
paddb mm3,mm3
|
|
movq mm1,[esi+PITCH*5]
|
|
paddb mm2,mm2
|
|
movq mm0,[esi+ebp*4]
|
|
paddb mm1,mm1
|
|
ret
|
|
|
|
;=============================================================================
|
|
; This internal function computes the OBMC contribution for the reference
|
|
; block that uses the remote motion vector from block above or below.
|
|
;
|
|
; ebp -- PITCH
|
|
; edi -- Not used.
|
|
; esi -- Address of reference block (after ecx is added in).
|
|
; edx -- Reserved. MBlockActionStream
|
|
; ecx -- Unavailable. Must not be changed.
|
|
; ebx -- Scratch. Initially the horizontal and vertical motion vectors.
|
|
; eax -- Offset within frame for block being worked on.
|
|
; mm7 -- 8 bytes of -1
|
|
; mm6 -- 8 bytes of 0xFE
|
|
; mm0-mm5 -- Scratch
|
|
|
|
; Fetches four lines of prediction starting at esi+eax.  Dispatches on the
; same two bits of ebx as GetPredForCenterLeftOrRight: bit 0 (via the carry
; flag) and bit 8 (via bit 7 of bl after the shift).  The interpolating cases
; are tail-jumps to the Get4LinesOfPred_Interp* routines, which return
; directly to this routine's caller.  In the no-interpolation case the four
; lines are returned in mm0-mm3 with mm1-mm3 doubled; callers mask and/or
; shift them back.
GetPredForAboveOrBelow:
|
|
|
|
|
|
shr ebx,1 ; CF = bit 0 of the MVs; bit 8 moves into bit 7 of bl.
|
|
lea esi,[esi+eax] ; Add the block offset; lea is used so CF survives for the jc.
|
|
jc HorzInterpInABPred
|
|
|
|
|
|
movq mm1,[esi+ebp*1] ; Line 1.
|
|
movq mm0,[esi] ; Line 0.
|
|
psubb mm1,mm7 ; Line 1 plus 1 in every byte, as the vertical interpolator expects.
|
|
and bl,080H ; Test the bit that was bit 8 of the MVs.
|
|
jne Get4LinesOfPred_InterpVert
|
|
|
|
|
|
; No interpolation: undo the +1 on line 1 and return lines 0-3 directly.
movq mm2,[esi+ebp*2]
|
|
paddb mm1,mm7 ; Add -1 back: mm1 = line 1 again.
|
|
movq mm3,[esi+PITCH*3]
|
|
paddb mm1,mm1
|
|
paddb mm2,mm2
|
|
paddb mm3,mm3
|
|
ret
|
|
|
|
|
|
HorzInterpInABPred:
|
|
|
|
|
|
movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
|
|
and bl,080H ; Test the bit that was bit 8 of the MVs.
|
|
jne Get4LinesOfPred_InterpBoth
|
|
|
|
|
|
jmp Get4LinesOfPred_InterpHorz
|
|
|
|
StackOffset TEXTEQU <0>
|
|
;=============================================================================
|
|
ENDIF
|
|
|
|
; Done: exit path of the procedure.
;   * (H263 only) If PendingOBMC is non-zero, finishes the OBMC differencing
;     of the last macroblock and, unless this is a plain P frame, processes
;     its B-frame luma blocks.
;   * Clears the MMX state, restores the stack pointer saved in StashESP,
;     writes SWDTotal (and, for H263, BSWDTotal) through the pointer
;     arguments PSWDTotal / PBSWDTotal, restores the saved registers and
;     returns.
Done:
|
|
|
|
|
|
IFDEF H261
|
|
ELSE; H263
|
|
mov bl,PendingOBMC
|
|
mov cl,INTER1MV
|
|
test bl,bl
|
|
je TrulyDone ; No macroblock is waiting for OBMC.
|
|
|
|
|
|
mov StashBlockType,cl
|
|
|
|
|
|
call DoPendingOBMCDiff
|
|
|
|
|
|
mov al,IsPlainPFrame
|
|
add edx,-SIZEOF T_MacroBlockActionDescr ; Point edx at the MB that was just finished.
|
|
test al,al
|
|
jne TrulyDone ; Plain P frame: no B-frame work.
|
|
|
|
|
|
movq mm6,C0101010101010101
|
|
pxor mm7,mm7 ; Initialize SWD accumulator
|
|
|
|
|
|
call MMxDoBFrameLumaBlocks
|
|
|
|
|
|
ENDIF
|
|
TrulyDone:
|
|
|
|
|
|
emms ; Leave MMX state so the floating-point unit is usable again.
|
|
IFDEF H261
|
|
mov eax,SWDTotal ; Read the local before esp is switched back.
|
|
mov esp,StashESP ; Restore the stack pointer saved in StashESP.
|
|
mov edi,[esp+PSWDTotal]
|
|
mov [edi],eax ; *PSWDTotal = SWDTotal
|
|
ELSE
|
|
mov eax,SWDTotal ; Read the locals before esp is switched back.
|
|
mov ebx,BSWDTotal
|
|
mov esp,StashESP ; Restore the stack pointer saved in StashESP.
|
|
mov edi,[esp+PSWDTotal]
|
|
mov esi,[esp+PBSWDTotal]
|
|
mov [edi],eax ; *PSWDTotal = SWDTotal
|
|
mov [esi],ebx ; *PBSWDTotal = BSWDTotal
|
|
ENDIF
|
|
pop ebx
|
|
pop ebp
|
|
pop edi
|
|
pop esi
|
|
rturn
|
|
MMxEDTQ endp
END
|