Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

6016 lines
222 KiB

;////////////////////////////////////////////////////////////////////////////
;//
;// INTEL CORPORATION PROPRIETARY INFORMATION
;//
;// This software is supplied under the terms of a license
;// agreement or nondisclosure agreement with Intel Corporation
;// and may not be copied or disclosed except in accordance
;// with the terms of that agreement.
;//
;////////////////////////////////////////////////////////////////////////////
;//
;// $Header: S:\h26x\src\enc\exmme.asv 1.37 13 Dec 1996 17:19:38 MBODART $
;//
;// $Log: S:\h26x\src\enc\exmme.asv $
;//
;// Rev 1.37 13 Dec 1996 17:19:38 MBODART
;// Tuned the ME parameters for H.261.
;//
;// Rev 1.36 06 Nov 1996 16:18:24 BNICKERS
;// Improve performance.
;//
;// Rev 1.35 30 Oct 1996 17:30:36 BNICKERS
;// Fix UMV table for right edge macroblocks.
;//
;// Rev 1.34 30 Oct 1996 14:49:20 KLILLEVO
;// zero motion vectors for intra blocks in PB-frame mode.
;// This is necessary in the Extended Motion Vector mode
;//
;// Rev 1.33 18 Oct 1996 16:57:16 BNICKERS
;// Fixes for EMV
;//
;// Rev 1.32 15 Oct 1996 17:53:04 BNICKERS
;//
;// Fix major bug w.r.t. EMV ME.
;//
;// Rev 1.31 14 Oct 1996 13:10:14 BNICKERS
;//
;// Correct several problems wrt H261 ME.
;//
;// Rev 1.30 11 Oct 1996 16:53:12 KLILLEVO
;//
;// Fix threshold
;//
;// Rev 1.29 11 Oct 1996 16:52:18 KLILLEVO
;// Another EMV fix.
;//
;// Rev 1.28 11 Oct 1996 15:43:16 KLILLEVO
;// Really fix the handling of the top row of MBs for EMV ME.
;//
;// Rev 1.27 11 Oct 1996 15:24:38 BNICKERS
;// Special handling of top row of MBs for EMV ME.
;//
;// Rev 1.26 11 Oct 1996 14:47:42 KLILLEVO
;// Kill full pel MV for Intra blocks so that EMV of adjacent blocks will work.
;//
;// Rev 1.25 10 Oct 1996 16:42:56 BNICKERS
;// Initial debugging of Extended Motion Vectors.
;//
;// Rev 1.24 04 Oct 1996 08:48:02 BNICKERS
;// Add EMV.
;//
;// Rev 1.23 24 Sep 1996 10:42:24 BNICKERS
;// For H261, zero out motion vectors when classifying MB as intra.
;//
;// Rev 1.22 12 Sep 1996 10:56:24 BNICKERS
;// Add arguments for thresholds and differentials.
;//
;// Rev 1.21 22 Jul 1996 15:23:24 BNICKERS
;// Reduce code size. Implement H261 spatial filter.
;//
;// Rev 1.20 18 Jul 1996 16:54:26 KLILLEVO
;// changed emptythreshold to 40 instead of 128 to remove some blockiness
;// from the still frame mode on MMX
;//
;// Rev 1.19 26 Jun 1996 12:49:02 KLILLEVO
;// Fix minor booboo left in by Brian.
;//
;// Rev 1.18 26 Jun 1996 12:21:50 BNICKERS
;// Make heuristic ME work without unrestricted motion vectors.
;//
;// Rev 1.17 25 Jun 1996 14:24:58 BNICKERS
;// Implement heuristic motion estimation for MMX, AP mode.
;//
;// Rev 1.16 15 May 1996 16:57:14 BNICKERS
;// Fix SWD tabulation (again)! @#$%!%
;//
;// Rev 1.15 15 May 1996 16:53:24 BNICKERS
;//
;// Fix SWD tabulation.
;//
;// Rev 1.14 15 May 1996 11:33:28 BNICKERS
;// Bug fix for calc of total SWD.
;//
;// Rev 1.13 14 May 1996 12:18:58 BNICKERS
;// Initial debugging of MMx B-Frame ME.
;//
;// Rev 1.12 03 May 1996 14:03:50 BNICKERS
;//
;// Minor bug fixes and integration refinements.
;//
;// Rev 1.11 02 May 1996 12:00:32 BNICKERS
;// Initial integration of B Frame ME, MMX version.
;//
;// Rev 1.10 16 Apr 1996 16:40:14 BNICKERS
;// Fix some important but simple bugs. Start adding table inits for B frm ME.
;//
;// Rev 1.9 10 Apr 1996 13:13:44 BNICKERS
;// Recoding of Motion Estimation, Advanced Prediction.
;//
;// Rev 1.8 05 Apr 1996 12:28:10 BNICKERS
;// Improvements to baseline half pel ME.
;//
;// Rev 1.7 26 Mar 1996 12:00:22 BNICKERS
;// Did some tuning for MMx encode.
;//
;// Rev 1.6 20 Mar 1996 17:01:44 KLILLEVO
;// fixed bug in new quant code
;//
;// Rev 1.5 20 Mar 1996 15:26:40 KLILLEVO
;// changed quantization to match IA quantization
;//
;// Rev 1.3 15 Mar 1996 15:51:16 BECHOLS
;// Completed monolithic - Brian
;//
;// Rev 1.0 16 Feb 1996 17:12:12 BNICKERS
;// Initial revision.
;//
;////////////////////////////////////////////////////////////////////////////
;
; MMxMotionEstimation -- This function performs motion estimation for the
; macroblocks identified in the input list. This is
; the MMx version. Conditional assembly selects either
; the H263 or H261 version.
;
; Arguments: See ex5me.asm.
;
; Other assumptions: See ex5me.asm. Most of the read-only tables needed in
; ex5me.asm are not needed here.
;
; Assembler options for this module.
OPTION PROLOGUE:None ; No automatic procedure prologue is generated.
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro ; Procedure epilogue comes from this macro (not defined in this part of the file).
OPTION M510 ; MASM 5.10 compatibility mode.
OPTION CASEMAP:NONE ; Identifiers are case sensitive.
; Motion estimation tuning constants.  Three of them have the same value
; whether or not H261 is defined; the other four are selected at assembly
; time by the H261 symbol.
BLOCKMOTIONTHRESHOLD = 1152
BLOCKMVDIFFERENTIAL = 768
EMPTYTHRESHOLD = 40
IFDEF H261
; H261 build.
ZEROVECTORTHRESHOLD = 600
NONZEROMVDIFFERENTIAL = 256
INTERCODINGTHRESHOLD = 300
INTRACODINGDIFFERENTIAL = 200
ELSE
; Build without H261 defined.
ZEROVECTORTHRESHOLD = 450
NONZEROMVDIFFERENTIAL = 375
INTERCODINGTHRESHOLD = 1152
INTRACODINGDIFFERENTIAL = 1000
ENDIF
include iammx.inc
include e3inst.inc
include e3mbad.inc
.xlist
include memmodel.inc
.list
include exEDTQ.inc
MMXMEDATA SEGMENT PAGE
ALIGN 16
; Storage for Target and Reference frames can interleave into 8K of the 16K
; cache. Pitch must be 384.
;
; C# -- Stands for row number "#" of target macroblock in *C*urrent P frame.
; B# -- Stands for row number "#" of target macroblock in current *B* frame.
; R# -- Stands for row number "#" of 0MV *R*ef macroblock in past frame.
; v -- Stands for a row below 0MV, reference macroblock.
; These same cache lines would hit reference lines >8 above the 0MV.
; ^ -- Stands for a row above 0MV, reference macroblock.
; These same cache lines would hit reference lines >8 below the 0MV.
; +-+-+
; | | -- A cache line (32 bytes). Position of letters,<, and > indicate
; +-+-+ which 16 bytes may be used in the cache line.
;
; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
; |C0 | | v| |Cb | | ^| |B6 | | R6| |
; |C1 | | v| |Cc | | ^| |B7 | | R7| |
; |C2 | | v| |Cd | | ^| |B8 | | R8| |
; |C3 | | v| |Ce | | ^| |B9 | | R9| |
; |C4 | | v| |Cf | | ^| |Ba | | Ra| |
; |C5 | | v| |B0 | | R0| |Bb | | Rb| |
; |C6 | | v| |B1 | | R1| |Bc | | Rc| |
; |C7 | | v| |B2 | | R2| |Bd | | Rd| |
; |C8 | | ^| |B3 | | R3| |Be | | Re| |
; |C9 | | ^| |B4 | | R4| |Bf | | Rf| |
; |Ca | | ^| |B5 | | R5| +-+-+-+-+-+-+-+-+
; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
; The static storage space used for read-only tables, and the stack usage
; are coordinated such that they mesh in the data cache, and use only one
; 4K way of the 4-way, 16K cache.
;
; The first 32 bytes of the static storage space are unallocated, because
; the top of stack ranges in this area. As local procedure calls are made
; within this function, return addresses get pushed into these 32 bytes.
; (32 bytes; 0: 31)
DB 32 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
;
; The next 608 bytes of the static storage space are unallocated, because
; the local stack frame is made to hit cache at these addresses. More of
; the local stack frame is allocated after a gap of 64 bytes (the gap,
; 640:703, is occupied by FullPelMotionVectorAdjustment; the place-holder
; resumes at 704).
; (608 bytes; 32: 639)
LocalStorage LABEL DWORD
DB 608 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
; Motion Estimation State Engine adjustments to reference block address to get
; to next candidate reference block.
; (64 bytes; 640: 703)
;
; Encoding, as can be checked from the values below: every one-byte code
; defined here with EQU carries, in its high nibble, the index of one DWORD
; slot of this table.  Each DWORD has (slot index * 16) and the smallest
; offset of its group already subtracted, so that adding a code to the DWORD
; in its slot gives the address displacement the code is named for, e.g.
;   VM4HP4 = 03CH; slot 3 holds -4*PITCH-8-030H; sum = -4*PITCH+4.
; NOTE(review): presumably the state engine performs exactly this look-up and
; add; the code that does so is not in this part of the file.
;
; Naming of the codes: V = vertical (units of PITCH), H = horizontal (bytes),
; M = minus, P = plus, digit = distance, G = 16.  NOADJ is a displacement of 0.
FullPelMotionVectorAdjustment LABEL DWORD
DD -16*PITCH-8 ; Slot 0: 16 lines up.
VMG EQU 000H+0+8
VMGHM8 EQU 000H-8+8
DD -8*PITCH-8-010H ; Slot 1: 8 lines up, 8 left.
VM8HM8 EQU 010H
DD -8*PITCH-020H ; Slot 2: 8 lines up, 0 or 8 right.
VM8 EQU 020H
VM8HP8 EQU 020H+8
DD -4*PITCH-8-030H ; Slot 3: 4 lines up, 8 left to 4 right.
VM4HM8 EQU 030H-8+8
VM4HM4 EQU 030H-4+8
VM4 EQU 030H+0+8
VM4HP4 EQU 030H+4+8
DD -4*PITCH+8-040H ; Slot 4: 4 lines up, 8 or 16 right.
VM4HP8 EQU 040H+8-8
VM4HPG EQU 040H+16-8
DD -2*PITCH-4-050H ; Slot 5: 2 lines up, 4 left to 8 right.
VM2HM4 EQU 050H-4+4
VM2HM2 EQU 050H-2+4
VM2HM1 EQU 050H-1+4
VM2 EQU 050H+0+4
VM2HP1 EQU 050H+1+4
VM2HP2 EQU 050H+2+4
VM2HP4 EQU 050H+4+4
VM2HP8 EQU 050H+8+4
DD -1*PITCH-2-060H ; Slot 6: 1 line up, 2 left to 4 right.
VM1HM2 EQU 060H-2+2
VM1HM1 EQU 060H-1+2
VM1 EQU 060H+0+2
VM1HP1 EQU 060H+1+2
VM1HP2 EQU 060H+2+2
VM1HP4 EQU 060H+4+2
DD -16-070H ; Slot 7: same line, 16 to 1 left.
HMG EQU 070H-16+16
HM8 EQU 070H-8+16
HM4 EQU 070H-4+16
HM3 EQU 070H-3+16
HM2 EQU 070H-2+16
HM1 EQU 070H-1+16
DD -080H ; Slot 8: same line, 0 to 8 right.
NOADJ EQU 080H
HP1 EQU 080H+1
HP2 EQU 080H+2
HP4 EQU 080H+4
HP8 EQU 080H+8
DD 1*PITCH-2-090H ; Slot 9: 1 line down, 2 left to 4 right.
VP1HM2 EQU 090H-2+2
VP1HM1 EQU 090H-1+2
VP1 EQU 090H+0+2
VP1HP1 EQU 090H+1+2
VP1HP2 EQU 090H+2+2
VP1HP4 EQU 090H+4+2
DD 2*PITCH-4-0A0H ; Slot A: 2 lines down, 4 left to 8 right.
VP2HM4 EQU 0A0H-4+4
VP2HM2 EQU 0A0H-2+4
VP2HM1 EQU 0A0H-1+4
VP2 EQU 0A0H+0+4
VP2HP1 EQU 0A0H+1+4
VP2HP2 EQU 0A0H+2+4
VP2HP4 EQU 0A0H+4+4
VP2HP8 EQU 0A0H+8+4
DD 4*PITCH-8-0B0H ; Slot B: 4 lines down, 8 left to 4 right.
VP4HM8 EQU 0B0H-8+8
VP4HM4 EQU 0B0H-4+8
VP4HM2 EQU 0B0H-2+8
VP4 EQU 0B0H+0+8
VP4HP2 EQU 0B0H+2+8
VP4HP4 EQU 0B0H+4+8
DD 4*PITCH+8-0C0H ; Slot C: 4 lines down, 8 or 16 right.
VP4HP8 EQU 0C0H+8-8
VP4HPG EQU 0C0H+16-8
DD 8*PITCH-8-0D0H ; Slot D: 8 lines down, 8 or 4 left.
VP8HM8 EQU 0D0H-8+8
VP8HM4 EQU 0D0H-4+8
DD 8*PITCH-0E0H ; Slot E: 8 lines down, 0 to 8 right.
VP8 EQU 0E0H+0
VP8HP4 EQU 0E0H+4
VP8HP8 EQU 0E0H+8
DD 16*PITCH-0F0H ; Slot F: 16 lines down, 0 or 8 right.
VPG EQU 0F0H+0
VPGHP8 EQU 0F0H+8
; Additional space reserved for stack variables. If more space is needed,
; it should go here.
; (160 bytes; 704: 863)
; These bytes follow the 64-byte adjustment table at 640:703; they are the
; part of the stack-frame place-holder that resumes after that gap.
DB 160 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
; QWORD Constants used by motion estimation, frame differencing, and FDCT.
; (144 bytes; 864:1007)
; Each constant is stored as two DWORDs, low DWORD first.  For the long names,
; the 16 hex digits after the "C" spell the resulting 64-bit value, most
; significant digit first.
C0101010101010101 DD 001010101H, 001010101H
CFFFF0000FFFF0000 DD 0FFFF0000H, 0FFFF0000H
C0200010101010101 DD 001010101H, 002000101H
C0001000200020001 DD 000020001H, 000010002H
CFFFF00000000FFFF DD 00000FFFFH, 0FFFF0000H
C0000FFFFFFFF0000 DD 0FFFF0000H, 00000FFFFH
CFF000000000000FF DD 0000000FFH, 0FF000000H
C0101010101010002 DD 001010002H, 001010101H
C0100010001000100 DD 001000100H, 001000100H
C0001000100010001 DD 000010001H, 000010001H
C7F7F7F7F7F7F7F7F DD 07F7F7F7FH, 07F7F7F7FH
; C1..C7 each hold one 16-bit value replicated in all four words.  The values
; agree, to within one unit, with cos(k*pi/16) scaled by 2^15 for k = 1..7.
; NOTE(review): presumably these are the FDCT coefficients; confirm against
; the FDCT code, which is not in this part of the file.
C1 DD 07D8A7D8AH, 07D8A7D8AH
C2 DD 076417641H, 076417641H
C3 DD 06A6D6A6DH, 06A6D6A6DH
C4 DD 05A825A82H, 05A825A82H
C5 DD 0471D471DH, 0471D471DH
C6 DD 030FC30FCH, 030FC30FCH
C7 DD 018F818F8H, 018F818F8H
; Distances to Block Action Descriptors for blocks that provide remote vectors
; for OBMC. Which element accessed depends on edge condition. Top edge is
; stack based variable, since different instances may have different distances
; to BAD of block above. Bottom edge is always a constant, regardless of
; edge condition. This is used in OBMC frame differencing.
; (16 bytes; 1008:1023)
; Each table has two DWORD elements: element 0 is zero; element 1 is the size
; of a macroblock action descriptor less the size of one block descriptor,
; negative for BlockToLeft and positive for BlockToRight.
BlockToLeft DD 0, -SIZEOF T_MacroBlockActionDescr+SIZEOF T_Blk
BlockToRight DD 0, SIZEOF T_MacroBlockActionDescr-SIZEOF T_Blk
; Look-up from linearized motion vector to its vertical component, used by
; motion estimation: shift the linearized motion vector right by 8 bits and
; index this array with the result to obtain the vertical MV.
; The label sits in the middle of the table: 49 bytes precede it (negative
; indices) and 47 bytes start at it.
; (96 bytes; 1024:1119)
IF PITCH-384
*** error: The magic of this table assumes a pitch of 384.
ENDIF
; Negative indices, three entries per line.
DB -64, -64, -62
DB -60, -60, -58
DB -56, -56, -54
DB -52, -52, -50
DB -48, -48, -46
DB -44, -44, -42
DB -40, -40, -38
DB -36, -36, -34
DB -32, -32, -30
DB -28, -28, -26
DB -24, -24, -22
DB -20, -20, -18
DB -16, -16, -14
DB -12, -12, -10
DB -8, -8, -6
DB -4, -4, -2
DB 0 ; Index -1.
UnlinearizedVertMV DB 0 ; Index 0.
DB 2 ; Index 1.
; Indices 2 and up, three entries per line.
DB 4, 4, 6
DB 8, 8, 10
DB 12, 12, 14
DB 16, 16, 18
DB 20, 20, 22
DB 24, 24, 26
DB 28, 28, 30
DB 32, 32, 34
DB 36, 36, 38
DB 40, 40, 42
DB 44, 44, 46
DB 48, 48, 50
DB 52, 52, 54
DB 56, 56, 58
DB 60, 60, 62
; Pel_Rnd: 128 QWORD entries.  Entry N has a rounding term of 1 in every byte,
; with the index N added into the low byte.  Used in frame differencing, when
; half pel horizontal interpolation is needed.
; (1024 bytes; 1120:2143)
Pel_Rnd LABEL DWORD
CNT = 0
REPEAT 128
DD 001010101H+CNT ; Low DWORD: rounding bytes plus index.
DD 001010101H ; High DWORD: rounding bytes only.
CNT = CNT + 1
ENDM
; Motion Estimation State Engine Rules.
; (896 bytes;2144:3039)
; Layout of the 896 bytes: two 10-byte tables of starting values indexed by
; macroblock position class (0..9), 2 skipped bytes, then one 4-byte rule per
; state for states 10 and up, and finally 6 unused bytes.
StateEngineFirstRule LABEL BYTE ; Rules that govern state engine of estimator.
; StateEngine is biased by -18 so that the 4-byte rule for state N is found at
; StateEngine+4*N (state 10 is at StateEngineFirstRule+22).
StateEngine EQU StateEngineFirstRule-20+2
; Starting States:
IF PITCH-384
*** error: The magic of this table assumes a pitch of 384.
ENDIF
; First table, one byte per position class.
; NOTE(review): the meaning of the values 3 and 0 is not evident from this
; part of the file.
DB ? ; 0: not used.
DB 3 ; 1: Upper left corner.
DB 3 ; 2: Upper edge.
DB 3 ; 3: Upper right corner.
DB 3 ; 4: Left edge.
DB 3 ; 5: Interior MB, not doing block search.
DB 0 ; 6: Right edge.
DB 0 ; 7: Lower left corner.
DB 0 ; 8: Lower edge.
DB 0 ; 9: Lower right corner.
; Second table, one byte per position class: number of the first rule to use.
; Each value is the first state of the matching section of rules below.
DB ? ; 0: not used.
DB 34 ; 1: Upper left corner.
DB 66 ; 2: Upper edge.
DB 42 ; 3: Upper right corner.
DB 98 ; 4: Left edge.
DB 16 ; 5: Interior MB, not doing block search.
DB 114 ; 6: Right edge.
DB 50 ; 7: Lower left corner.
DB 82 ; 8: Lower edge.
DB 58 ; 9: Lower right corner.
DB ?,? ; Skip 2 bytes.
LASTINITIALMESTATE EQU 9
; Interior Telescoping States:
; Try +/- 8,4,2,1, vertically first, then horizontally.
;
; Rule format (4 bytes per state): bytes 0 and 1 are address adjustment codes,
; bytes 2 and 3 are next-state numbers; 0FFH as next state ends the search.
; Reading the table, byte 0 and byte 2 belong to the outcome in which the
; candidate just tried did not beat the best so far (byte 0 then includes the
; step back to the best position), byte 1 and byte 3 to the outcome in which
; it did.
; NOTE(review): this is inferred from the table contents; the code that
; interprets the rules is not in this part of the file.
FIRSTBLOCKMESTATE EQU 10
DB VM2, VM2, 12, 11 ; 10: V+1 better/worse than central. Try V-1.
DB VP2HP1, HP1, 13, 13 ; 11: Accept V+1/V-1 as best. Try H+1.
DB VP1HP1, HP1, 13, 13 ; 12: Accept central/V-1 as best. Try H+1.
DB HM2, HM2, 15, 14 ; 13: H+1 better/worse than central. Try H-1.
DB HP2, NOADJ, 0FFH, 0FFH ; 14: Accept H+1/H-1 as best. Done.
DB HP1, NOADJ, 0FFH, 0FFH ; 15: Accept central/H-1 as best. Done.
DB VMG, VMG, 18, 17 ; 16: V+8 better/worse than central. Try V-8.
DB VPGHP8, HP8, 19, 19 ; 17: Accept V+8/V-8 as best. Try H+8.
DB VP8HP8, HP8, 19, 19 ; 18: Accept central/V-8 as best. Try H+8.
DB HMG, HMG, 21, 20 ; 19: H+8 better/worse than central. Try H-8.
DB VP4HPG, VP4, 22, 22 ; 20: Accept H+8/H-8 as best. Try V+4.
DB VP4HP8, VP4, 22, 22 ; 21: Accept central/H-8 as best. Try V+4.
DB VM8, VM8, 24, 23 ; 22: V+4 better/worse than central. Try V-4.
DB VP8HP4, HP4, 25, 25 ; 23: Accept V+4/V-4 as best. Try H+4.
DB VP4HP4, HP4, 25, 25 ; 24: Accept central/V-4 as best. Try H+4.
DB HM8, HM8, 27, 26 ; 25: H+4 better/worse than central. Try H-4.
DB VP2HP8, VP2, 28, 28 ; 26: Accept H+4/H-4 as best. Try V+2.
DB VP2HP4, VP2, 28, 28 ; 27: Accept central/H-4 as best. Try V+2.
DB VM4, VM4, 30, 29 ; 28: V+2 better/worse than central. Try V-2.
DB VP4HP2, HP2, 31, 31 ; 29: Accept V+2/V-2 as best. Try H+2.
DB VP2HP2, HP2, 31, 31 ; 30: Accept central/V-2 as best. Try H+2.
DB HM4, HM4, 33, 32 ; 31: H+2 better/worse than central. Try H-2.
DB VP1HP4, VP1, 10, 10 ; 32: Accept H+2/H-2 as best. Try V+1.
DB VP1HP2, VP1, 10, 10 ; 33: Accept central/H-2 as best. Try V+1.
; Boundary States:
; Corner rules (states 34..65), 4 bytes each: two address adjustment codes
; followed by two next-state numbers.  Next states outside 34..65 hand over to
; the edge rules further down the table.  The last rule of each corner carries
; values 0F5H..0FBH in its next-state bytes.
; NOTE(review): those values presumably end the search and tell the caller
; which edges limit the result; the code reading them is not in this part of
; the file.
; Upper left corner:
DB VM8HP8, HP8, 35, 101 ; 34: Accept corner/V+8. Try H+8.
DB VP4HM8, VP4, 36, 70 ; 35: Accept corner/H+8. Try V+4.
DB VM4HP4, HP4, 37, 105 ; 36: Accept corner/V+4. Try H+4.
DB VP2HM4, VP2, 38, 74 ; 37: Accept corner/H+4. Try V+2.
DB VM2HP2, HP2, 39, 109 ; 38: Accept corner/V+2. Try H+2.
DB VP1HM2, VP1, 40, 78 ; 39: Accept corner/H+2. Try V+1.
DB VM1HP1, HP1, 41, 113 ; 40: Accept corner/V+1. Try H+1.
DB HM1, NOADJ, 0F5H, 0F7H ; 41: Accept corner/H+1. Done.
; Upper right corner:
DB VM8HM8, HM8, 43, 117 ; 42: Accept corner/V+8. Try H-8.
DB VP4HP8, VP4, 44, 70 ; 43: Accept corner/H-8. Try V+4.
DB VM4HM4, HM4, 45, 121 ; 44: Accept corner/V+4. Try H-4.
DB VP2HP4, VP2, 46, 74 ; 45: Accept corner/H-4. Try V+2.
DB VM2HM2, HM2, 47, 125 ; 46: Accept corner/V+2. Try H-2.
DB VP1HP2, VP1, 48, 78 ; 47: Accept corner/H-2. Try V+1.
DB VM1HM1, HM1, 49, 129 ; 48: Accept corner/V+1. Try H-1.
DB HP1, NOADJ, 0F6H, 0F7H ; 49: Accept corner/H-1. Done
; Lower left corner:
DB VP8HP8, HP8, 51, 101 ; 50: Accept corner/V-8. Try H+8.
DB VM4HM8, VM4, 52, 86 ; 51: Accept corner/H+8. Try V-4.
DB VP4HP4, HP4, 53, 105 ; 52: Accept corner/V-4. Try H+4.
DB VM2HM4, VM2, 54, 90 ; 53: Accept corner/H+4. Try V-2.
DB VP2HP2, HP2, 55, 109 ; 54: Accept corner/V-2. Try H+2.
DB VM1HM2, VM1, 56, 94 ; 55: Accept corner/H+2. Try V-1.
DB VP1HP1, HP1, 57, 113 ; 56: Accept corner/V-1. Try H+1.
DB HM1, NOADJ, 0F9H, 0FBH ; 57: Accept corner/H+1. Done.
; Lower right corner:
DB VP8HM8, HM8, 59, 117 ; 58: Accept corner/V-8. Try H-8.
DB VM4HP8, VM4, 60, 86 ; 59: Accept corner/H-8. Try V-4.
DB VP4HM4, HM4, 61, 121 ; 60: Accept corner/V-4. Try H-4.
DB VM2HP4, VM2, 62, 90 ; 61: Accept corner/H-4. Try V-2.
DB VP2HM2, HM2, 63, 125 ; 62: Accept corner/V-2. Try H-2.
DB VM1HP2, VM1, 64, 94 ; 63: Accept corner/H-2. Try V-1.
DB VP1HM1, HM1, 65, 129 ; 64: Accept corner/V-1. Try H-1.
DB HP1, NOADJ, 0FAH, 0FBH ; 65: Accept corner/H-1. Done.
; Upper and lower edge rules (states 66..97), 4 bytes each: two address
; adjustment codes followed by two next-state numbers.  Next states 13, 19,
; 25 and 31 hand over to the interior telescoping rules.  The final rules of
; each edge carry 0F7H (upper) or 0FBH (lower) in their next-state bytes.
; NOTE(review): those values presumably end the search and identify the
; limiting edge; the code reading them is not in this part of the file.
; Upper edge:
DB VM8HP8, HP8, 67, 19 ; 66: Accept central/V+8 as best. Try H+8.
DB HMG, HMG, 69, 68 ; 67: H+8 worse/better than central. Try H-8.
DB VP4HPG, VP4, 70, 70 ; 68: Accept H+8/H-8 as best. Try V+4.
DB VP4HP8, VP4, 70, 70 ; 69: Accept central/H-8 as best. Try V+4.
DB VM4HP4, HP4, 71, 25 ; 70: Accept central/V+4 as best. Try H+4.
DB HM8, HM8, 73, 72 ; 71: H+4 worse/better than central. Try H-4.
DB VP2HP8, VP2, 74, 74 ; 72: Accept H+4/H-4 as best. Try V+2.
DB VP2HP4, VP2, 74, 74 ; 73: Accept central/H-4 as best. Try V+2.
DB VM2HP2, HP2, 75, 31 ; 74: Accept central/V+2 as best. Try H+2.
DB HM4, HM4, 77, 76 ; 75: H+2 worse/better than central. Try H-2.
DB VP1HP4, VP1, 78, 78 ; 76: Accept H+2/H-2 as best. Try V+1.
DB VP1HP2, VP1, 78, 78 ; 77: Accept central/H-2 as best. Try V+1.
DB VM1HP1, HP1, 79, 13 ; 78: Accept central/V+1 as best. Try H+1.
DB HM2, HM2, 81, 80 ; 79: H+1 worse/better than central. Try H-1.
DB HP2, NOADJ, 0F7H, 0F7H ; 80: Accept H+1/H-1 as best. Done.
DB HP1, NOADJ, 0F7H, 0F7H ; 81: Accept central/H-1 as best. Done.
; Lower edge:
DB VP8HP8, HP8, 83, 19 ; 82: Accept central/V-8 as best. Try H+8.
DB HMG, HMG, 85, 84 ; 83: H+8 worse/better than central. Try H-8.
DB VM4HPG, VM4, 86, 86 ; 84: Accept H+8/H-8 as best. Try V-4.
DB VM4HP8, VM4, 86, 86 ; 85: Accept central/H-8 as best. Try V-4.
DB VP4HP4, HP4, 87, 25 ; 86: Accept central/V-4 as best. Try H+4.
DB HM8, HM8, 89, 88 ; 87: H+4 worse/better than central. Try H-4.
DB VM2HP8, VM2, 90, 90 ; 88: Accept H+4/H-4 as best. Try V-2.
DB VM2HP4, VM2, 90, 90 ; 89: Accept central/H-4 as best. Try V-2.
DB VP2HP2, HP2, 91, 31 ; 90: Accept central/V-2 as best. Try H+2.
DB HM4, HM4, 93, 92 ; 91: H+2 worse/better than central. Try H-2.
DB VM1HP4, VM1, 94, 94 ; 92: Accept H+2/H-2 as best. Try V-1.
DB VM1HP2, VM1, 94, 94 ; 93: Accept central/H-2 as best. Try V-1.
DB VP1HP1, HP1, 95, 13 ; 94: Accept central/V-1 as best. Try H+1.
DB HM2, HM2, 97, 96 ; 95: H+1 worse/better than central. Try H-1.
DB HP2, NOADJ, 0FBH, 0FBH ; 96: Accept H+1/H-1 as best. Done.
DB HP1, NOADJ, 0FBH, 0FBH ; 97: Accept central/H-1 as best. Done.
; Left and right edge rules (states 98..129), 4 bytes each: two address
; adjustment codes followed by two next-state numbers.  Next states 10, 22
; and 28 hand over to the interior telescoping rules.  The final rule of each
; edge carries 0FDH (left) or 0FEH (right) in its next-state bytes.
; NOTE(review): those values presumably end the search and identify the
; limiting edge; the code reading them is not in this part of the file.
; Left edge:
DB VMG, VMG, 100, 99 ; 98: V+8 worse/better than central. Try V-8.
DB VPGHP8, HP8, 101, 101 ; 99: Accept V+8/V-8 as best. Try H+8.
DB VP8HP8, HP8, 101, 101 ; 100: Accept central/V-8 as best. Try H+8.
DB VP4HM8, VP4, 102, 22 ; 101: Accept central/H+8 as best. Try V+4.
DB VM8, VM8, 104, 103 ; 102: V+4 worse/better than central. Try V-4.
DB VP8HP4, HP4, 105, 105 ; 103: Accept V+4/V-4 as best. Try H+4.
DB VP4HP4, HP4, 105, 105 ; 104: Accept central/V-4 as best. Try H+4.
DB VP2HM4, VP2, 106, 28 ; 105: Accept central/H+4 as best. Try V+2.
DB VM4, VM4, 108, 107 ; 106: V+2 worse/better than central. Try V-2.
DB VP4HP2, HP2, 109, 109 ; 107: Accept V+2/V-2 as best. Try H+2.
DB VP2HP2, HP2, 109, 109 ; 108: Accept central/V-2 as best. Try H+2.
DB VP1HM2, VP1, 110, 10 ; 109: Accept central/H+2 as best. Try V+1.
DB VM2, VM2, 112, 111 ; 110: V+1 worse/better than central. Try V-1.
DB VP2HP1, HP1, 113, 113 ; 111: Accept V+1/V-1 as best. Try H+1.
DB VP1HP1, HP1, 113, 113 ; 112: Accept central/V-1 as best. Try H+1.
DB HM1, NOADJ, 0FDH, 0FDH ; 113: Accept central/H+1 as best. Done.
; Right edge:
; (Horizontal candidates here lie to the left, H-n: the preceding rules move
; by HM8/HM4/HM2/HM1 and the rules below step back with HP8/HP4/HP2/HP1.)
DB VPG, VPG, 116, 115 ; 114: V-8 worse/better than central. Try V+8.
DB VMGHM8, HM8, 117, 117 ; 115: Accept V-8/V+8 as best. Try H-8.
DB VM8HM8, HM8, 117, 117 ; 116: Accept central/V+8 as best. Try H-8.
DB VP4HP8, VP4, 118, 22 ; 117: Accept central/H-8 as best. Try V+4.
DB VM8, VM8, 120, 119 ; 118: V+4 worse/better than central. Try V-4.
DB VP8HM4, HM4, 121, 121 ; 119: Accept V+4/V-4 as best. Try H-4.
DB VP4HM4, HM4, 121, 121 ; 120: Accept central/V-4 as best. Try H-4.
DB VP2HP4, VP2, 122, 28 ; 121: Accept central/H-4 as best. Try V+2.
DB VM4, VM4, 124, 123 ; 122: V+2 worse/better than central. Try V-2.
DB VP4HM2, HM2, 125, 125 ; 123: Accept V+2/V-2 as best. Try H-2.
DB VP2HM2, HM2, 125, 125 ; 124: Accept central/V-2 as best. Try H-2.
DB VP1HP2, VP1, 126, 10 ; 125: Accept central/H-2 as best. Try V+1.
DB VM2, VM2, 128, 127 ; 126: V+1 worse/better than central. Try V-1.
DB VP2HM1, HM1, 129, 129 ; 127: Accept V+1/V-1 as best. Try H-1.
DB VP1HM1, HM1, 129, 129 ; 128: Accept central/V-1 as best. Try H-1.
DB HP1, NOADJ, 0FEH, 0FEH ; 129: Accept central/H-1 as best. Done.
; Exhaustive search, radius 1 here, reaching out to radius 2 further below.
; . . . . .
; . 2 5 3 . C = center.
; . 7 C 8 .
; . 4 6 1 . # = order to try additional candidates.
; . . . . .
;
; Rule format (4 bytes per state): adjustment code and next state for the
; outcome named first in the comment are bytes 0 and 2; those for the outcome
; named second are bytes 1 and 3.  In the "worse" outcome of a "Take best, go
; on" rule, byte 0 moves from #8 to candidate #1 of the radius 2 group that
; surrounds the best position, and byte 2 is the first state of that group:
;   best #1 -> 199, #2 -> 208, #3 -> 217, #4 -> 190,
;   best #5 -> 184, #6 -> 178, #7 -> 172, #8 (or #8 better) -> 166.
; State 164 (best is #6, directly below the center) therefore continues at
; 178; VP2HM1 puts the reference address two lines below the center, which is
; candidate #1 of that group.
; NOTE(review): rule interpretation is inferred from the table contents; the
; code that reads the rules is not in this part of the file.
FIRST_HEURISTIC_EXHAUSTIVE = 130
DB VM2HM2, VM2HM2, 131, 138 ; 130: #1 worse/better than C. Try #2.
DB HP2, HP2, 132, 145 ; 131: #2 worse/better than C. Try #3.
DB VP2HM2, VP2HM2, 133, 151 ; 132: #3 worse/better than C. Try #4.
DB VM2HP1, VM2HP1, 134, 156 ; 133: #4 worse/better than C. Try #5.
DB VP2, VP2, 135, 160 ; 134: #5 worse/better than C. Try #6.
DB VM1HM1, VM1HM1, 136, 163 ; 135: #6 worse/better than C. Try #7.
DB HP2, HP2, 137, 165 ; 136: #7 worse/better than C. Try #8.
DB HM1, HP1, 0FFH, 166 ; 137: If C best, quit. If 8 best, keep going.
DB HP2, HP2, 139, 145 ; 138: #2 worse/better than #1. Try #3.
DB VP2HM2, VP2HM2, 140, 151 ; 139: #3 worse/better than #1. Try #4.
DB VM2HP1, VM2HP1, 141, 156 ; 140: #4 worse/better than #1. Try #5.
DB VP2, VP2, 142, 160 ; 141: #5 worse/better than #1. Try #6.
DB VM1HM1, VM1HM1, 143, 163 ; 142: #6 worse/better than #1. Try #7.
DB HP2, HP2, 144, 165 ; 143: #7 worse/better than #1. Try #8.
DB HP1, HP1, 199, 166 ; 144: #8 worse/better than #1. Take best, go on.
DB VP2HM2, VP2HM2, 146, 151 ; 145: #3 worse/better than #2. Try #4.
DB VM2HP1, VM2HP1, 147, 156 ; 146: #4 worse/better than #2. Try #5.
DB VP2, VP2, 148, 160 ; 147: #5 worse/better than #2. Try #6.
DB VM1HM1, VM1HM1, 149, 163 ; 148: #6 worse/better than #2. Try #7.
DB HP2, HP2, 150, 165 ; 149: #7 worse/better than #2. Try #8.
DB HM3, HP1, 208, 166 ; 150: #8 worse/better than #2. Take best, go on.
DB VM2HP1, VM2HP1, 152, 156 ; 151: #4 worse/better than #3. Try #5.
DB VP2, VP2, 153, 160 ; 152: #5 worse/better than #3. Try #6.
DB VM1HM1, VM1HM1, 154, 163 ; 153: #6 worse/better than #3. Try #7.
DB HP2, HP2, 155, 165 ; 154: #7 worse/better than #3. Try #8.
DB HP1, HP1, 217, 166 ; 155: #8 worse/better than #3. Take best, go on.
DB VP2, VP2, 157, 160 ; 156: #5 worse/better than #4. Try #6.
DB VM1HM1, VM1HM1, 158, 163 ; 157: #6 worse/better than #4. Try #7.
DB HP2, HP2, 159, 165 ; 158: #7 worse/better than #4. Try #8.
DB HM3, HP1, 190, 166 ; 159: #8 worse/better than #4. Take best, go on.
DB VM1HM1, VM1HM1, 161, 163 ; 160: #6 worse/better than #5. Try #7.
DB HP2, HP2, 162, 165 ; 161: #7 worse/better than #5. Try #8.
DB VM2HM1, HP1, 184, 166 ; 162: #8 worse/better than #5. Take best, go on.
DB HP2, HP2, 164, 165 ; 163: #7 worse/better than #6. Try #8.
DB VP2HM1, HP1, 178, 166 ; 164: #8 worse/better than #6. Take best, go on.
DB HM3, HP1, 172, 166 ; 165: #8 worse/better than #7. Take best, go on.
; Radius 2 continuation of the exhaustive search (states 166..225), one group
; of rules per position of the best radius 1 candidate "X".  Rules are 4 bytes
; each: two address adjustment codes followed by two next-state numbers; 0FFH
; as next state ends the search.  Reading the table, byte 0 / byte 2 apply
; when the candidate just tried did not beat the best so far, byte 1 / byte 3
; when it did; in a "Take best, quit" rule byte 0 steps back to the best
; position and byte 1 is NOADJ.
; NOTE(review): inferred from the table contents; the code that interprets
; the rules is not in this part of the file.
;
; . . . . . C = center.
; . ~ ~ ~ 2 ~ = tried, but not as good.
; . ~ C X 1 X = best so far.
; . ~ ~ ~ 3 # = order to try additional candidates.
; . . . . .
DB VM1, VM1, 167, 169 ; 166: #1 better/worse than X. Try #2.
DB VP2, VP2, 168, 171 ; 167: #2 better/worse than X. Try #3.
DB VM1HM1, NOADJ, 0FFH,0FFH ; 168: #3 better/worse than X. Take best, quit.
DB VP2, VP2, 170, 171 ; 169: #2 better/worse than #1. Try #3.
DB VM1, NOADJ, 0FFH,0FFH ; 170: #3 better/worse than #1. Take best, quit.
DB VM2, NOADJ, 0FFH,0FFH ; 171: #3 better/worse than #2. Take best, quit.
; . . . . . C = center.
; 2 ~ ~ ~ . ~ = tried, but not as good.
; 1 X C ~ . X = best so far.
; 3 ~ ~ ~ . # = order to try additional candidates.
; . . . . .
DB VM1, VM1, 173, 175 ; 172: #1 better/worse than X. Try #2.
DB VP2, VP2, 174, 177 ; 173: #2 better/worse than X. Try #3.
DB VM1HP1, NOADJ, 0FFH,0FFH ; 174: #3 better/worse than X. Take best, quit.
DB VP2, VP2, 176, 177 ; 175: #2 better/worse than #1. Try #3.
DB VM1, NOADJ, 0FFH,0FFH ; 176: #3 better/worse than #1. Take best, quit.
DB VM2, NOADJ, 0FFH,0FFH ; 177: #3 better/worse than #2. Take best, quit.
; . . . . . C = center.
; . ~ ~ ~ . ~ = tried, but not as good.
; . ~ C ~ . X = best so far.
; . ~ X ~ . # = order to try additional candidates.
; . 2 1 3 .
DB HM1, HM1, 179, 181 ; 178: #1 better/worse than X. Try #2.
DB HP2, HP2, 180, 183 ; 179: #2 better/worse than X. Try #3.
DB VM1HM1, NOADJ, 0FFH,0FFH ; 180: #3 better/worse than X. Take best, quit.
DB HP2, HP2, 182, 183 ; 181: #2 better/worse than #1. Try #3.
DB HM1, NOADJ, 0FFH,0FFH ; 182: #3 better/worse than #1. Take best, quit.
DB HM2, NOADJ, 0FFH,0FFH ; 183: #3 better/worse than #2. Take best, quit.
; . 2 1 3 . C = center.
; . ~ X ~ . ~ = tried, but not as good.
; . ~ C ~ . X = best so far.
; . ~ ~ ~ . # = order to try additional candidates.
; . . . . .
DB HM1, HM1, 185, 187 ; 184: #1 better/worse than X. Try #2.
DB HP2, HP2, 186, 189 ; 185: #2 better/worse than X. Try #3.
DB VP1HM1, NOADJ, 0FFH,0FFH ; 186: #3 better/worse than X. Take best, quit.
DB HP2, HP2, 188, 189 ; 187: #2 better/worse than #1. Try #3.
DB HM1, NOADJ, 0FFH,0FFH ; 188: #3 better/worse than #1. Take best, quit.
DB HM2, NOADJ, 0FFH,0FFH ; 189: #3 better/worse than #2. Take best, quit.
; The four diagonal groups below share the tails of the groups above: their
; next states 178..183 and 184..189 are rules of the below-center and
; above-center groups.
; . . . . . C = center.
; . ~ ~ ~ . ~ = tried, but not as good.
; 1 ~ C ~ . X = best so far.
; 2 X ~ ~ . # = order to try additional candidates.
; 4 3 5 . .
DB VP1, VP1, 191, 195 ; 190: #1 better/worse than X. Try #2.
DB VP1HP1, VP1HP1, 178, 192 ; 191: #2 better/worse than X. Try #3.
DB HM1, HM1, 193, 181 ; 192: #3 better/worse than #2. Try #4.
DB HP2, HP2, 194, 183 ; 193: #4 better/worse than #2. Try #5.
DB VM1HM2, NOADJ, 0FFH,0FFH ; 194: #5 better/worse than #2. Take best, quit.
DB VP1HP1, VP1HP1, 196, 192 ; 195: #2 better/worse than #1. Try #3.
DB HM1, HM1, 197, 181 ; 196: #3 better/worse than #1. Try #4.
DB HP2, HP2, 198, 183 ; 197: #4 better/worse than #1. Try #5.
DB VM2HM2, NOADJ, 0FFH,0FFH ; 198: #5 better/worse than #1. Take best, quit.
; . . . . . C = center.
; . ~ ~ ~ . ~ = tried, but not as good.
; . ~ C ~ 1 X = best so far.
; . ~ ~ X 2 # = order to try additional candidates.
; . . 4 3 5
DB VP1, VP1, 200, 204 ; 199: #1 better/worse than X. Try #2.
DB VP1HM1, VP1HM1, 178, 201 ; 200: #2 better/worse than X. Try #3.
DB HM1, HM1, 202, 181 ; 201: #3 better/worse than #2. Try #4.
DB HP2, HP2, 203, 183 ; 202: #4 better/worse than #2. Try #5.
DB VM1, NOADJ, 0FFH,0FFH ; 203: #5 better/worse than #2. Take best, quit.
DB VP1HM1, VP1HM1, 205, 201 ; 204: #2 better/worse than #1. Try #3.
DB HM1, HM1, 206, 181 ; 205: #3 better/worse than #1. Try #4.
DB HP2, HP2, 207, 183 ; 206: #4 better/worse than #1. Try #5.
DB VM2, NOADJ, 0FFH,0FFH ; 207: #5 better/worse than #1. Take best, quit.
; 4 3 5 . . C = center.
; 2 X ~ ~ . ~ = tried, but not as good.
; 1 ~ C ~ . X = best so far.
; . ~ ~ ~ . # = order to try additional candidates.
; . . . . .
DB VM1, VM1, 209, 213 ; 208: #1 better/worse than X. Try #2.
DB VM1HP1, VM1HP1, 184, 210 ; 209: #2 better/worse than X. Try #3.
DB HM1, HM1, 211, 187 ; 210: #3 better/worse than #2. Try #4.
DB HP2, HP2, 212, 189 ; 211: #4 better/worse than #2. Try #5.
DB VP1HM2, NOADJ, 0FFH,0FFH ; 212: #5 better/worse than #2. Take best, quit.
DB VM1HP1, VM1HP1, 214, 210 ; 213: #2 better/worse than #1. Try #3.
DB HM1, HM1, 215, 187 ; 214: #3 better/worse than #1. Try #4.
DB HP2, HP2, 216, 189 ; 215: #4 better/worse than #1. Try #5.
DB VP2HM2, NOADJ, 0FFH,0FFH ; 216: #5 better/worse than #1. Take best, quit.
; . . 4 3 5 C = center.
; . ~ ~ X 2 ~ = tried, but not as good.
; . ~ C ~ 1 X = best so far.
; . ~ ~ ~ . # = order to try additional candidates.
; . . . . .
DB VM1, VM1, 218, 222 ; 217: #1 better/worse than X. Try #2.
DB VM1HM1, VM1HM1, 184, 219 ; 218: #2 better/worse than X. Try #3.
DB HM1, HM1, 220, 187 ; 219: #3 better/worse than #2. Try #4.
DB HP2, HP2, 221, 189 ; 220: #4 better/worse than #2. Try #5.
DB VP1, NOADJ, 0FFH,0FFH ; 221: #5 better/worse than #2. Take best, quit.
DB VM1HM1, VM1HM1, 223, 219 ; 222: #2 better/worse than #1. Try #3.
DB HM1, HM1, 224, 187 ; 223: #3 better/worse than #1. Try #4.
DB HP2, HP2, 225, 189 ; 224: #4 better/worse than #1. Try #5.
DB VP2, NOADJ, 0FFH,0FFH ; 225: #5 better/worse than #1. Take best, quit.
; State 226 moves one line down and one pel right (onto candidate #1 of the
; radius 1 pattern) and enters state 130 in either outcome.
FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR = 226
DB VP1HP1, VP1HP1, 130, 130 ; 226: Redoing ctr, away from limiting edge.
; Six unused bytes; with them the rule table is 22 + 217*4 + 6 = 896 bytes.
DB ?, ?, ?, ?, ?, ?
; Penalty values added to the SWDs of half pel reference macroblocks, so that
; those lying off the edge of the frame end up with artificially high SWDs.
; Sixteen DWORD entries: in entry N, byte K (K = 0 is the low byte) is 000H
; when bit K of N is set and 0FFH when it is clear.
; (64 bytes;3040:3103)
InvalidateBadHalfPelMVs LABEL DWORD
DD 0FFFFFFFFH, 0FFFFFF00H ; Entries 0, 1.
DD 0FFFF00FFH, 0FFFF0000H ; Entries 2, 3.
DD 0FF00FFFFH, 0FF00FF00H ; Entries 4, 5.
DD 0FF0000FFH, 0FF000000H ; Entries 6, 7.
DD 000FFFFFFH, 000FFFF00H ; Entries 8, 9.
DD 000FF00FFH, 000FF0000H ; Entries 10, 11.
DD 00000FFFFH, 00000FF00H ; Entries 12, 13.
DD 0000000FFH, 000000000H ; Entries 14, 15.
; VertWtSel and HorzWtSel: two byte tables interleaved pairwise (VertWtSel
; entry in the even byte, HorzWtSel entry in the odd byte).  They select the
; case from the next table (Diff_IdxRefWts, below) that drives the weighting
; of the future and past predictions in the construction of B-frame reference
; blocks.  224 pairs in all; runs of identical pairs are written with DUP.
; (448 bytes;3104:3551)
VertWtSel LABEL BYTE
DB 0
HorzWtSel LABEL BYTE
DB 240
DB 49 DUP (0, 240)
DB 1, 0, 1, 0
DB 2, 16, 2, 16
DB 3, 32, 3, 32
DB 4, 48, 4, 48
DB 5, 64, 5, 64
DB 6, 80, 6, 80
DB 7, 96, 7, 96
DB 17 DUP (8, 112)
DB 9, 128, 9, 128
DB 10, 144, 10, 144
DB 11, 160, 11, 160
DB 12, 176, 12, 176
DB 13, 192, 13, 192
DB 14, 208, 14, 208
DB 15, 224, 15, 224
DB 17 DUP (0, 240)
DB 0, 240 ; Chroma starts here
DB 30 DUP (0, 240)
DB 0, 240 ; Luma ends here
DB 2 DUP (0, 240)
DB 1, 0, 1, 0
DB 2, 16, 2, 16
DB 3, 32, 3, 32
DB 4, 48, 4, 48
DB 5, 64, 5, 64
DB 6, 80, 6, 80
DB 7, 96, 7, 96
DB 8, 112
DB 9, 128, 9, 128
DB 10, 144, 10, 144
DB 11, 160, 11, 160
DB 12, 176, 12, 176
DB 13, 192, 13, 192
DB 14, 208, 14, 208
DB 15, 224, 15, 224
DB 49 DUP (0, 240)
; Diff_IdxRefWts: gives the index of the weight to apply to the future and
; past predictions when constructing B-frame reference blocks for frame
; differencing.  Entry VertWtSel[VMV]+HorzWtSel[HMV]+N is the index of the
; weight for line N.
; 33 rows of 8 identical bytes each.
; (264 bytes;3552:3815)
;
; Weight index values (multiples of 8).
P8F0 = 0*8
F1P7 = 1*8
F2P6 = 2*8
F3P5 = 3*8
F4P4 = 4*8
F5P3 = 5*8
F6P2 = 6*8
F7P1 = 7*8
F8P0 = 8*8
P1F7 = 9*8
P2F6 = 10*8
P3F5 = 11*8
P4F4 = 12*8
P5F3 = 13*8
P6F2 = 14*8
P7F1 = 15*8
Diff_IdxRefWts LABEL BYTE
DB 8 DUP (P8F0)
DB 8 DUP (F1P7)
DB 8 DUP (P8F0)
DB 8 DUP (F2P6)
DB 8 DUP (P8F0)
DB 8 DUP (F3P5)
DB 8 DUP (P8F0)
DB 8 DUP (F4P4)
DB 8 DUP (P8F0)
DB 8 DUP (F5P3)
DB 8 DUP (P8F0)
DB 8 DUP (F6P2)
DB 8 DUP (P8F0)
DB 8 DUP (F7P1)
DB 8 DUP (P8F0)
DB 8 DUP (F8P0)
DB 8 DUP (P8F0)
DB 8 DUP (P1F7)
DB 8 DUP (P8F0)
DB 8 DUP (P2F6)
DB 8 DUP (P8F0)
DB 8 DUP (P3F5)
DB 8 DUP (P8F0)
DB 8 DUP (P4F4)
DB 8 DUP (P8F0)
DB 8 DUP (P5F3)
DB 8 DUP (P8F0)
DB 8 DUP (P6F2)
DB 8 DUP (P8F0)
DB 8 DUP (P7F1)
DB 8 DUP (P8F0)
DB 8 DUP (P8F0)
DB 8 DUP (P8F0)
BFrmSWDState LABEL BYTE ; State engine rules for finding best motion vector.
; (48 bytes; 3816:3863)
; Each rule is 4 bytes, so the state numbers used below (0, 4, 8, ...) are
; byte offsets into this table.
; 1st number: Horizontal Motion displacement to try, in half pel increments.
; 2nd number: Vertical Motion displacement to try, in half pel increments.
; 3rd number: Next state to enter if previous best is still best.
; 4th number: Next state to enter if this motion is better than previous best.
; NOTE(review): judging by states 8 and 20 (stored displacement 4, described
; as "Try 2" after a best of -2), the stored displacements look like they are
; relative to the current best vector rather than absolute.  Confirm against
; the code that walks this table.
; States 0..20 step by 2 half pels; states 24..44 repeat the same pattern
; with a step of 1 half pel.
DB -2, 0, 4, 8 ; 0 -- ( 0, 0) Try (-2, 0)
DB 2, 0, 12, 12 ; 4 -- ( 0, 0) Try ( 2, 0)
DB 4, 0, 12, 12 ; 8 -- (-2, 0) Try ( 2, 0)
DB 0, -2, 16, 20 ; 12 -- ( N, 0) Try ( N,-2) (N = {-2,0,2})
DB 0, 2, 24, 24 ; 16 -- ( N, 0) Try ( N, 2)
DB 0, 4, 24, 24 ; 20 -- ( N,-2) Try ( N, 2)
DB -1, 0, 28, 32 ; 24 -- Horizontal step -1 (same shape as state 0).
DB 1, 0, 36, 36 ; 28 -- Horizontal step +1 (same shape as state 4).
DB 2, 0, 36, 36 ; 32 -- Horizontal step +2 (same shape as state 8).
DB 0, -1, 40, 44 ; 36 -- Vertical step -1 (same shape as state 12).
DB 0, 1, 0, 0 ; 40 -- Vertical step +1; next state is 0 either way.
DB 0, 2, 0, 0 ; 44 -- Vertical step +2; next state is 0 either way.
; Table used by Quant RLE to navigate the zigzag order of quantized coeffs.
; Contents of this table are initialized by first entry to MMxEDTQ. In
; unlikely event of race condition, it will just get initialized by more
; than one encoder instance.
; (128 bytes; 3864:3991)
; Every byte starts out as 0FFH.  MMxEDTQ tests NextZigZagCoeff[Q77] on entry:
; once that byte is 0 the table is treated as already initialized.
NextZigZagCoeff LABEL BYTE
DB 128 DUP (0FFH)
; Table used to initialize the above table.
; (64 bytes: 3992:4055)
; Lists the Qxx coefficient identifiers in zigzag visiting order, terminated
; by 0.  The init loop in MMxEDTQ stores each byte of this list at
; NextZigZagCoeff[previous byte], starting from index 0, so NextZigZagCoeff[X]
; ends up holding the coefficient that follows X.  The terminating 0 is
; therefore written to NextZigZagCoeff[Q77], which doubles as the
; "already initialized" marker.
InitZigZagCoeff LABEL BYTE
DB Q01,Q10,Q20,Q11,Q02,Q03,Q12,Q21,Q30,Q40,Q31,Q22,Q13,Q04,Q05,Q14
DB Q23,Q32,Q41,Q50,Q60,Q51,Q42,Q33,Q24,Q15,Q06,Q07,Q16,Q25,Q34,Q43
DB Q52,Q61,Q70,Q71,Q62,Q53,Q44,Q35,Q26,Q17,Q27,Q36,Q45,Q54,Q63,Q72
DB Q73,Q64,Q55,Q46,Q37,Q47,Q56,Q65,Q74,Q75,Q66,Q57,Q67,Q76,Q77, 0
; Constants needed by the Quant RLE phase.
; (128 bytes; 4056:4183)
; One DWORD per quantizer level QP (0..31); MMxEDTQ reads it as
; Recip2QP[QP*4].  Both 16-bit halves of an entry hold the same value,
; 04000H/QP truncated to an integer.  The QP = 0 entry is simply 0.
Recip2QP LABEL DWORD
WORD 0H, 0H ; QP = 000h
WORD 04000H, 04000H ; QP = 001h
WORD 02000H, 02000H ; QP = 002h
WORD 01555H, 01555H ; QP = 003h
WORD 01000H, 01000H ; QP = 004h
WORD 00CCCH, 00CCCH ; QP = 005h
WORD 00AAAH, 00AAAH ; QP = 006h
WORD 00924H, 00924H ; QP = 007h
WORD 00800H, 00800H ; QP = 008h
WORD 0071CH, 0071CH ; QP = 009h
WORD 00666H, 00666H ; QP = 00Ah
WORD 005D1H, 005D1H ; QP = 00Bh
WORD 00555H, 00555H ; QP = 00Ch
WORD 004ECH, 004ECH ; QP = 00Dh
WORD 00492H, 00492H ; QP = 00Eh
WORD 00444H, 00444H ; QP = 00Fh
WORD 00400H, 00400H ; QP = 010h
WORD 003C3H, 003C3H ; QP = 011h
WORD 0038EH, 0038EH ; QP = 012h
WORD 0035EH, 0035EH ; QP = 013h
WORD 00333H, 00333H ; QP = 014h
WORD 0030CH, 0030CH ; QP = 015h
WORD 002E8H, 002E8H ; QP = 016h
WORD 002C8H, 002C8H ; QP = 017h
WORD 002AAH, 002AAH ; QP = 018h
WORD 0028FH, 0028FH ; QP = 019h
WORD 00276H, 00276H ; QP = 01Ah
WORD 0025EH, 0025EH ; QP = 01Bh
WORD 00249H, 00249H ; QP = 01Ch
WORD 00234H, 00234H ; QP = 01Dh
WORD 00222H, 00222H ; QP = 01Eh
WORD 00210H, 00210H ; QP = 01Fh
; Skip over space to get to where the following tables can go. They will
; hit the cache at the same point as a portion of the StateEngine states
; that aren't used in the heuristic ME mode.
; (2056 bytes; 4184:6239)
DB 2056 DUP (?) ; Static space place-holder.
; Table to select base address in next table below to use for particular block
; of macroblock. First column provides address of base element of HorzWtSel
; to use to map horizontal MV to list of weighting indices to use. Second
; column is similar, but for Vertical MV. Third and fourth columns not used
; (filled with 0DEADBEEFH), with two exceptions: the 4 bytes of BlkEmptyFlag
; occupy the fourth column of the fifth row, and the sixth row consists of
; the first two columns only.
; 6 rows; one for each block in a macroblock.
; (88 bytes; 6240:6327)
; UpDownBlkPosition labels the second column of the first row, so the
; vertical entries can be addressed with the same 16-byte row stride.
LeftRightBlkPosition LABEL DWORD
DD HorzWtSel+0-64
UpDownBlkPosition LABEL DWORD
DD VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH
DD HorzWtSel+32-64, VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH
DD HorzWtSel+0-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH
DD HorzWtSel+32-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH
DD HorzWtSel+128, VertWtSel+128, 0DEADBEEFH
BlkEmptyFlag LABEL BYTE ; sneak this in here
DB 16, 0, 32, 0
DD HorzWtSel+128, VertWtSel+128
; The following table, indexed by MBEdgeType&7, returns a mask which is used to
; zero-out the motion vectors for predictors that are off the edge of the
; frame. The index is a 3 bit value, each bit being set if the macroblock
; is NOT on the corresponding edge. 1 == left; 2 == right; 4 == top;
; The value gotten out is (where A==left; B==above; C==above right):
; <mask(A) mask(A) mask(C) mask(C) mask(B) mask(B) mask(A) mask(A)>
; The mask is 0xFF if the corresponding remote block is NOT off the edge, and
; 0x00 if it is off the edge.
; Each DWORD stored here holds one mask byte per predictor, <A C B A> from
; most to least significant byte.  The 8-byte form shown above is what the
; code gets after widening the DWORD with punpcklbw, so that each mask covers
; both components of a motion vector.
; (32 bytes: 6328: 6359)
ValidRemoteVectors LABEL DWORD
DWORD 0DEADBEEFH ; 0: Can't be on left and right edges at once.
DWORD 0FF0000FFH ; 1: Top right corner.
DWORD 000000000H ; 2: Top left corner.
DWORD 0FF0000FFH ; 3: Top edge.
DWORD 0DEADBEEFH ; 4: Can't be on left and right edges at once.
DWORD 0FF00FFFFH ; 5: Right edge.
DWORD 000FFFF00H ; 6: Left edge.
DWORD 0FFFFFFFFH ; 7: Central macroblock.
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
; to be subtracted with saturation from the predicted motion vector (already
; biased by +128) for extended motion vector search. Since saturation occurs
; at 0, the values here are such that the motion vectors are biased to the
; appropriate point for the clamping effect. The index is a 4 bit value, each
; bit being set if the macroblock is NOT on the corresponding edge.
; 1 == left; 2 == right; 4 == top; 8 == bottom. The 8 values being calculated
; are as follows:
; ; [ 0: 7] -- HMV lower limit for signature search
; ; [ 8:15] -- HMV lower limit
; ; [16:23] -- HMV upper limit for signature search
; ; [24:31] -- HMV upper limit
; ; [32:39] -- VMV lower limit for signature search
; ; [40:47] -- VMV lower limit
; ; [48:55] -- VMV upper limit for signature search
; ; [56:63] -- VMV upper limit
; (88 bytes: 6360:6447)
; Entries 0..4 are impossible edge combinations and are not stored, so the
; code addresses this table as EMV_ClampLowerEnd[MBEdgeType*8-40].
EMV_ClampLowerEnd LABEL DWORD
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
BYTE 87, 94, 97, 100, ; 5: Bottom right corner.
87, 94, 97, 100
BYTE 119, 126, 97, 100, ; 6: Bottom left corner.
87, 94, 97, 100
BYTE 87, 94, 97, 100, ; 7: Bottom edge.
87, 94, 97, 100
DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
BYTE 87, 94, 97, 100, ; 9: Top right corner.
119, 126, 97, 100
BYTE 119, 126, 97, 100, ; 10: Top left corner.
119, 126, 97, 100
BYTE 87, 94, 97, 100, ; 11: Top edge.
119, 126, 97, 100
DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
BYTE 87, 94, 97, 100, ; 13: Right edge.
87, 94, 97, 100
BYTE 119, 126, 97, 100, ; 14: Left edge.
87, 94, 97, 100
BYTE 87, 94, 97, 100, ; 15: Central macroblock.
87, 94, 97, 100
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
; to be added with saturation to the result of the application of the
; preceding table, to clamp the upper limit on the motion vector search
; parameters. Since saturation occurs at 255, the values here are such that
; the motion vectors are biased to the appropriate point for the clamping
; effect.
; (88 bytes: 6448:6535)
; Entries 0..4 are impossible edge combinations and are not stored, so the
; code addresses this table as EMV_ClampUpperEnd[MBEdgeType*8-40].
EMV_ClampUpperEnd LABEL DWORD
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
BYTE 184, 193, 216, 225, ; 5: Bottom right corner.
184, 193, 216, 225
BYTE 216, 225, 184, 193, ; 6: Bottom left corner.
184, 193, 216, 225
BYTE 184, 193, 184, 193, ; 7: Bottom edge.
184, 193, 216, 225
DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
BYTE 184, 193, 216, 225, ; 9: Top right corner.
216, 225, 184, 193
BYTE 216, 225, 184, 193, ; 10: Top left corner.
216, 225, 184, 193
BYTE 184, 193, 184, 193, ; 11: Top edge.
216, 225, 184, 193
DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
BYTE 184, 193, 216, 225, ; 13: Right edge.
184, 193, 184, 193
BYTE 216, 225, 184, 193, ; 14: Left edge.
184, 193, 184, 193
BYTE 184, 193, 184, 193, ; 15: Central macroblock.
184, 193, 184, 193
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
; to be subtracted without saturation (the code uses psubb) from the result of
; the application of the preceding table, to return the motion vector search
; parameters to the proper range for subsequent use.
; (88 bytes: 6536:6623)
; Entries 0..4 are impossible edge combinations and are not stored, so the
; code addresses this table as EMV_RestoreRange[MBEdgeType*8-40].
EMV_RestoreRange LABEL DWORD
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
BYTE 120, 255, 88, 225, ; 5: Bottom right corner.
120, 255, 88, 225
BYTE 120, 255, 56, 193, ; 6: Bottom left corner.
120, 255, 88, 225
BYTE 120, 255, 56, 193, ; 7: Bottom edge.
120, 255, 88, 225
DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
BYTE 120, 255, 88, 225, ; 9: Top right corner.
120, 255, 56, 193
BYTE 120, 255, 56, 193, ; 10: Top left corner.
120, 255, 56, 193
BYTE 120, 255, 56, 193, ; 11: Top edge.
120, 255, 56, 193
DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
BYTE 120, 255, 88, 225, ; 13: Right edge.
120, 255, 56, 193
BYTE 120, 255, 56, 193, ; 14: Left edge.
120, 255, 56, 193
BYTE 120, 255, 56, 193, ; 15: Central macroblock.
120, 255, 56, 193
; Tables indexed by indices fetched from Diff_IdxRefWts. These tables return
; a multiplier to apply to past or future predictions to construct the
; B-frame candidate reference blocks.
; (128 bytes;6624:6751)
; 16 QWORD entries; entry k sits at byte offset k*8, which is the value of
; the matching P8F0..P7F1 symbol.  In each DD pair the first DWORD is the low
; half of the QWORD.  Every byte is either 0FFH or 00H.
FutureWt_FF_or_00 LABEL DWORD
DD 000000000H, 000000000H ; 0: P8F0
DD 000000000H, 0FF000000H ; 1: F1P7
DD 000000000H, 0FFFF0000H ; 2: F2P6
DD 000000000H, 0FFFFFF00H ; 3: F3P5
DD 000000000H, 0FFFFFFFFH ; 4: F4P4
DD 0FF000000H, 0FFFFFFFFH ; 5: F5P3
DD 0FFFF0000H, 0FFFFFFFFH ; 6: F6P2
DD 0FFFFFF00H, 0FFFFFFFFH ; 7: F7P1
DD 0FFFFFFFFH, 0FFFFFFFFH ; 8: F8P0
DD 0FFFFFFFFH, 000FFFFFFH ; 9: P1F7
DD 0FFFFFFFFH, 00000FFFFH ; 10: P2F6
DD 0FFFFFFFFH, 0000000FFH ; 11: P3F5
DD 0FFFFFFFFH, 000000000H ; 12: P4F4
DD 000FFFFFFH, 000000000H ; 13: P5F3
DD 00000FFFFH, 000000000H ; 14: P6F2
DD 0000000FFH, 000000000H ; 15: P7F1
MMXMEDATA ENDS
;=============================================================================
.CODE EDTQ
ASSUME cs : FLAT
ASSUME ds : FLAT
ASSUME es : FLAT
ASSUME fs : FLAT
ASSUME gs : FLAT
ASSUME ss : FLAT
EXTERN MMxDoForwardDCT:NEAR
EXTERN MMxDoForwardDCTx:NEAR
EXTERN MMxDoForwardDCTy:NEAR
IFDEF H261
ELSE
EXTERN MMxDoBFrameLumaBlocks:NEAR
EXTERN MMxDoBFrameChromaBlocks:NEAR
ENDIF
MMxEDTQ proc C AMBAS: DWORD,
ATarg: DWORD,
APrev: DWORD,
ABTarg: DWORD,
AWtFwd: DWORD,
AWtBwd: DWORD,
AFrmWd: DWORD,
ADoHalf: DWORD,
ADoBlk: DWORD,
ADoSF: DWORD,
ADoAP: DWORD,
ADoB: DWORD,
ADoLuma: DWORD,
ADoExtMV:DWORD,
AQP: DWORD,
ABQP: DWORD,
AB0VecT: DWORD,
ASpaFilT:DWORD,
ASpaFilD:DWORD,
ASWDTot: DWORD,
ABSWDTot:DWORD,
ACodStr: DWORD,
ABCodStr:DWORD
LocalFrameSize = 1536 ; Space needed for locals
RegStoSize = 16
; Arguments:
MBlockActionStream_arg = RegStoSize + 4
TargetFrameBaseAddress_arg = RegStoSize + 8
PreviousFrameBaseAddress_arg = RegStoSize + 12
BTargetFrameBaseAddress_arg = RegStoSize + 16
SignatureBaseAddress_arg = RegStoSize + 20
WeightForwardMotion_arg = RegStoSize + 24
WeightBackwardMotion_arg = RegStoSize + 28
FrameWidth = RegStoSize + 32
DoHalfPelEstimation_arg = RegStoSize + 36
DoBlockLevelVectors_arg = RegStoSize + 40
DoSpatialFiltering_arg = RegStoSize + 44
DoAdvancedPrediction_arg = RegStoSize + 48
DoBFrame_arg = RegStoSize + 52
DoLumaBlocksInThisPass_arg = RegStoSize + 56
DoExtendedMotionVectors_arg = RegStoSize + 60
QuantizationLevel = RegStoSize + 64
BQuantizationLevel = RegStoSize + 68
BFrmZeroVectorThreshold_arg = RegStoSize + 72
SpatialFiltThreshold_arg = RegStoSize + 76
SpatialFiltDifferential_arg = RegStoSize + 80
PSWDTotal = RegStoSize + 84
PBSWDTotal = RegStoSize + 88
CodeStreamCursor_arg = RegStoSize + 92
BCodeStreamCursor_arg = RegStoSize + 96
EndOfArgList = RegStoSize + 100
StackOffset TEXTEQU <0>
CONST_384 TEXTEQU <384>
push esi
push edi
push ebp
push ebx
; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
mov esi,esp
and esp,0FFFFF000H
sub esp,000000FE0H
IFDEF H261
mov ebp,PITCH
CONST_384 TEXTEQU <ebp>
mov eax,[esi+SpatialFiltThreshold_arg]
mov ebx,[esi+SpatialFiltDifferential_arg]
mov SpatialFiltThreshold,eax
mov SpatialFiltDifferential,ebx
mov ecx,[esi+TargetFrameBaseAddress_arg]
mov ebx,[esi+SignatureBaseAddress_arg]
sub ecx,ebx
mov eax,[esi+TargetFrameBaseAddress_arg]
mov SigToTarget,ecx
add ecx,PITCH*80+64
neg ecx
mov TargetToSig_Debiased,ecx
mov ebx,[esi+PreviousFrameBaseAddress_arg]
mov PreviousFrameBaseAddress,ebx
mov TargetFrameBaseAddress,eax
sub ebx,eax
mov ecx,[esi+QuantizationLevel]
mov TargToRef,ebx
mov eax,[esi+CodeStreamCursor_arg]
mov ebx,ecx
mov CodeStreamCursor,eax
shl ebx,16
xor edx,edx
or ebx,ecx
mov ecx,Recip2QP[ecx*4]
mov QPDiv2,ebx
mov Recip2QPToUse,ecx
mov eax,[esi+DoSpatialFiltering_arg]
mov DoExtendedMotionVectors,edx
test eax,eax
je @f
mov eax,3
@@:
mov DoSpatialFiltering,al
mov SWDTotal,edx
mov BestMBHalfPelMV,edx
mov ebx,PreviousFrameBaseAddress
mov BlockAbove[0],edx
sub ebx,16
mov edx,[esi+FrameWidth]
mov SpatiallyFilteredMB,ebx
imul edx,-SIZEOF T_MacroBlockActionDescr/16
add edx,2*SIZEOF T_Blk
mov eax,14 ; 14 if restricted MVs and doing heuristic ME.
mov BlockAbove[4],edx
mov DoHeuristicME,eax
ELSE
mov eax,[esi+DoExtendedMotionVectors_arg]
test eax,eax
je @f
mov eax,7
@@:
mov DoExtendedMotionVectors,eax
mov eax,[esi+BFrmZeroVectorThreshold_arg]
mov edi,[esi+WeightForwardMotion_arg]
mov BFrmZeroVectorThreshold,eax
mov ecx,60
mov ebx,060606060H
lea edx,WeightForwardMotion+128
@@:
mov eax,[edi+ecx]
and eax,03F3F3F3FH ; ???
mov ebp,[edi+ecx+64]
and ebp,03F3F3F3FH ; ???
xor eax,ebx
xor ebp,ebx
mov [edx+ecx+64],eax
mov [edx+ecx-128],ebp
sub ecx,4
mov ebp,PITCH
jge @b
mov edi,[esi+WeightBackwardMotion_arg]
mov eax,edx
lea edx,WeightBackwardMotion+128
mov ecx,60
sub eax,edx
jne @b
CONST_384 TEXTEQU <ebp>
mov ebx,[esi+PreviousFrameBaseAddress_arg]
mov eax,[esi+TargetFrameBaseAddress_arg]
mov PreviousFrameBaseAddress,ebx
mov TargetFrameBaseAddress,eax
mov ecx,[esi+BTargetFrameBaseAddress_arg]
sub ebx,eax
mov TargToRef,ebx
sub eax,ecx
mov BFrameBaseAddress,ecx
mov BFrameToFuture,eax
mov ecx,[esi+TargetFrameBaseAddress_arg]
mov ebx,[esi+SignatureBaseAddress_arg]
sub ecx,ebx
mov edx,[esi+FrameWidth]
mov SigToTarget,ecx
add ecx,PITCH*80+64
neg ecx
imul edx,-SIZEOF T_MacroBlockActionDescr/16
mov TargetToSig_Debiased,ecx
mov ecx,[esi+DoBFrame_arg]
add edx,2*SIZEOF T_Blk
xor cl,1
mov BlockAbove[4],edx
mov IsPlainPFrame,cl
mov ecx,[esi+QuantizationLevel]
mov eax,[esi+CodeStreamCursor_arg]
mov ebx,ecx
mov CodeStreamCursor,eax
mov eax,[esi+BCodeStreamCursor_arg]
mov BCodeStreamCursor,eax
shl ebx,16
mov eax,[esi+DoHalfPelEstimation_arg]
or ebx,ecx
mov ecx,Recip2QP[ecx*4]
mov QPDiv2,ebx
mov Recip2QPToUse,ecx
mov ecx,[esi+BQuantizationLevel]
xor edx,edx
mov ebx,ecx
shl ebx,16
mov BestMBHalfPelMV,edx
or ebx,ecx
mov ecx,Recip2QP[ecx*4]
mov BQPDiv2,ebx
mov BRecip2QPToUse,ecx
test eax,eax
je @f
mov eax,-4
@@:
mov DoHalfPelME,eax
mov eax,[esi+DoBlockLevelVectors_arg]
mov DoBlockLevelVectors,al
mov eax,[esi+DoAdvancedPrediction_arg]
mov DoAdvancedPrediction,al
mov SWDTotal,edx
test eax,eax
lea eax,[eax+14] ; 14 if restricted MVs and doing heuristic ME.
je @f
xor eax,eax ; 0 if unrestricted MVs and doing heuristic ME.
@@:
mov DoHeuristicME,eax
mov BSWDTotal,edx
mov PendingOBMC,edx
mov BlockAbove[0],edx
ENDIF
; Default search limits, used when no extended-MV limits are computed.
; Bytes of 01E98E268H, low to high: 68H = 128-24, 0E2H = -30, 98H = 128+24,
; 1EH = +30.
mov eax,01E98E268H
mov EMVLimitsForThisMB,eax
; ; [ 0: 7] -- HMV lower limit for sig search (biased 128)
; ; [ 8:15] -- HMV lower limit (signed)
; ; [16:23] -- HMV upper limit for sig search (biased 128)
; ; [24:31] -- HMV upper limit (signed)
mov EMVLimitsForThisMB+4,eax ; Same as for HMV.
mov edx,[esi+MBlockActionStream_arg] ; edx -- MBlockActionStream cursor.
; One-time build of NextZigZagCoeff.  The table is pre-filled with 0FFH; the
; loop below finishes by writing 0 to NextZigZagCoeff[Q77], so a zero there
; means the table has already been built.
mov al,NextZigZagCoeff[Q77]
test al,al
je ZigZagCoeffInitialized
xor ecx,ecx ; ecx -- previous coefficient; the chain starts at index 0.
lea ebx,InitZigZagCoeff
xor eax,eax ; Clear upper bytes so eax == al after each byte load.
@@:
mov al,[ebx] ; Next coefficient in zigzag order.
inc ebx
mov NextZigZagCoeff[ecx],al ; Link: successor of previous coeff.
mov ecx,eax ; This coefficient becomes the previous one.
test eax,eax ; The list is terminated by a 0 byte.
jne @b
ZigZagCoeffInitialized:
mov StashESP,esi ; esi holds the esp value from before the frame realignment.
mov eax,[esi+DoLumaBlocksInThisPass_arg]
test eax,eax
jne FirstMacroBlock ; Jump if doing luma plane
jmp FirstMacroBlock_ChromaProcessing
; Chroma pass loop control and the INTRA chroma path.  Both the intra path
; below and the inter path (ChromaIsInterCoded) finish by jumping to
; IntraCodedChromaProcessingDone.
; edx -- MBlockActionStream cursor for the current macroblock.
IntraCodedChromaProcessingDone:
IFDEF H261
ELSE
mov al,IsPlainPFrame
test al,al
jne NextMacroBlock_ChromaProcessing ; Skip the B-frame chroma work if plain P.
mov eax,QPDiv2
mov ebx,BQPDiv2
call MMxDoBFrameChromaBlocks
ENDIF
NextMacroBlock_ChromaProcessing:
mov bl,[edx].CodedBlocks
sub edx,-SIZEOF T_MacroBlockActionDescr ; Advance to the next descriptor.
and bl,040H ; Check for end-of-stream
jne TrulyDone
FirstMacroBlock_ChromaProcessing:
mov al,[edx].BlockType ; Chroma handling. Intra? Or Inter?
mov ecx,TargetFrameBaseAddress
cmp al,INTRA
jne ChromaIsInterCoded
mov esi,[edx].BlkU.BlkOffset ; Offset of U block in the frame.
mov StashBlockType,al
add esi,ecx ; esi -- address of U block in target frame.
push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH
; NOTE(review): bl appears to come back from the DCT/quant code as a 0/1
; "block is empty" flag; shifted to bit 4 (U) or bit 5 (V) and subtracted, it
; clears that block's bit in CodedBlocks.  Confirm against MMxDoForwardDCT.
shl bl,4
mov al,[edx].CodedBlocks
sub al,bl
mov esi,[edx].BlkV.BlkOffset ; Offset of V block in the frame.
mov [edx].CodedBlocks,al
mov ecx,TargetFrameBaseAddress
add esi,ecx ; esi -- address of V block in target frame.
call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH
shl bl,5
mov al,[edx].CodedBlocks
sub al,bl
pop ecx ; Adjust stack pointer
StackOffset TEXTEQU <0>
mov [edx].CodedBlocks,al
jmp IntraCodedChromaProcessingDone
; Inter-coded chroma: difference the U block and then the V block against
; their past references and hand each to the forward DCT.
; On entry:
; edx -- MBlockActionStream cursor for the current macroblock.
; ecx -- TargetFrameBaseAddress
; al -- [edx].BlockType (known not to be INTRA)
ChromaIsInterCoded:
mov edi,[edx].BlkU.BlkOffset ; Offset of U block in the frame.
mov ebx,[edx].BlkU.MVs
add edi,ecx ; edi -- address of U block in target frame.
mov esi,[edx].BlkU.PastRef
mov StashBlockType,al
IFDEF H261
mov ecx,2+256*1 ; cl==2 tells SpatialLoopFilter code to do one
; ; block. ch==1 causes it to return to here.
mov TargetMacroBlockBaseAddr,edi ; Store address of U block.
cmp al,INTERSLF
je DoSpatialFilterForChroma
ReturnFromSpatialFilterForU:
ENDIF
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.)
; Lines 4..6 are stored to PelDiffsLine4..6 below; the line 7 differences are
; left in mm1 at the call (presumably where MMxDoForwardDCTx expects them).
movq mm4,[edi+ebp*4] ; T4
psrlq mm1,1
movq mm5,[edi+PITCH*5]
psubb mm4,mm0 ; D4 = T4 - P4
movq mm0,[edi+PITCH*6]
psubb mm5,mm1
movq mm1,[edi+PITCH*7]
pand mm2,mm6
pand mm3,mm6
psrlq mm2,1
movq PelDiffsLine4,mm4 ; Store D4.
psubb mm0,mm2
movq PelDiffsLine5,mm5
psrlq mm3,1
movq PelDiffsLine6,mm0
psubb mm1,mm3
push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
; NOTE(review): bl appears to be a 0/1 "block is empty" flag from the DCT/quant
; code; shifted to bit 4 and subtracted, it clears the U bit of CodedBlocks.
shl bl,4
mov al,[edx].CodedBlocks
sub al,bl
mov ecx,TargetFrameBaseAddress
mov [edx].CodedBlocks,al
pop edi ; Adjust stack pointer
StackOffset TEXTEQU <0>
mov edi,[edx].BlkV.BlkOffset ; Offset of V block in the frame.
mov ebx,[edx].BlkV.MVs
add edi,ecx ; edi -- address of V block in target frame.
mov esi,[edx].BlkV.PastRef
IFDEF H261
mov ecx,2-256*1 ; cl==2 tells SpatialLoopFilter code to do one
; ; block. ch==-1 causes it to return to here.
mov TargetMacroBlockBaseAddr,edi ; Store address of V block.
mov al,[edx].BlockType
cmp al,INTERSLF
je DoSpatialFilterForChroma
ReturnFromSpatialFilterForV:
ENDIF
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.)
; Same sequence as for the U block above.
movq mm4,[edi+ebp*4] ; T4
psrlq mm1,1
movq mm5,[edi+PITCH*5]
psubb mm4,mm0 ; D4 = T4 - P4
movq mm0,[edi+PITCH*6]
psubb mm5,mm1
movq mm1,[edi+PITCH*7]
pand mm2,mm6
pand mm3,mm6
psrlq mm2,1
movq PelDiffsLine4,mm4 ; Store D4.
psubb mm0,mm2
movq PelDiffsLine5,mm5
psrlq mm3,1
movq PelDiffsLine6,mm0
psubb mm1,mm3
push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
shl bl,5 ; As above, but bit 5 for the V block.
mov al,[edx].CodedBlocks
sub al,bl
pop ecx ; Adjust stack pointer
StackOffset TEXTEQU <0>
mov [edx].CodedBlocks,al
jmp IntraCodedChromaProcessingDone
;============================================================================
; Here we copy the target macroblock, and interpolate left, right, and both.
; We also accumulate the target pels for each block. Result is four partial
; sums in four packed words. After summing them all up, the final sum will
; be the sum of the 64 pels of each block, divided by 2.
NextMacroBlock:
mov bl,[edx].CodedBlocks
sub edx,-SIZEOF T_MacroBlockActionDescr
and bl,040H ; Check for end-of-stream
jne Done
FirstMacroBlock:
mov edi,TargetFrameBaseAddress
mov esi,[edx].BlkY1.BlkOffset ; Get address of next macroblock to do.
add edi,esi
mov esi,TargToRef
add esi,edi
mov TargetMacroBlockBaseAddr,edi
mov Addr0MVRef,esi
;============================================================================
; We calculate the 0-motion SWD. We use 32 match points per block, and
; write the result seperately for each block. If the SWD for the 0-motion
; vector is below a threshold, we don't bother searching for other possibly
; better motion vectors.
;
; ebp -- PITCH
; esi -- Address of ref block.
; edi -- Address of target block.
; edx -- MBlockActionStream
; ecx -- Not used. Will be linearized MV in non-zero MV search.
; ebx -- CurrSWDState, i.e. FirstMEState, times 8
; eax -- Scratch
; mm7 -- Best SWD for macroblock.
; mm0-mm6 Scratch
;
mov cl,[edx].CodedBlocks ; Init CBP for macroblock.
or cl,03FH ; Indicate all 6 blocks are coded.
mov eax,DoHeuristicME ; 0 if unrestricted MVs and heur ME.
; ; 14 if restricted MVs and heur ME.
; ; 15 if suppressing heuristic ME.
mov [edx].CodedBlocks,cl
js IntraByDecree
xor ebx,ebx ; Avoid partial register stall.
xor ecx,ecx
mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
pcmpeqd mm7,mm7 ; Init previous best SWD to huge.
mov bl,[edx].FirstMEState ; Test for INTRA-BY-DECREE.
sub eax,ecx ; Negative iff should do heuristic ME
; ; for this macroblock.
test bl,bl
je IntraByDecree
sar eax,31
psrlq mm7,2
or ebx,eax ; -1 if doing heuristic ME.
mov al,INTER1MV ; Speculate INTER, 1 motion vector.
mov [edx].BlockType,al
psrld mm7,14 ; mm7[32:63]: Previous best SWD = 0x0000FFFF.
; ; mm7[ 0:31]: Prev SWD that we diminish = 0x0003FFFF.
; ; Since we can't diminish it below 0x00020000, we
; ; won't take the short circuit exit from MblkEstQWA.
; At this point:
; ebp -- PITCH
; esi -- Address of upper left block of 0,0 ref area.
; edi -- Address of upper left block of target.
; edx -- MBlockActionStream
; ecx -- Scratch
; ebx -- CurrSWDState, i.e. FirstMEState.
; eax -- Scratch
; mm7 -- Previous best SWD initialized to huge (0xFFFF, 0x3FFFF).
; mm0-mm6 -- Scratch
;============================================================================
; Compute SWD for macroblock.
ComputeMBSWD:
; Registers at this point:
; ebp -- PITCH
; esi -- Address of upper left block of candidate ref area.
; edi -- Address of upper left block of target.
; edx -- MBlockActionStream
; ecx -- Scratch
; ebx -- CurrSWDState
; eax -- Scratch
; mm7 -- Previous best SWD.
; mm0-mm6 -- Scratch
;
lea ecx,[ebp+ebp*4] ; Get PITCH*5
lea eax,[ebp+ebp*2] ; Get PITCH*3
movq mm0,[esi+PITCH*15] ; FL A: Ref MB, lower left block, line 15.
psubw mm0,[edi+PITCH*15] ; FL B: Diff for lower left block, line 15.
movq mm6,[esi+PITCH*15+8] ; FR A
psllw mm0,8 ; FL C: Extract diffs for line 15 even pels.
psubw mm6,[edi+PITCH*15+8] ; FR B
pmaddwd mm0,mm0 ; FL D: Square of diffs for even pels.
movq mm1,[esi+PITCH*9] ; 9L A
psllw mm6,8 ; FR C
psubw mm1,[edi+PITCH*9] ; 9L B
pmaddwd mm6,mm6 ; FR D
movq mm5,[esi+PITCH*9+8] ; 9R A
psllw mm1,8 ; 9L C
psubw mm5,[edi+PITCH*9+8] ; 9R B
pmaddwd mm1,mm1 ; 9L D
movq mm2,[esi+eax*4] ; CL a
psllw mm5,8 ; 9R C
psubw mm2,[edi+eax*4] ; CL b
pmaddwd mm5,mm5 ; 9R D
movq mm3,[esi+eax*4+8] ; CR a
pmaddwd mm2,mm2 ; CL c: Square of diffs for odd pels.
psubw mm3,[edi+eax*4+8] ; CR b
paddusw mm0,mm1 ; LL + Accumulate SWD for lower left block.
movq mm1,[esi+eax*1] ; 3L A
pmaddwd mm3,mm3 ; CR c
psubw mm1,[edi+eax*1] ; 3L B
paddusw mm6,mm5 ; LR +
movq mm5,[esi+eax*1+8] ; 3R A
psllw mm1,8 ; 3L C
psubw mm5,[edi+eax*1+8] ; 3R B
paddusw mm0,mm2 ; LL +
movq mm2,[esi] ; 0L a
pmaddwd mm1,mm1 ; 3L D
psubw mm2,[edi] ; 0L b
paddusw mm6,mm3 ; LR +
movq mm3,[esi+8] ; 0R a
psllw mm5,8 ; 3R C
psubw mm3,[edi+8] ; 0R b
pmaddwd mm5,mm5 ; 3R D
movq mm4,[esi+eax*2] ; 6L a
pmaddwd mm2,mm2 ; 0L c
psubw mm4,[edi+eax*2] ; 6L b
pmaddwd mm3,mm3 ; 0R c
movq PartSWDForLLBlk,mm0 ; Stash SWD for lines 9,12,15, LL blk.
paddusw mm0,mm6 ; Sum SWD for lines 9,12,15 LL and LR.
movq PartSWDForLRBlk,mm6 ; Stash SWD for lines 9,12,15, LR blk.
pmaddwd mm4,mm4 ; 6L c
movq mm6,[esi+eax*2+8] ; 6R a
paddusw mm1,mm2 ; UL +
psubw mm6,[edi+eax*2+8] ; 6R b
paddusw mm5,mm3 ; UR +
movq mm2,[esi+ebp*1] ; 1L A
pmaddwd mm6,mm6 ; 6R c
psubw mm2,[edi+ebp*1] ; 1L B
paddusw mm1,mm4 ; UL +
movq mm3,[esi+ecx*1] ; 5L A
paddusw mm0,mm1 ; Sum partial SWD for LL, LR, and UL.
psubw mm3,[edi+ecx*1] ; 5L B
paddusw mm5,mm6 ; UR +
movq mm6,[esi+ebp*4] ; 4L a
paddusw mm0,mm5 ; Sum partial SWD for all blocks.
movq PartSWDForURBlk,mm5 ; Stash SWD for lines 0,3,6, UR blk.
punpckldq mm5,mm0 ; Get low sum into high bits.
psubw mm6,[edi+ebp*4] ; 4L b
paddusw mm5,mm0 ; Total up SWD for every third line.
movq mm0,[esi+ebp*2] ; 2L a
psrlq mm5,47 ; Position, and double.
psubw mm0,[edi+ebp*2] ; 2L b
pcmpgtd mm5,mm7 ; Is 2 * SWD for 6 lines > prev SWD?
pmaddwd mm0,mm0 ; 2L c
psllw mm2,8 ; 1L C
movdf eax,mm5
pmaddwd mm2,mm2 ; 1L D
test eax,eax
jne MblkEst_EarlyOut
lea eax,[ecx+ebp*2] ; PITCH*7
psllw mm3,8 ; 5L C
paddusw mm1,mm2 ; UL +
pmaddwd mm3,mm3 ; 5L D
movq mm5,[esi+eax*1] ; 7L A
psubw mm5,[edi+eax*1] ; 7L B
pmaddwd mm6,mm6 ; 4L c
movq mm2,[esi+PITCH*11+8] ; BR A
psllw mm5,8 ; 7L C
psubw mm2,[edi+PITCH*11+8] ; BR B
paddusw mm1,mm3 ; UL +
movq mm3,[esi+PITCH*13+8] ; DR A
paddusw mm1,mm0 ; UL +
psubw mm3,[edi+PITCH*13+8] ; DR B
pmaddwd mm5,mm5 ; 7L D
movq mm0,[esi+ebp*8+8] ; 8R a
paddusw mm1,mm6 ; UL +
psubw mm0,[edi+ebp*8+8] ; 8R b
psllw mm2,8 ; BR C
movq mm4,[esi+ecx*2+8] ; AR a
paddusw mm1,mm5 ; UL +
psubw mm4,[edi+ecx*2+8] ; AR b
punpckldq mm6,mm1 ; Get low SWD accum to hi order of mm6.
movq mm5,[esi+eax*2+8] ; ER a
paddusw mm6,mm1 ; mm6[48:63] is SWD for upper left blk.
psubw mm5,[edi+eax*2+8] ; ER b
psrlq mm6,48 ; mm6 is SWD for upper left block.
psubusw mm7,mm6 ; Diminish prev best SWD by cand UL blk.
pmaddwd mm2,mm2 ; BR D
pmaddwd mm0,mm0 ; 8R c
psllw mm3,8 ; DR C
movq mm1,[esi+ebp*1+8] ; 1R A
pmaddwd mm3,mm3 ; DR D
paddusw mm2,PartSWDForLRBlk ; LR +
pmaddwd mm4,mm4 ; AR c
psubw mm1,[edi+ebp*1+8] ; 1R B
paddusw mm2,mm0 ; LR +
movq mm0,[esi+ecx*1+8] ; 5R A
pmaddwd mm5,mm5 ; ER c
psubw mm0,[edi+ecx*1+8] ; 5R B
paddusw mm2,mm3 ; LR +
movq mm3,[esi+eax*1+8] ; 7R A
paddusw mm2,mm4 ; LR +
paddusw mm2,mm5 ; LR +
psllw mm1,8 ; 1R C
psubw mm3,[edi+eax*1+8] ; 7R B
punpckldq mm5,mm2 ; Get low SWD accum to hi order of mm5.
paddusw mm5,mm2 ; mm5[48:63] is SWD for lower right blk.
pmaddwd mm1,mm1 ; 1R D
movq mm2,[esi+ebp*2+8] ; 2R a
psrlq mm5,48 ; mm5 is SWD for lower right block.
psubusw mm7,mm5 ; Diminish prev best SWD by cand LR blk.
punpckldq mm6,mm5 ; mm6[0:31] UL SWD; mm6[32:63] LR SWD.
psubw mm2,[edi+ebp*2+8] ; 2R b
psllw mm0,8 ; 5R C
movq mm5,[esi+ebp*4+8] ; 4R a
pmaddwd mm0,mm0 ; 5R D
psubw mm5,[edi+ebp*4+8] ; 4R b
psllw mm3,8 ; 7R C
paddusw mm1,PartSWDForURBlk ; UR +
pmaddwd mm3,mm3 ; 7R D
paddusw mm1,mm0 ; UR +
pmaddwd mm2,mm2 ; 2R c
movq mm0,[esi+PITCH*11] ; BL A
pmaddwd mm5,mm5 ; 4R c
psubw mm0,[edi+PITCH*11] ; BL B
paddusw mm1,mm3 ; UR +
movq mm3,[esi+ecx*2] ; AL a
paddusw mm1,mm2 ; UR +
psubw mm3,[edi+ecx*2] ; AL b
paddusw mm1,mm5 ; UR +
pmaddwd mm3,mm3 ; AL c
psllw mm0,8 ; BL C
movq mm2,[esi+PITCH*13] ; DL A
pmaddwd mm0,mm0 ; BL D
psubw mm2,[edi+PITCH*13] ; DL B
punpckldq mm5,mm1 ; Get low SWD accum to hi order of mm5.
movq mm4,[esi+ebp*8] ; 8L a
paddusw mm5,mm1 ; mm5[48:63] is SWD for upper right blk.
psubw mm4,[edi+ebp*8] ; 8L b
psllw mm2,8 ; DL C
movq mm1,[esi+eax*2] ; EL a
pmaddwd mm2,mm2 ; DL D
psubw mm1,[edi+eax*2] ; EL b
pmaddwd mm4,mm4 ; 8L c
paddusw mm3,PartSWDForLLBlk ; LL +
pmaddwd mm1,mm1 ; EL c
paddusw mm3,mm0 ; LL +
psrlq mm5,48 ; mm5 is SWD for upper right block.
paddusw mm3,mm2 ; LL +
psubusw mm7,mm5 ; Diminish prev best SWD by cand UR blk.
paddusw mm3,mm4 ; LL +
movq mm0,mm7
paddusw mm3,mm1 ; LL +
psrlq mm7,32 ; Get original Best SWD
punpckldq mm1,mm3
pxor mm2,mm2
paddusw mm1,mm3
psrlq mm1,48
punpckldq mm5,mm1 ; mm5[32:63] SWD for LL. mm5[0:31] SWD for UR.
psubusw mm0,mm1
psubusw mm7,mm0 ; BestSWD dim (BestSWD dim CandSWD) --> new best.
pcmpeqd mm2,mm0 ; [0:31] == 0 iff cand better, else -1.
; Registers at this point:
; ebp -- PITCH
; edi -- Target MacroBlock Base Address.
; esi -- Address of upper left block of candidate ref area.
; edx -- MBlockActionStream
; ebx -- CurrSWDState
; mm7 -- New best SWD for macroblock.
; mm6 -- [0:31] SWD for upper left; [32:63] SWD for lower right.
; mm5 -- [0:31] SWD for upper right; [32:63] SWD for lower left.
; mm2 -- [0:31] 0 if cand better, else -1.
cmp ebx,LASTINITIALMESTATE ; Did we just do zero motion vector?
jg MEForNonZeroMVDone
movdf eax,mm7 ; SWD for this candidate.
punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
test ebx,ebx
jns ZeroMVDoneForNonHeuristicME
; Dispatch on which heuristic candidate has just been evaluated.  bl holds a
; negative code: -1 after the zero MV, -2 after the MV of the macroblock to
; the left, and -3 or less for the remaining candidates.
; The mov instructions between each compare and its branch leave the flags
; untouched, so jle tests "cmp bl,-3" and jne tests "inc bl".
HeuristicME_EarlyOut:
movq mm0,EMVLimitsForThisMB ; Speculate no extended motion vectors.
pcmpeqb mm1,mm1 ; <FFFF FFFF FFFF FFFF>
xor ecx,ecx
cmp bl,-3
mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
jle HeuristicME_CaseSigMVDone_or_CaseAboveMVDone
sub eax,NONZEROMVDIFFERENTIAL
inc bl ; ZF set iff bl was -1, i.e. the zero MV was just done.
mov ebx,DoExtendedMotionVectors ; 7 iff doing extended MVs, else 0.
jne HeuristicME_CaseLeftMVDone ; bl was -2: left MV was just done.
HeuristicME_Case0MVDone:
movq SWDULandLR,mm6
pcmpeqb mm4,mm4 ; <FFFF FFFF FFFF FFFF>
movq SWDURandLL,mm5
psllw mm4,15 ; <8000 8000 8000 8000>
cmp eax,ZEROVECTORTHRESHOLD-NONZEROMVDIFFERENTIAL
; ; Compare 0-MV against ZeroVectorThreshold.
jl BelowZeroThresh ; Jump if 0-MV is good enough.
mov SWDForNon0MVToBeat,eax
and ebx,ecx ; Elim flag for bottom row. 0 iff no ExtMV.
mov eax,BlockAbove[4]
je NotExtendedMVs ; Jump if not doing extended MVs?
; Below: A==left; B==above; C==above rt.
movdt mm3,ValidRemoteVectors[ebx*4] ; <mask(A) (C) (B) (A)>
movq mm2,mm4 ; <8000 8000 8000 8000>
IF SIZEOF T_MacroBlockActionDescr-128
**** error: Due to assembler weakness, can't use spaces here, so SIZEOF
**** T_MacroBlockActionDescr is replaced by constant. If assembly error
**** occurs, the constant has been changed, and the three instructions in
**** the next 10 lines have to change.
ENDIF
IF SIZEOF T_Blk-16
**** error: Due to assembler weakness, can't use spaces here, so SIZEOF T_Blk
**** is replaced by constant. If assembly error occurs, the constant has been
**** changed, and the three instructions in the next 10 lines have to change.
ENDIF
movdt mm0,[edx-128].BestFullPelMBMVs ; <x x Av,h x >
punpcklbw mm3,mm3 ; mask for both MV parts
movdt mm1,[edx+eax-2*16+128].BestFullPelMBMVs ; <x x Cv,h x >
psrlw mm2,8 ; <0080 0080 0080 0080>
por mm4,mm2 ; <8080 ...> bias value.
punpcklwd mm1,mm0 ; <Av,h Cv,h x x >
punpcklwd mm0,[edx+eax-2*16].BestFullPelMBMVs ; <Bv,h Av,h x x >
;
punpckhdq mm0,mm1 ; <Av,h Cv,h Bv,h Av,h>
;
pand mm0,mm3 ; Set to 0 any off edge.
and ebx,4 ; If zero, we're on the top edge.
paddb mm0,mm4 ; <Av,h Cv,h Bv,h Av,h> biased
je @f ; If on top edge, cause LEFT to be taken.
movq mm1,mm0 ; <Av,h Cv,h Bv,h Av,h>
psrlq mm0,16 ; <x Av,h Cv,h Bv,h>
psubusb mm0,mm1 ; <x floor(A-C) floor(C-B) floor(B-A)>
;
paddb mm0,mm1 ; <x max(A,C) max(C,B) max(B,A)>
;
movq mm1,mm0 ; <x max(A,C) max(C,B) max(B,A)>
psrlq mm0,16 ; <x x max(A,C) max(C,B)>
pxor mm1,mm0 ; Part of median calc.
psrlq mm0,16 ; <x x x max(A,C)>
pxor mm0,mm1 ; <x x x median(A,B,C)> biased by +128.
;
@@:
punpcklbw mm0,mm0 ; 2 copies of median predictor MVs.
pcmpeqb mm1,mm1
punpcklwd mm0,mm0 ; 4 copies. Will now calc the following:
; ; [ 0: 7] -- HMV lower limit for sig search
; ; [ 8:15] -- HMV lower limit
; ; [16:23] -- HMV upper limit for sig search
; ; [24:31] -- HMV upper limit
; ; [32:39] -- VMV lower limit for sig search
; ; [40:47] -- VMV lower limit
; ; [48:55] -- VMV upper limit for sig search
; ; [56:63] -- VMV upper limit
;
psubusb mm0,EMV_ClampLowerEnd[ecx*8-40]
psllw mm1,3 ; <FF F8 FF F8 FF F8 FF F8> i.e. Mask to
; ; set sig srch range to mult of 8.
paddusb mm0,EMV_ClampUpperEnd[ecx*8-40]
psubb mm0,EMV_RestoreRange[ecx*8-40]
; NotExtendedMVs -- reached by falling out of the extended-MV limit calculation
; and by the direct jump taken when extended MVs are not in use. Saves the SWDs
; held in mm5/mm6 as the 0-MV SWDs, stores mm0 (masked by mm1) as
; EMVLimitsForThisMB, then either loads the MV of the MB to the left as the
; first heuristic candidate or, when bit 0 of cl is clear, skips that candidate.
NotExtendedMVs:
movq SWD0MVURandLL,mm5
pand mm0,mm1 ; Set sig search at multiples of four.
movq SWD0MVULandLR,mm6
pcmpeqb mm2,mm2 ; Set cand as worse than 0MV, in case skip.
movq EMVLimitsForThisMB,mm0
and cl,1
je HeuristicME_SkipLeftMV
mov BestOfFourStartingPoints,esi
mov ebx,-2 ; Indicate trying MV of MB to left.
; Fetch the left neighbour's MV, sign extended; falls into the clamp code.
movsx ecx,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBVMV
movsx eax,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBHMV
; ClampHeurMECandidateToRange -- clamp a heuristic candidate MV to the limits
; held in EMVLimitsForThisMB and turn it into a reference address.
; In: ecx -- candidate VMV (sign extended).
; eax -- candidate HMV (sign extended).
; ebx -- negative code naming the candidate being tried (-2, -3 or -4 at
; the jumps to this label that are visible here).
; Out: esi -- Addr0MVRef + linearized clamped MV, then jumps to ComputeMBSWD;
; or jumps to MblkEst_EarlyOut when the clamped linearized MV is zero.
ClampHeurMECandidateToRange:
movsx esi,PB EMVLimitsForThisMB+5 ; VMV lower limit.
cmp ecx,esi
jl ClampVMV_1
movsx esi,PB EMVLimitsForThisMB+7 ; VMV upper limit.
cmp ecx,esi
jle @f
ClampVMV_1:
mov ecx,esi ; esi holds whichever limit was violated.
@@:
movsx esi,PB EMVLimitsForThisMB+1 ; HMV lower limit.
cmp eax,esi
jl ClampHMV_1
movsx esi,PB EMVLimitsForThisMB+3 ; HMV upper limit.
cmp eax,esi
jle @f
ClampHMV_1:
mov eax,esi ; esi holds whichever limit was violated.
@@:
sar eax,1 ; Halve HMV (signed).
lea ecx,[ecx+ecx*2] ; VMV * 3 ...
IF PITCH-384
*** error: The magic here assumes a pitch of 384.
ENDIF
shl ecx,6 ; ... * 64 == VMV * 192 == VMV * PITCH/2.
mov esi,Addr0MVRef
add eax,ecx ; Clamped Linearized Motion Vector
;
sub eax,1 ; Carry set iff linearized MV was zero.
jc MblkEst_EarlyOut ; Jump if Lin MV is zero.
lea esi,[esi+eax+1] ; Candidate reference address.
jmp ComputeMBSWD
; HeuristicME_SkipLeftMV / HeuristicME_CaseLeftMVDone -- the left-MB candidate
; has been skipped or evaluated. eax becomes 0 or -1 from mm2 and is used as an
; index: 0 stores to the named variable, -1 stores to the slot just below it.
; Then, when bit 2 of cl is set, the MV of the MB above is loaded as the next
; candidate; otherwise control goes to HeuristicME_SkipAboveMV.
HeuristicME_SkipLeftMV:
mov BestOfFourStartingPoints,esi
mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
HeuristicME_CaseLeftMVDone:
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
mov ebx,BlockAbove[4]
and cl,4 ; Sets ZF for the je below; nothing between changes flags.
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
movq SWDURandLL[eax*8],mm5
pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip.
mov BestOfFourStartingPoints[eax*4],esi
je HeuristicME_SkipAboveMV
movsx ecx,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBVMV
movsx eax,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBHMV
mov ebx,-3 ; Indicate trying MV of MB above.
jmp ClampHeurMECandidateToRange
; HeuristicME_CaseAboveMVDone -- record the outcome of the above-MB candidate,
; then compute the target macroblock's "signature contributions": per-column
; sums of the even columns (accumulated in mm0/mm1 over all 16 lines) and, for
; each pair of lines, the sum of the odd-column pels (one word per line pair,
; stored at TargetSigContribForRowPairs). The loop runs 4 times, 4 lines each.
; Byte sums are formed with paddb, so each two-line sum wraps modulo 256.
HeuristicME_CaseSigMVDone_or_CaseAboveMVDone:
HeuristicME_SkipAboveMV:
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
; When entered via the je to HeuristicME_SkipAboveMV, ZF is set and this jne
; falls through. NOTE(review): the ..._or_... entry is presumably reached with
; flags set by a dispatcher outside this view -- confirm.
jne HeuristicME_CaseSigMVDone
HeuristicME_CaseAboveMVDone:
mov cl,4 ; Loop counter: 4 groups of 4 lines.
lea ebx,C0001000100010001
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
pxor mm0,mm0
movq SWDURandLL[eax*8],mm5
pxor mm1,mm1
mov BestOfFourStartingPoints[eax*4],esi
lea esi,TargetSigContribForRowPairs
movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
pcmpeqb mm7,mm7 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
; ebp -- Pitch
; edi -- Address of target macroblock.
; esi -- Address at which to store target macroblock's signature contributions.
; ebx -- Address of the constant C0001000100010001 (pmaddwd multiplier).
; cl -- Loop counter.
; mm0 -- Accumulator for target MB's sig contrib for first four even columns.
; mm1 -- Accumulator for target MB's sig contrib for last four even columns.
movq mm2,[edi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
pcmpeqb mm5,mm5 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
paddb mm2,[edi+ebp*1] ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
psrlw mm5,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
@@:
movq mm3,[edi+ebp*2] ; B:<P27 P26 P25 P24 P23 P22 P21 P20>
movq mm4,mm2 ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
paddb mm3,[edi+PITCH*3] ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>
psrlw mm2,8 ; W:<P07+P17 P05+P15 P03+P13 P01+P11>
pmaddwd mm2,[ebx] ; D:<P07+P17+P05+P15 P03+P13+P01+P11>
movq mm7,mm5 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
pand mm5,mm3 ; W:<P26+P36 P24+P34 P22+P32 P20+P30>
psrlw mm3,8 ; W:<P27+P37 P25+P35 P23+P33 P21+P31>
pmaddwd mm3,[ebx] ; D:<P27+P37+P25+P35 P23+P33+P21+P31>
paddw mm0,mm5 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>
movq mm5,[edi+ebp*2+8] ; B:<P2F P2E P2D P2C P2B P2A P29 P28>
pand mm4,mm7 ; W:<P06+P16 P04+P14 P02+P12 P00+P10>
paddb mm5,[edi+PITCH*3+8] ; B:<P2F+P3F P2E+P3E P2D+P3D P2C+P3C ...>
paddw mm0,mm4 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>
movq mm4,[edi+8] ; B:<P0F P0E P0D P0C P0B P0A P09 P08>
movq mm6,mm7 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
paddb mm4,[edi+ebp*1+8] ; B:<P0F+P1F P0E+P1E P0D+P1D P0C+P1C ...>
pand mm7,mm5 ; W:<P2E+P3E P2C+P3C P2A+P3A P28+P38>
pand mm6,mm4 ; W:<P0E+P1E P0C+P1C P0A+P1A P08+P18>
psrlw mm5,8 ; W:<P2F+P3F P2D+P3D P2B+P3B P29+P39>
pmaddwd mm5,[ebx] ; D:<P2F+P3F+P2D+P3D P2B+P3B+P29+P39>
psrlw mm4,8 ; W:<P0F+P1F P0D+P1D P0B+P1B P09+P19>
pmaddwd mm4,[ebx] ; D:<P0F+P1F+P0D+P1D P0B+P1B+P09+P19>
paddw mm1,mm7 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>
paddw mm1,mm6 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>
lea edi,[edi+ebp*4] ; Advance input cursor
paddw mm3,mm5 ; D:<P2F+P3F+P2D+P3D+P27+P37+P25+P35
; ; P2B+P3B+P29+P39+P23+P33+P21+P31>
pcmpeqb mm5,mm5 ; Next W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
paddw mm4,mm2 ; D:<P0F+P1F+P0D+P1D+P07+P17+P05+P15
; ; P0B+P1B+P09+P19+P03+P13+P01+P11>
punpckldq mm7,mm3 ; D:<P2B+P3B+P29+P39+P23+P33+P21+P31 junk>
paddw mm7,mm3 ; [32:47]:<sum of odd pels of lines 2 and 3>
punpckldq mm6,mm4 ; D:<P0B+P1B+P09+P19+P03+P13+P01+P11 junk>
movq mm2,[edi] ; Next B:<P07 P06 P05 P04 P03 P02 P01 P00>
paddw mm6,mm4 ; [32:47]:<sum of odd pels of lines 0 and 1>
paddb mm2,[edi+ebp*1] ; Next B:<P07+P17 P06+P16 P05+P15 ...>
punpckhwd mm6,mm7 ; [0:15] Line_0&1_odd; [16:31] Line_2&3_odd.
mov MBlockActionStream,edx
dec cl
movdf [esi],mm6 ; Save [0:15] Line_0&1_odd, [16:31] Line_2&3_odd.
psrlw mm5,8 ; Next W:<0x00FF 0x00FF 0x00FF 0x00FF>
lea esi,[esi+4] ; Advance output cursor
jne @b ; Flags are from the dec cl above.
; Signature search. Walks a grid of candidate positions between the "sig
; search" limits in EMVLimitsForThisMB, compares each candidate's stored
; signature contributions against the target's (sum of squared differences),
; keeps the smallest, then converts the winning address into a candidate MV
; (ecx = VMV, eax = HMV, ebx = -4) and jumps to ClampHeurMECandidateToRange.
; ebp -- Pitch
; edi -- Address of candidate reference MB's signature contribs.
; esi -- Address at which target MB's signature contribs were stored, plus 16.
; edx -- Scratch.
; ecx -- Count down number of lines of signatures to try.
; ebx -- Increment to get from end of one line of signatures to start of next.
; al -- Count down number of signatures to try in a line.
; ah -- Reinits counter of signatures to try in a line.
; mm0 -- Target MB's sig contrib for first four even columns.
; mm1 -- Target MB's sig contrib for last four even columns.
; mm2 -- Target MB's sig contrib for first four pairs of rows, odd columns.
; mm3 -- Amount and address of best signature seen so far.
IF PITCH-384
*** error: The magic here assumes a pitch of 384.
ENDIF
xor eax,eax
mov ecx,TargetToSig_Debiased
mov al,EMVLimitsForThisMB+4 ; Lower vert lim for sig srch (half pels)
xor ebx,ebx
add edi,ecx
mov bl,EMVLimitsForThisMB+0 ; Lower horz lim for sig srch (half pels)
shr ebx,1
lea ecx,[eax+eax*2]
shl ecx,6 ; Lower vert lim * 192 (== * PITCH/2).
add edi,ebx
add edi,ecx
xor ecx,ecx
add ebx,ebx
mov cl,EMVLimitsForThisMB+6 ; Upper vert lim for sig srch (half pels)
sub ecx,eax
mov al,EMVLimitsForThisMB+2 ; Upper horz lim for sig srch (half pels)
shr ecx,3 ; Number of lines of sigs to do, minus 1.
sub eax,ebx
shr eax,3 ; Number of columns of sigs to do.
lea ebx,[ebp-1+080000000H] ; Top bit is a marker; it is shifted out by the
; ; *4 scaling in the lea inside the loop and
; ; picked up by the sar edx,31 there.
sub ebx,eax ; 1/4th amt to add to move to next line.
mov ah,al
inc ah ; To reinit cntr for line.
movq mm2,[esi-16]
pcmpeqd mm3,mm3 ; Set winning signature artificially high.
movdt mm4,[edi]
psrld mm3,2
punpckldq mm4,[edi+4] ; ref sig contribs of left even cols.
TryNextSignature:
movdt mm5,[edi+8]
psubw mm4,mm0 ; diffs for sums of left even columns.
punpckldq mm5,[edi+12] ; ref sig contribs of right even cols.
pmaddwd mm4,mm4 ; Squared differences.
movdt mm6,[edi+ebp*2] ; Sums for first two pairs of rows.
psubw mm5,mm1 ; diffs for sums of right even columns.
punpckldq mm6,[edi+PITCH*6] ; Sums for second two pairs of rows.
pmaddwd mm5,mm5 ; Squared differences.
movdt mm7,[edi+PITCH*10] ; Sums for third two pairs of rows.
psubw mm6,mm2 ; Words: diffs for sums of first 4 pairs rows.
punpckldq mm7,[edi+PITCH*14] ; Sums for last two pairs of rows.
pmaddwd mm6,mm6 ; Squared differences.
psubw mm7,[esi-8] ; Words: diffs for sums of last 4 pairs rows.
paddd mm4,mm5 ; Accumulate squared differences.
sub al,1 ; Decrement count of signatures left in this line.
pmaddwd mm7,mm7 ; Squared differences.
sbb edx,edx ; -1 if done with line, else 0.
paddd mm6,mm4 ; Accumulate squared differences.
and edx,ebx ; 1/4 Amt to add to goto next line, else 0.
paddd mm7,mm6 ; Accumulate squared differences.
movdt mm5,edi ; Address of this signature
punpckldq mm6,mm7 ; <low_order_accumulator junk>
paddd mm7,mm6 ; <full_signature_amt junk>
psllq mm5,32 ; <Addr_of_this_signature 0>
lea edi,[edi+edx*4+4] ; advance signature position to next cand.
punpckhdq mm5,mm7 ; <cand_signature_amt cand_signature_addr>
sar edx,31 ; -1 if done with line, else 0.
pcmpgtd mm7,mm3 ; <0xFFFFFFFF if cand not better junk>
movdt mm4,[edi]
punpckhdq mm7,mm7 ; <0xFFFFFFFFFFFFFFFF if cand not better>
punpckldq mm4,[edi+4]
pand mm3,mm7 ; 1st_best if cand not better, else 0.
and dl,ah ; Num cols in a line if done with line, else 0.
pandn mm7,mm5 ; cand if better than 1st_best, else 0.
add al,dl ; Reinit col count if finishing with line.
por mm3,mm7 ; Better of cand and 1st_best.
sbb ecx,0 ; Decrement line count if just finished line.
jge TryNextSignature
movdf ecx,mm3 ; Fetch address of best signature.
pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip.
mov edi,TargetMacroBlockBaseAddr
mov ebx,-4 ; Indicate trying MV of best signature.
sub ecx,edi
mov eax,SigToTarget
movdt mm7,BestMBFullPelSWD ; Reload SWD for best full pel MB MV.
lea esi,[ecx+eax] ; Linearized motion vector
add eax,ecx ; Linearized motion vector
sar esi,8 ; Full pel vert lin offset div 256.
mov edx,MBlockActionStream ; Reload pointer to MBA descriptor.
shl eax,25 ; Keep low 7 bits of linearized MV ...
punpckldq mm7,mm7
movsx ecx,UnlinearizedVertMV[esi] ; Get full pel vert MV component.
sar eax,24 ; Full pel HMV.
jmp ClampHeurMECandidateToRange
; HeuristicME_CaseSigMVDone -- all heuristic candidates have been tried.
; Records the outcome of the last candidate (eax = 0 iff it was better, else
; -1, used as an index so that a losing candidate is stored one slot below the
; named variable), then picks the centre for the follow-on search from
; BestOfFourStartingPoints:
; * If its MV lies within [lower limit + 4, upper limit - 4] on both axes,
; the search starts at that address plus PITCH+1 in state
; FIRST_HEURISTIC_EXHAUSTIVE.
; * Otherwise the offending component is pulled in to lower limit + 4 or
; upper limit - 4, the reference address is rebuilt from Addr0MVRef, mm7 is
; reset to the "huge" SWD in mm0, and the state is
; FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR.
; The HMV is a signed byte in half-pel units, so it must be SIGN extended
; before being halved and added into the linearized MV. The previous code zero
; extended it, which for a negative HMV produced an offset 128 bytes too high
; (the vertical component beside it was already sign extended with movsx).
HeuristicME_CaseSigMVDone:
HeuristicME_SkipSigMV:
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
pcmpeqd mm0,mm0 ; Init previous best SWD to huge.
mov ecx,Addr0MVRef ; Start to calc linearized MV.
mov bh,EMVLimitsForThisMB+1 ; HMV lower limit.
mov BestOfFourStartingPoints[eax*4],esi
add bh,4
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
psrlq mm0,2
movq SWDURandLL[eax*8],mm5
psrld mm0,14
mov eax,BestOfFourStartingPoints
mov bl,EMVLimitsForThisMB+5 ; VMV lower limit.
mov esi,eax
sub eax,ecx ; Linearized motion vector
mov ecx,eax ; Linearized motion vector
add al,al ; Full pel HMV.
cmp al,bh
jl ClampHMV_2
mov bh,EMVLimitsForThisMB+3 ; HMV upper limit
sub bh,4
cmp al,bh
jle NoClampHMV_2
ClampHMV_2:
; HMV out of range: bh holds the limit (lower+4 or upper-4) to clamp to.
sar ecx,8 ; Full pel vert lin offset div 256.
add bl,4
movsx eax,bh ; Clamped HMV, sign extended (may be negative).
movsx ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
cmp cl,bl
jl @f
mov bl,EMVLimitsForThisMB+7 ; VMV upper limit.
movq mm7,mm0
sub bl,4
cmp cl,bl
jle NoClampVMV_2
@@:
movsx ecx,bl ; Clamped VMV.
movq mm7,mm0
NoClampVMV_2:
sar eax,1 ; Halve HMV (signed).
lea ecx,[ecx+ecx*2]
shl ecx,6 ; VMV * 192.
mov ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
mov esi,Addr0MVRef
add eax,ecx ; Linearized motion vector.
add esi,eax
jmp ComputeMBSWD
NoClampHMV_2:
; HMV is in range (still in al); check VMV against lower+4 / upper-4.
sar ecx,8 ; Full pel vert lin offset div 256.
add bl,4
mov ah,bl
movsx ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
cmp cl,ah
jl @f
mov ah,EMVLimitsForThisMB+7 ; VMV upper limit.
lea esi,[esi+ebp+1]
sub ah,4
mov ebx,FIRST_HEURISTIC_EXHAUSTIVE ; New state number.
cmp cl,ah
jle ComputeMBSWD
@@:
movsx ecx,ah ; Clamped VMV.
movsx eax,al ; Unclamped HMV, sign extended (may be negative).
sar eax,1 ; Halve HMV (signed).
lea ecx,[ecx+ecx*2]
shl ecx,6 ; VMV * 192.
mov ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
mov esi,Addr0MVRef
add eax,ecx ; Linearized motion vector.
add esi,eax
movq mm7,mm0
jmp ComputeMBSWD
; ZeroMVDoneForNonHeuristicME -- the 0-MV candidate has been evaluated on the
; non-heuristic path. Saves its block SWDs; if its SWD (eax) is below
; ZEROVECTORTHRESHOLD the 0-MV is accepted. Otherwise records the SWD that a
; non-zero MV must beat, looks up the first MV adjustment and next state from
; StateEngineFirstRule (indexed by ebx), and starts the search.
ZeroMVDoneForNonHeuristicME:
movq SWDULandLR,mm6
movq SWDURandLL,mm5
cmp eax,ZEROVECTORTHRESHOLD ; Compare 0-MV against ZeroVectorThreshold.
jl BelowZeroThresh ; Jump if 0-MV is good enough.
xor ecx,ecx
sub eax,NONZEROMVDIFFERENTIAL
mov cl,StateEngineFirstRule[ebx] ; MV adjustment.
mov bl,StateEngineFirstRule[ebx+10] ; New state number.
shl ecx,11 ; Scale the table entry by 2048.
mov SWDForNon0MVToBeat,eax
movq SWD0MVULandLR,mm6
movq SWD0MVURandLL,mm5
lea esi,[esi+ecx-PITCH*8] ; Ref addr + scaled adjustment - 8 lines.
jmp ComputeMBSWD
; MEForNonZeroMVDone / MblkEst_EarlyOut -- one step of the full-pel MB search
; state machine. eax (0 iff the candidate was better, else -1) and ebx (the
; current state) index the StateEngine table to fetch the MV adjustment and the
; next state. A negative ebx means a heuristic candidate was being tried and
; control goes to HeuristicME_EarlyOut. While the new state is below 240 the
; search continues at ComputeMBSWD; otherwise the full-pel MV is decoded from
; the reference address and (non-H261 builds) half-pel ME is started unless it
; is disabled or the MV sits on a limit in EMVLimitsForThisMB.
MEForNonZeroMVDone:
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
MblkEst_EarlyOut:
xor ecx,ecx
test ebx,ebx ; Sign flag is consumed by the js below.
movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
pcmpeqb mm2,mm2 ; Set cand as worse than 0MV.
mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.
js HeuristicME_EarlyOut
add esi,ecx ; Adjust ref addr for horz motion.
mov bl,StateEngine[eax+ebx*4+3] ; 0:239 -> New state number;
; ; 240:255 -> flags which 1/2 pel to do.
shr ecx,4
punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
movq SWDURandLL[eax*8],mm5
pxor mm6,mm6 ; Speculatively zero to prep for half pel ME.
add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
cmp bl,240 ; Terminal state?
jb ComputeMBSWD
mov eax,esi
mov ecx,Addr0MVRef ; Start to calc linearized MV.
sub eax,ecx ; Linearized Motion Vector
;
mov ecx,eax
;
sar eax,8 ; Full pel vert lin offset div 256.
and cl,07FH ; Full pel HMV
add cl,cl ; Double it: cl is now HMV in half-pel units.
;
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
IFDEF H261
ELSE
mov eax,DoHalfPelME ; 0 if not, -4 if so.
test eax,eax
je SkipHalfPelMBME
cmp cl,EMVLimitsForThisMB+1 ; Skip half pel ME if at edge of range
jle SkipHalfPelMBME
cmp cl,EMVLimitsForThisMB+3
jge SkipHalfPelMBME
cmp ch,EMVLimitsForThisMB+5
jle SkipHalfPelMBME
cmp ch,EMVLimitsForThisMB+7
jge SkipHalfPelMBME
; Registers:
; ebp -- PITCH
; esi -- Address of best full pel reference macroblock
; edx -- MBlockActionStream
; ecx -- Nothing presently.
; edi -- Address of target macroblock.
; ebx -- 240 + Flags to indicate which half pel ME to do:
; 1 --> right; 2 --> left; 4 --> down; 8 --> up
; eax -- Count from -4 to -1 for blocks of macroblock.
; mm0:mm7 -- Scratch
movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
pxor mm7,mm7 ; Prep accumulator for half pel ME.
call HalfPelMotionEstimation
movdt mm7,InvalidateBadHalfPelMVs[eax*4] ; Need to inflate SWDs for
; ; MVs that go off frame edge.
mov eax,esi
mov ebx,Addr0MVRef ; Start to calc linearized MV.
sub eax,ebx ; Linearized Motion Vector
punpcklbw mm7,mm7 ; Expand adjustment to words.
mov ecx,eax ; Linearized Motion Vector
paddusw mm7,mm3 ; Now have SWDs for half pel MBME.
sar eax,8 ; Full pel vert lin offset div 256.
and cl,07FH ; Full pel HMV
add cl,cl
movq mm6,mm7
mov [edx].BestFullPelMBHMV,cl ; Save HMV
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
movdf eax,mm7 ; eax[ 0:15] -- SWD for rightward ref.
; ; eax[16:31] -- SWD for leftward ref.
psrlq mm6,32 ; mm6[0:31] now holds the down/up SWD pair.
mov [edx].BestFullPelMBVMV,ch ; Save VMV
mov ebx,eax
shr eax,16 ; eax -- SWD for leftward ref.
and ebx,00000FFFFH ; ebx -- SWD for rightward ref.
cmp eax,ebx
jg MBME_RightBetterThanLeft
; MBME_LeftBetterThanRight -- half-pel decision tree for the macroblock when
; the leftward half-pel SWD (eax) is no worse than the rightward one (ebx).
; In: eax -- SWD for leftward ref. mm6[0:31] -- down/up SWD pair.
; cl/ch -- HMV/VMV of the best full-pel MV. esi -- its ref address.
; Out (at MBME_HalfPelSearchDone): ebx -- winning SWD, eax -- winning ref
; address, cl/ch -- winning MV.
; If left also beats the full-pel SWD, the better vertical neighbour is
; compared too and, if that beats full pel as well, the diagonal is evaluated
; with HalfPelMotionEstimationBothWays. NOTE(review): that routine is outside
; this view; the code below reads its result from ebx -- confirm.
MBME_LeftBetterThanRight:
cmp eax,BestMBFullPelSWD
jge MBME_CtrIsBestHMV
MBME_LeftBestHMV:
movdf ebx,mm6 ; ebx[ 0:15] -- SWD for downward ref.
; ; ebx[16:31] -- SWD for upward ref.
mov BestHalfPelHorzSWD,eax
mov eax,ebx
shr eax,16 ; eax -- SWD for upward ref.
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
cmp eax,ebx
jg MBME_LeftBestHMV_DownBetterThanUp
MBME_LeftBestHMV_UpBetterThanDown:
cmp eax,BestMBFullPelSWD
jge MBME_LeftIsBest
MBME_LeftBestHMV_UpBestVMV:
sub esi,PITCH+1 ; Try ref 1/2 pel left and up
mov BestHalfPelVertSWD,eax ; Saved because al is overwritten next.
mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD
lea esi,[esi+ebp*1+1] ; Back to center.
cmp eax,ebx
jle MBME_UpBetterThanUpLeft
MBME_UpLeftBetterThanUp:
cmp ebx,BestHalfPelHorzSWD
jge MBME_LeftIsBest
MBME_UpLeftIsBest:
dec cl ; Back up the horz MV one to the left.
lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up
dec ch ; Back up the vert MV one up.
jmp MBME_HalfPelSearchDone
MBME_UpBetterThanUpLeft:
cmp eax,BestHalfPelHorzSWD
jg MBME_LeftIsBest
MBME_UpIsBest:
mov ebx,eax
dec ch ; Back up the vert MV one up.
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
jmp MBME_HalfPelSearchDone
MBME_LeftBestHMV_DownBetterThanUp:
cmp ebx,BestMBFullPelSWD
jge MBME_LeftIsBest
MBME_LeftBestHMV_DownBestVMV:
dec esi ; Try ref 1/2 pel left and down
mov BestHalfPelVertSWD,ebx
mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD
inc esi ; Back to center.
cmp eax,ebx
jle MBME_DownBetterThanDownLeft
MBME_DownLeftBetterThanDown:
cmp ebx,BestHalfPelHorzSWD
jge MBME_LeftIsBest
MBME_DownLeftIsBest:
dec cl ; Back up the horz MV one to the left.
lea eax,[esi-1] ; Best is ref 1/2 pel left and down
inc ch ; Advance the vert MV one down.
jmp MBME_HalfPelSearchDone
MBME_DownBetterThanDownLeft:
cmp eax,BestHalfPelHorzSWD
jle MBME_DownIsBest
MBME_LeftIsBest:
dec cl ; Back up the horz MV one to the left.
lea eax,[esi-1] ; Best is ref 1/2 pel left.
mov ebx,BestHalfPelHorzSWD
jmp MBME_HalfPelSearchDone
; MBME_RightBetterThanLeft -- mirror of the leftward tree for the case where
; the rightward half-pel SWD (ebx) is strictly better than the leftward one.
; In: ebx -- SWD for rightward ref. mm6[0:31] -- down/up SWD pair.
; cl/ch -- HMV/VMV of the best full-pel MV. esi -- its ref address.
; Out (at MBME_HalfPelSearchDone): ebx -- winning SWD, eax -- winning ref
; address, cl/ch -- winning MV.
; For the rightward and downward half-pel positions the reference address is
; left at esi (only the MV component is advanced); leftward uses esi-1 and
; upward uses esi-PITCH.
MBME_RightBetterThanLeft:
cmp ebx,BestMBFullPelSWD
jge MBME_CtrIsBestHMV
MBME_RightBestHMV:
movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref.
; ; eax[16:31] -- SWD for upward ref.
mov BestHalfPelHorzSWD,ebx
mov ebx,eax
shr eax,16 ; eax -- SWD for upward ref.
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
cmp eax,ebx
jg MBME_RightBestHMV_DownBetterThanUp
MBME_RightBestHMV_UpBetterThanDown:
cmp eax,BestMBFullPelSWD
jge MBME_RightIsBest
MBME_RightBestHMV_UpBestVMV:
sub esi,ebp ; Try ref 1/2 pel right and up
mov BestHalfPelVertSWD,eax ; Saved because al is overwritten next.
mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD
lea esi,[esi+ebp*1] ; Back to center.
cmp eax,ebx
jle MBME_UpBetterThanUpRight
MBME_UpRightBetterThanUp:
cmp ebx,BestHalfPelHorzSWD
jge MBME_RightIsBest
MBME_UpRightIsBest:
inc cl ; Advance the horz MV one to right.
lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up
dec ch ; Back up the vert MV one up.
jmp MBME_HalfPelSearchDone
MBME_UpBetterThanUpRight:
cmp eax,BestHalfPelHorzSWD
jle MBME_UpIsBest
MBME_RightIsBest:
mov ebx,BestHalfPelHorzSWD
inc cl ; Advance the horz MV one to right.
mov eax,esi
jmp MBME_HalfPelSearchDone
MBME_RightBestHMV_DownBetterThanUp:
cmp ebx,BestMBFullPelSWD
jge MBME_RightIsBest
MBME_RightBestHMV_DownBestVMV:
; Right and down: the diagonal is evaluated at esi itself, no adjustment.
mov BestHalfPelVertSWD,ebx
mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD
cmp eax,ebx
jle MBME_DownBetterThanDownRight
MBME_DownRightBetterThanDown:
cmp ebx,BestHalfPelHorzSWD
jge MBME_RightIsBest
MBME_DownRightIsBest:
inc cl ; Advance the horz MV one to right.
mov eax,esi
inc ch ; Advance vert MV one down.
jmp MBME_HalfPelSearchDone
MBME_DownBetterThanDownRight:
cmp eax,BestHalfPelHorzSWD
jg MBME_RightIsBest
MBME_DownIsBest:
mov ebx,eax
inc ch ; Advance vert MV one down.
mov eax,esi
jmp MBME_HalfPelSearchDone
; MBME_CtrIsBestHMV -- neither horizontal half-pel neighbour beat the full-pel
; SWD, so only the vertical neighbours remain. The better of up/down is chosen
; (down wins ties) and kept only if it beats BestMBFullPelSWD; otherwise the
; full-pel centre stands, with ebx = BestMBFullPelSWD on arrival at
; MBME_CenterIsBest.
MBME_CtrIsBestHMV:
movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref.
; ; eax[16:31] -- SWD for upward ref.
mov ebx,eax
shr eax,16 ; eax -- SWD for upward ref.
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
cmp eax,ebx
jge MBME_CtrBestHMV_DownBetterThanUp
MBME_CtrBestHMV_UpBetterThanDown:
mov ebx,BestMBFullPelSWD
cmp eax,ebx
jge MBME_CenterIsBest
; Up is best.
mov ebx,eax
dec ch ; Back up the vert MV one up.
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
jmp MBME_HalfPelSearchDone
MBME_CtrBestHMV_DownBetterThanUp:
mov eax,ebx
mov ebx,BestMBFullPelSWD
cmp eax,ebx
jge MBME_CenterIsBest
; Down is best.
mov ebx,eax
inc ch ; Advance the vert MV one down.
mov eax,esi
jmp MBME_HalfPelSearchDone
ENDIF
; SkipHalfPelMBME / MBME_CenterIsBest / MBME_HalfPelSearchDone -- record the
; macroblock-level result (ebx = SWD, eax = ref address, cl/ch = HMV/VMV).
; In non-H261 builds, then decide whether to search for per-block MVs. Block ME
; is skipped (jump to NoBlockMotionVectors) when DoBlockLevelVectors is not 1,
; when bit 7 of CodedBlocks is set, when the full-pel MB MV is within 2 of any
; limit in EMVLimitsForThisMB, or when the MB SWD is at or below
; BLOCKMOTIONTHRESHOLD. Otherwise the per-block half-pel SWDs are derived from
; HalfPelMBMESWDAccum, the SWD budget LimitForSWDForBlkMV is set up, and
; control falls into the block SWD computation for block 1.
SkipHalfPelMBME:
mov [edx].BestFullPelMBHMV,cl ; Save HMV
movdf ebx,mm7 ; SWD for best full pel MB MV.
mov [edx].BestFullPelMBVMV,ch ; Save VMV
MBME_CenterIsBest:
mov eax,esi
MBME_HalfPelSearchDone:
mov BestMBHalfPelSWD,ebx
mov BestMBHalfPelMV,cl ; Save HMV
mov BestMBHalfPelRefAddr,eax
mov BestMBHalfPelMV+1,ch ; Save VMV
IFDEF H261
ELSE ; H263
mov bl,EMVLimitsForThisMB+1 ; Lower limit comparison.
mov al,DoBlockLevelVectors ; Are we doing block level MVs?
dec al
jne NoBlockMotionVectors
mov cl,[edx].CodedBlocks ; Fetch coded block pattern.
add bl,2
and cl,080H
jne NoBlockMotionVectors ; Skip Block ME if forced intra.
mov al,[edx].BestFullPelMBHMV ; Compare full pel HMV against limits.
mov cl,EMVLimitsForThisMB+3
cmp al,bl
jl NoBlockMotionVectors
mov bl,EMVLimitsForThisMB+5
sub cl,2
cmp al,cl ; Upper limit comparison.
jg NoBlockMotionVectors
mov al,[edx].BestFullPelMBVMV ; Compare full pel VMV against limits.
add bl,2
mov cl,EMVLimitsForThisMB+7
cmp al,bl
mov ebx,PD [edx].BestFullPelMBVMV-3 ; VMV lands in ebx[24:31]; flags kept.
jl NoBlockMotionVectors
sar ebx,18 ; VMV (sign extended) now starts at bit 6.
sub cl,2
cmp al,cl ; Upper limit comparison.
jg NoBlockMotionVectors
mov ecx,BestMBHalfPelSWD ; Jump if SWD for MB MV < thresh.
IF PITCH-384
*** error: The magic here assumes a pitch of 384.
ENDIF
and ebx,0FFFFFF80H ; VMV*128
cmp ecx,BLOCKMOTIONTHRESHOLD
jle NoBlockMotionVectors
;==========================================================================
; Starting from the best full pel macroblock motion vector calculated above, we
; search for the best block motion vectors.
;
; ebp -- PITCH
; esi -- Address of ref block.
; edi -- Address of target block.
; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
; ecx -- Scratch
; ebx -- CurrSWDState
; eax -- Scratch
; mm7 -- Best SWD for current block
; mm6 -- unused.
; mm5 -- Best SWD for right block of pair worked on by inner loop.
; mm0-mm4 Scratch
;
; Split the accumulated half-pel SWDs into per-block values by differencing
; successive accumulator snapshots (saturating subtracts).
movq mm0,HalfPelMBMESWDAccum+8
movq mm1,HalfPelMBMESWDAccum+16
psubusw mm7,mm0
movq mm2,HalfPelMBMESWDAccum+0
psubusw mm0,mm1
movq [edx].BlkY4.BlkLvlSWD+16,mm7
psubusw mm1,mm2
movq [edx].BlkY2.BlkLvlSWD+16,mm0
movq [edx].BlkY3.BlkLvlSWD+16,mm1
movq [edx].BlkY1.BlkLvlSWD+16,mm2
movsx eax,[edx].BestFullPelMBHMV
sar eax,1
lea ebx,[ebx+ebx*2] ; (VMV*128)*3 == vertical offset at PITCH 384.
mov esi,Addr0MVRef
add ebx,ebp ; One more line down.
mov Addr0MVRefBlk,esi
add esi,eax
lea ecx,[ecx+ecx*2] ; Best MBMV SWD times 3.
add esi,ebx ; Try V+1 first
shr ecx,2 ; Best MBMV SWD * 3/4.
mov eax,SWDForNon0MVToBeat
mov BestBlockRefAddrVP1,esi ; Stash BestBlockRefAddr
sub ecx,BLOCKMVDIFFERENTIAL ; Best MBMV SWD * 3/4 - Differential.
lea eax,[eax+eax*2-BLOCKMVDIFFERENTIAL*4] ; Non0MBMVSWDToBeat*3-4*Diff.
mov LimitForSWDForBlkMV,ecx
shr eax,2 ; Non0MBMVSWDToBeat * 3/4 - Differential.
mov ebx,FIRSTBLOCKMESTATE
cmp eax,ecx
jg @f
mov LimitForSWDForBlkMV,eax ; Keep the smaller of the two budgets.
mov ecx,eax
@@:
movdt mm5,SWDURandLL ; Get SWD for best MB level full pel MVs, blk 2.
test ecx,ecx
jle NoBlockMotionVectors ; No budget left: block MVs cannot win.
movdt mm7,SWDULandLR ; Get SWD for best MB level full pel MVs, blk 1.
movdf SWDForBlock2Or4,mm5
;============================================================================
; Compute SWD for block.
; Accumulates, over the 8 lines of the block, sums of products of the packed
; word differences between reference (esi) and target (edi). For lines 1, 3,
; 5 and 7 the differences are shifted left 8 first; lines 0, 2, 4 and 6 are
; used unshifted. The result is compared with the best SWD so far in mm7, and
; the StateEngine table selects the next candidate offset and state.
DoBlkMEForNextBlk:
ComputeBlkSWD:
movq mm0,[esi+ebp*1]
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
movq mm1,[esi+PITCH*3] ; Ref MB, upper left block, Line 3.
psllw mm0,8 ; Extract diffs for line 1 even pels.
psubw mm1,[edi+PITCH*3] ; Diff for line 3.
pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
movq mm2,[esi+PITCH*5]
psllw mm1,8
psubw mm2,[edi+PITCH*5]
pmaddwd mm1,mm1
movq mm3,[esi+PITCH*7]
psllw mm2,8
psubw mm3,[edi+PITCH*7]
pmaddwd mm2,mm2
movq mm4,[esi] ; Ref MB, upper left blk, Line 0.
psllw mm3,8
psubw mm4,[edi] ; Diff for line 0.
paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3).
movq mm1,[esi+ebp*2]
pmaddwd mm3,mm3
psubw mm1,[edi+ebp*2]
paddusw mm0,mm2
movq mm2,[esi+ebp*4]
pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
psubw mm2,[edi+ebp*4]
paddusw mm0,mm3
movq mm3,[esi+PITCH*6]
pmaddwd mm1,mm1
psubw mm3,[edi+PITCH*6]
pmaddwd mm2,mm2
paddusw mm0,mm4
pmaddwd mm3,mm3
paddusw mm0,mm1
;
paddusw mm0,mm2
;
paddusw mm0,mm3
;
punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
movq mm4,mm7 ; Get original Best SWD for block
paddusw mm1,mm0 ; mm1[48:63] is SWD for block.
pxor mm2,mm2
psrlq mm1,48 ; mm1 is SWD for block.
;
psubusw mm4,mm1
xor ecx,ecx
pcmpeqd mm2,mm4 ; mm2[0:31] == 0 iff cand better, else -1.
psubusw mm7,mm4 ; BestSWD dim (BestSWD dim CandSWD) --> new best.
;
;
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
;
; Registers at this point:
; ebp -- PITCH
; esi -- Address of block of candidate ref area.
; edi -- Address of target block (unchanged by the code above).
; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
; ecx -- Scratch
; ebx -- CurrSWDState.
; eax -- 0 iff candidate SWD better, else -1.
; mm7 -- New best SWD for current block
; mm6 -- Unused.
movq [edx].BlkY1.BlkLvlSWD,mm7 ; Save best blk level SWD.
pxor mm6,mm6 ; Spec zero to prep for half pel ME.
mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.
mov bl,StateEngine[eax+ebx*4+3] ; New state number; 255 means done.
add esi,ecx ; Adjust ref addr for horz motion.
mov eax,DoHalfPelME ; 0 if not, -4 if so.
shr ecx,4
cmp bl,240 ; Terminal state?
jae @f
add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
jmp ComputeBlkSWD
@@:
add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
add eax,4 ; Zero (ZF set) iff half pel ME is enabled.
mov ecx,esi
jne SkipHalfPelBlkME
; Half-pel ME entry for one block. If the block's best full-pel reference,
; moved down one line, equals BestBlockRefAddrVP1, the half-pel SWDs already
; stored at [edx].BlkY1.BlkLvlSWD+16 are reused. Otherwise
; HalfPelMotionEstimation is called for this block. Either way the full-pel MV
; is then decoded into cl/ch and the left/right half-pel SWDs are compared.
; NOTE(review): BestBlockRefAddrVP1 appears to be rewritten only when moving
; on to blk 3, so for blks 2 and 4 the compare is against the previous block's
; address -- confirm whether the shortcut is meant for blks 1 and 3 only.
; Registers:
; ebp -- PITCH
; esi -- Address of best full pel reference macroblock
; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
; ecx -- Copy of esi.
; edi -- Address of target block.
; ebx -- Scratch
; eax -- Set to 0 to cause HalfPelMotionEstimation to quit after one block.
; mm0:mm7 -- Scratch
mov ebx,BestBlockRefAddrVP1
add ecx,ebp ; ecx = esi + PITCH; low 7 bits unchanged as PITCH = 3*128.
cmp ebx,ecx
jne FullPelBlkMEMovedFromCenter
movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
movq mm3,[edx].BlkY1.BlkLvlSWD+16 ; SWDs: H+1, H-1, V+1, V-1.
jmp FullPelBlkMEDidNotMoveFromCenter
FullPelBlkMEMovedFromCenter:
movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
pxor mm7,mm7 ; Prep accumulator for half pel ME.
call HalfPelMotionEstimation
lea esi,[esi+ebp*8+8] ; Fix reference pointer.
lea edi,[edi+ebp*8+8] ; Fix target pointer.
FullPelBlkMEDidNotMoveFromCenter:
mov eax,esi
mov ebx,Addr0MVRefBlk ; Start to calc linearized MV.
sub ecx,ebx ; Linearized Motion Vector
sub eax,ebx ; Linearized Motion Vector
sar eax,8 ; Full pel vert lin offset div 256.
and cl,07FH ; Full pel HMV
movdf ebx,mm3 ; ebx[ 0:15] -- SWD for rightward ref.
; ; ebx[16:31] -- SWD for leftward ref.
psrlq mm3,32 ; mm3[0:31] now holds the down/up SWD pair.
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
mov eax,ebx
shr eax,16 ; eax -- SWD for leftward ref.
and ebx,00000FFFFH ; ebx -- SWD for rightward ref.
cmp eax,ebx
jg BlkME_RightBetterThanLeft
; BlkME_LeftBetterThanRight -- block-level half-pel decision tree for the case
; where the leftward SWD (eax) is no worse than the rightward one.
; In: eax -- SWD for leftward ref. mm3[0:31] -- down/up SWD pair.
; cl -- full-pel HMV, not yet doubled. ch -- VMV. esi -- ref address.
; Out (at BlkME_HalfPelSearchDone): ebx -- winning SWD, eax -- winning ref
; address, cl/ch -- winning MV.
; After each call to HalfPelMotionEstimationBothWays, edi and esi are advanced
; by 8 lines + 8 pels, the same fix-up applied after HalfPelMotionEstimation.
; NOTE(review): the routine is outside this view; its result is read from ebx.
BlkME_LeftBetterThanRight:
add cl,cl ; HMV to half-pel units.
mov ebx,BestBlkFullPelSWD
cmp eax,ebx
jge BlkME_CtrIsBestHMV
BlkME_LeftBestHMV:
movdf ebx,mm3 ; ebx[ 0:15] -- SWD for downward ref.
; ; ebx[16:31] -- SWD for upward ref.
mov BestHalfPelHorzSWD,eax
mov eax,ebx
shr eax,16 ; eax -- SWD for upward ref.
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
cmp eax,ebx
jg BlkME_LeftBestHMV_DownBetterThanUp
BlkME_LeftBestHMV_UpBetterThanDown:
cmp eax,BestBlkFullPelSWD
jge BlkME_LeftIsBest
BlkME_LeftBestHMV_UpBestVMV:
sub esi,PITCH+1 ; Try ref 1/2 pel left and up
mov BestHalfPelVertSWD,eax ; Saved because al is overwritten next.
mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8]
mov eax,BestHalfPelVertSWD
lea esi,[esi+PITCH*9+9] ; Back to center.
cmp eax,ebx
jle BlkME_UpBetterThanUpLeft
BlkME_UpLeftBetterThanUp:
cmp ebx,BestHalfPelHorzSWD
jge BlkME_LeftIsBest
BlkME_UpLeftIsBest:
dec cl ; Back up the horz MV one to the left.
lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up
dec ch ; Back up the vert MV one up.
jmp BlkME_HalfPelSearchDone
BlkME_UpBetterThanUpLeft:
cmp eax,BestHalfPelHorzSWD
jg BlkME_LeftIsBest
BlkME_UpIsBest:
dec ch ; Back up the vert MV one up.
mov ebx,eax
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
jmp BlkME_HalfPelSearchDone
BlkME_LeftBestHMV_DownBetterThanUp:
cmp ebx,BestBlkFullPelSWD
jge BlkME_LeftIsBest
BlkME_LeftBestHMV_DownBestVMV:
dec esi ; Try ref 1/2 pel left and down
mov BestHalfPelVertSWD,ebx
mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8]
mov eax,BestHalfPelVertSWD
lea esi,[esi+ebp*8+9] ; Back to center.
cmp eax,ebx
jle BlkME_DownBetterThanDownLeft
BlkME_DownLeftBetterThanDown:
cmp ebx,BestHalfPelHorzSWD
jge BlkME_LeftIsBest
BlkME_DownLeftIsBest:
dec cl ; Back up the horz MV one to the left.
lea eax,[esi-1] ; Best is ref 1/2 pel left and down
inc ch ; Advance the vert MV one down.
jmp BlkME_HalfPelSearchDone
BlkME_DownBetterThanDownLeft:
cmp eax,BestHalfPelHorzSWD
jle BlkME_DownIsBest
BlkME_LeftIsBest:
dec cl ; Back up the horz MV one to the left.
lea eax,[esi-1] ; Best is ref 1/2 pel left.
mov ebx,BestHalfPelHorzSWD
jmp BlkME_HalfPelSearchDone
; BlkME_RightBetterThanLeft -- block-level mirror of the leftward tree for the
; case where the rightward SWD (ebx) is strictly better than the leftward one.
; In: ebx -- SWD for rightward ref. mm3[0:31] -- down/up SWD pair.
; cl -- full-pel HMV, not yet doubled. ch -- VMV. esi -- ref address.
; Out (at BlkME_HalfPelSearchDone): ebx -- winning SWD, eax -- winning ref
; address, cl/ch -- winning MV.
BlkME_RightBetterThanLeft:
add cl,cl ; HMV to half-pel units.
mov eax,BestBlkFullPelSWD
cmp eax,ebx
jle BlkME_CtrIsBestHMV ; Full pel SWD <= rightward SWD: centre wins.
BlkME_RightBestHMV:
movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref.
; ; eax[16:31] -- SWD for upward ref.
mov BestHalfPelHorzSWD,ebx
mov ebx,eax
shr eax,16 ; eax -- SWD for upward ref.
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
cmp eax,ebx
jg BlkME_RightBestHMV_DownBetterThanUp
BlkME_RightBestHMV_UpBetterThanDown:
cmp eax,BestBlkFullPelSWD
jge BlkME_RightIsBest
BlkME_RightBestHMV_UpBestVMV:
sub esi,ebp ; Try ref 1/2 pel right and up
mov BestHalfPelVertSWD,eax ; Saved because al is overwritten next.
mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8]
mov eax,BestHalfPelVertSWD
lea esi,[esi+PITCH*9+8] ; Back to center.
cmp eax,ebx
jle BlkME_UpBetterThanUpRight
BlkME_UpRightBetterThanUp:
cmp ebx,BestHalfPelHorzSWD
jge BlkME_RightIsBest
BlkME_UpRightIsBest:
inc cl ; Advance the horz MV one to right.
lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up
dec ch ; Back up the vert MV one up.
jmp BlkME_HalfPelSearchDone
BlkME_UpBetterThanUpRight:
cmp eax,BestHalfPelHorzSWD
jle BlkME_UpIsBest
BlkME_RightIsBest:
mov ebx,BestHalfPelHorzSWD
inc cl ; Advance the horz MV one to right.
mov eax,esi
jmp BlkME_HalfPelSearchDone
BlkME_RightBestHMV_DownBetterThanUp:
cmp ebx,BestBlkFullPelSWD
jge BlkME_RightIsBest
BlkME_RightBestHMV_DownBestVMV:
; Right and down: the diagonal is evaluated at esi itself, no adjustment.
mov BestHalfPelVertSWD,ebx
mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8]
mov eax,BestHalfPelVertSWD
lea esi,[esi+ebp*8+8] ; Back to center.
cmp eax,ebx
jle BlkME_DownBetterThanDownRight
BlkME_DownRightBetterThanDown:
cmp ebx,BestHalfPelHorzSWD
jge BlkME_RightIsBest
BlkME_DownRightIsBest:
inc cl ; Advance the horz MV one to right.
mov eax,esi
inc ch ; Advance vert MV one down.
jmp BlkME_HalfPelSearchDone
BlkME_DownBetterThanDownRight:
cmp eax,BestHalfPelHorzSWD
jg BlkME_RightIsBest
BlkME_DownIsBest:
inc ch ; Advance vert MV one down.
mov ebx,eax
mov eax,esi
jmp BlkME_HalfPelSearchDone
; BlkME_CtrIsBestHMV -- neither horizontal half-pel neighbour beat the block's
; full-pel SWD, so only the vertical neighbours remain. The better of up/down
; is chosen (down wins ties) and kept only if it beats BestBlkFullPelSWD;
; otherwise the full-pel centre stands, with ebx = BestBlkFullPelSWD on arrival
; at BlkME_CenterIsBest.
BlkME_CtrIsBestHMV:
movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref.
; ; eax[16:31] -- SWD for upward ref.
mov ebx,eax
shr eax,16 ; eax -- SWD for upward ref.
and ebx,00000FFFFH ; ebx -- SWD for downward ref.
cmp eax,ebx
jge BlkME_CtrBestHMV_DownBetterThanUp
BlkME_CtrBestHMV_UpBetterThanDown:
mov ebx,BestBlkFullPelSWD
cmp eax,ebx
jge BlkME_CenterIsBest
; Up is best.
mov ebx,eax
dec ch ; Back up the vert MV one up.
lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
jmp BlkME_HalfPelSearchDone
BlkME_CtrBestHMV_DownBetterThanUp:
mov eax,ebx
mov ebx,BestBlkFullPelSWD
cmp eax,ebx
jge BlkME_CenterIsBest
; Down is best.
mov ebx,eax
inc ch ; Advance the vert MV one down.
mov eax,esi
jmp BlkME_HalfPelSearchDone
; SkipHalfPelBlkME -- half-pel ME is disabled for blocks. Decodes the block's
; full-pel MV from its reference address relative to Addr0MVRefBlk (cl = HMV
; doubled, ch = VMV from the UnlinearizedVertMV table) and takes the full-pel
; SWD in mm7 as the block's result. Falls into BlkME_CenterIsBest, which sets
; eax to the reference address.
SkipHalfPelBlkME:
mov eax,esi
mov ebx,Addr0MVRefBlk ; Start to calc linearized MV.
sub ecx,ebx ; Linearized Motion Vector
sub eax,ebx ; Linearized Motion Vector
sar eax,8 ; Full pel vert lin offset div 256.
and cl,07FH ; Full pel HMV
add cl,cl
;
mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
;
movdf ebx,mm7 ; SWD for best full pel block MV.
BlkME_CenterIsBest:
mov eax,esi
BlkME_HalfPelSearchDone:
mov [edx].BlkY1.BlkLvlSWD,ebx
mov [edx].BlkY1.PastRef,eax
mov [edx].BlkY1.PHMV,cl ; Save HMV
mov eax,LimitForSWDForBlkMV ; Does block's SWD put us over limit?
mov [edx].BlkY1.PVMV,ch ; Save VMV
sub eax,ebx
jl BlkEst_EarlyOut
mov LimitForSWDForBlkMV,eax ; Remember how much is left for other blks.
mov esi,BestBlockRefAddrVP1
add edi,8 ; Move to blk 2 or 4, V+4.
mov ecx,Addr0MVRefBlk ; Calc addr of 0MV ref for this blk.
add esi,8 ; Move to blk 2 or 4, V+4.
add ecx,8
mov Addr0MVRefBlk,ecx
add edx,SIZEOF T_Blk ; Increment to next block.
test dl,SIZEOF T_Blk
movdt mm7,SWDForBlock2Or4
mov ebx,FIRSTBLOCKMESTATE
jne DoBlkMEForNextBlk ; If so, go do blk 2 or 4.
lea esi,[esi+ebp*8-8] ; Move to blk 3
lea ecx,[ecx+ebp*8-16]
mov BestBlockRefAddrVP1,esi
lea edi,[edi+ebp*8-16]
movdt mm5,SWDULandLR+4 ; Get SWD for best MB level MVs, blk 4.
movdt mm7,SWDURandLL+4 ; Get SWD for best MB level MVs, blk 3.
movdf SWDForBlock2Or4,mm5
test dl,2*SIZEOF T_Blk ; Just finishing blk 2?
mov Addr0MVRefBlk,ecx
jne DoBlkMEForNextBlk ; If so, go do blk 3.
;==============================================================================
; Block motion vectors are best.
; edx points just past the 4th luma block here. Copy the per-block SWDs into
; SWDULandLR (blk 1, blk 4) and SWDURandLL (blk 2, blk 3), then check whether
; the four block MVs actually differ from one another.
mov esi,[edx-4*SIZEOF T_Blk].BlkY1.BlkLvlSWD
mov edi,[edx-4*SIZEOF T_Blk].BlkY4.BlkLvlSWD
mov SWDULandLR,esi ; Blk 1 SWD.
mov SWDULandLR+4,edi ; Blk 4 SWD.
mov esi,[edx-4*SIZEOF T_Blk].BlkY3.BlkLvlSWD
mov edi,[edx-4*SIZEOF T_Blk].BlkY2.BlkLvlSWD
mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs
mov ebx,[edx-4*SIZEOF T_Blk].BlkY2.MVs
mov ecx,eax
xor eax,ebx ; eax -- MVs1 xor MVs2
xor ecx,[edx-4*SIZEOF T_Blk].BlkY3.MVs ; ecx -- MVs1 xor MVs3
xor ebx,[edx-4*SIZEOF T_Blk].BlkY4.MVs ; ebx -- MVs2 xor MVs4
mov SWDURandLL,edi ; Blk 2 SWD.
or eax,ebx
sub edx,4*SIZEOF T_Blk ; Restore MacroBlockActionStream ptr.
or eax,ecx ; Nonzero bits mark where any two blocks' MVs differ.
test eax,0FFFFH ; Only the low 16 bits of MVs are compared.
mov SWDURandLL+4,esi ; Blk 3 SWD.
je MotionVectorSettled ; All four MVs equal: keep current block type.
mov al,INTER4MV ; Set type for MB to INTER-coded, 4 MVs.
mov [edx].BlockType,al
jmp MotionVectorSettled
BlkEst_EarlyOut:
; Block-level MVs ran over the SWD budget. Mask the block index bits out of
; edx to get back to the start of the macroblock's descriptor (this relies on
; the descriptor's alignment -- confirm against the structure definition).
and edx,-1-3*SIZEOF T_Blk
mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV.
BlockMVNotBigEnoughGain: ; Try MB-level motion vector.
cmp ecx,SWDForNon0MVToBeat
jge NonZeroMVNotBigEnoughGain ; MB MV does not beat the 0-MV bar (ties lose).
ENDIF ; H263
; Falls through into NonZeroMBLevelMVBest with ebx = MV, esi = ref address.
mov ebx,BestMBHalfPelMV
mov esi,BestMBHalfPelRefAddr ; Reload BestMBHalfPelRefAddr
NonZeroMBLevelMVBest:
; The non-zero macroblock-level motion vector wins. Give every luma block the
; same MV (ebx) and the matching reference address derived from esi:
; blk 1 = esi, blk 2 = esi+8, blk 3 = esi+8*PITCH, blk 4 = esi+8*PITCH+8.
; On exit esi and ecx hold the blk 2 and blk 4 reference addresses.
lea ecx,[esi+ebp*8] ; ecx -- ref address for blk 3.
mov [edx].BlkY4.MVs,ebx
mov [edx].BlkY3.MVs,ebx
mov [edx].BlkY2.MVs,ebx
mov [edx].BlkY1.MVs,ebx
mov [edx].BlkY1.PastRef,esi
mov [edx].BlkY3.PastRef,ecx
add esi,8 ; esi -- ref address for blk 2.
add ecx,8 ; ecx -- ref address for blk 4.
mov [edx].BlkY2.PastRef,esi
mov [edx].BlkY4.PastRef,ecx
jmp MotionVectorSettled
NoBlockMotionVectors:
; Block-level MVs are not in play. Choose between the best MB-level MV and
; the zero MV. ebx/esi are preloaded for NonZeroMBLevelMVBest; the loads do
; not disturb the flags set by the cmp.
; NOTE(review): a tie here (SWD == SWDForNon0MVToBeat) selects the non-zero
; MV, whereas the compare at BlockMVNotBigEnoughGain sends a tie to the
; zero-MV path -- confirm whether the difference is intentional.
mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV.
mov eax,SWDForNon0MVToBeat
cmp eax,ecx
mov ebx,BestMBHalfPelMV
mov esi,BestMBHalfPelRefAddr
jge NonZeroMBLevelMVBest
NonZeroMVNotBigEnoughGain:
; The zero motion vector wins. Reinstate the per-block SWDs that were
; measured for the 0-MV reference and point esi at that reference.
movq mm6,SWD0MVULandLR
movq mm5,SWD0MVURandLL
mov esi,Addr0MVRef ; 0-MV ref block.
movq SWDULandLR,mm6
movq SWDURandLL,mm5
BelowZeroThresh:
; esi -- reference address for blk 1. Record the reference address of each
; luma block, then clear every motion vector field. eax is 0 on exit.
lea eax,[esi+ebp*8] ; Ref address for blk 3.
mov [edx].BlkY1.PastRef,esi
mov [edx].BlkY3.PastRef,eax
add eax,8 ; Ref address for blk 4.
mov [edx].BlkY4.PastRef,eax
lea eax,[esi+8] ; Ref address for blk 2.
mov [edx].BlkY2.PastRef,eax
xor eax,eax
mov [edx].BlkY4.MVs,eax ; Horz and vert MVs are 0 in all blks.
mov [edx].BlkY3.MVs,eax
mov [edx].BlkY2.MVs,eax
mov [edx].BlkY1.MVs,eax
mov [edx].BestFullPelMBVMV,al
mov [edx].BestFullPelMBHMV,al
mov BestMBHalfPelMV,eax
MotionVectorSettled:
IFDEF H261
;===============================================================================
; For H261, we've settled on the best motion vector. Now we need to determine
; if spatial filtering should be done.
;
; Register usage inside the spatial filter loop that follows:
; ebp -- PITCH
; esi -- Address of block of ref area.
; edi -- Address of spatially filtered block.
; edx -- MBlockActionStream
; ecx -- Loop counter.
; ebx -- Address of constant 0x7F in all 8 bytes.
; eax -- Scratch
; mm7 -- Mask to extract bytes 0 and 7. (High bit of bytes 1:6 must be off).
; mm6 -- All bytes -1.
; mm5 -- Mask to extract bytes 1:6 and clear bit 8 thereof.
movdf esi,mm7 ; Restore non-SLF SWD for macroblock.
cmp esi,SpatialFiltThreshold
jle SkipSpatialFiltering ; SWD at or below threshold: no filtering.
mov ecx,DoSpatialFiltering ; Are we doing spatial filtering?
mov esi,[edx].BlkY1.PastRef
test cl,cl
je SkipSpatialFiltering
DoSpatialFilterForChroma:
DoSpatialFilterForLuma:
; Build the two byte masks from the 0x7F constant:
; mm5 = 0x007F7F7F7F7F7F00, mm7 = NOT mm5 = 0xFF808080808080FF.
movq mm5,C7F7F7F7F7F7F7F7F ; Mask to extract bytes 1:6.
movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
psllq mm5,16
psrlq mm5,8
pcmpeqb mm7,mm7
pxor mm7,mm5 ; Mask to extract bytes 0 and 7.
mov edi,SpatiallyFilteredMB
lea eax,[esi+ebp*4] ; eax -- address of line 4 of the ref block.
lea ebx,C7F7F7F7F7F7F7F7F ; Address of this useful constant.
; Spatial loop filter for one 8x8 block per pass. Each line is first filtered
; horizontally with a 1-2-1 kernel (steps a..l; edge pels 0 and 7 are passed
; through), then adjacent filtered lines R0..R7 are combined vertically with
; the same kernel (steps A..K; lines 0 and 7 are stored unfiltered
; vertically). Step tags are <line><step>. The byte-wise add/shift/mask
; arithmetic relies on the high bit of each pel being clear (see the mask
; notes in the register list above the setup code).
; Loop control: "sub cl,2" in mid-loop sets the flags that the jg/jl at the
; bottom consume; only MMX ops, mov and lea lie in between, so the flags
; survive. The number of passes depends on the value of cl at entry.
SpatialFilterLoop:
movq mm0,[esi] ; 0a: <P7 P6 P5 P4 P3 P2 P1 P0>
pcmpeqb mm6,mm6 ; To add one to all bytes.
movq mm4,mm0 ; 0b: <P7 P6 P5 P4 P3 P2 P1 P0>
psllq mm0,16 ; 0c: <P5 P4 P3 P2 P1 P0 0 0>
movq mm3,[esi+ebp*1]; 1a
paddb mm0,mm4 ; 0d: <P7+P5 P6+P4 ... P3+P1 P2+P0 jnk jnk >
movq mm1,mm3 ; 1b
psrlq mm0,9 ; 0e: <0 (P7+P5)/2 ... (P2+P0)/2 jnk> (dirty)
SpatialFilterLoop_BlockToRight:
pand mm0,mm5 ; 0f: <0 (P7+P5)/2 ... (P2+P0)/2 0> (clean)
psllq mm1,16 ; 1c
paddb mm0,mm4 ; 0g: <jnk (P7+2P6+P5)/2 ... (P2+2P1+P0)/2 jnk>
paddb mm1,mm3 ; 1d
psubb mm0,mm6 ; 0h: <jnk (P7+2P6+P5+2)/2 ... (P2+2P1+P0+2)/2 jnk>
psrlq mm1,9 ; 1e
psrlq mm0,1 ; 0i: <jnk (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/4 jnk>
pand mm4,mm7 ; 0j: <P7 0 0 0 0 0 0 P0>
pand mm0,mm5 ; 0k: < 0 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/4 0>
pand mm1,mm5 ; 1f
por mm0,mm4 ; 0l: <P7 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/4 P0>
paddb mm1,mm3 ; 1g
movq mm2,[esi+ebp*2]; 2a
psubb mm1,mm6 ; 1h
movq [edi],mm0 ; 0m: Store line 0 of filtered block. This is R0.
movq mm4,mm2 ; 2b
psrlq mm1,1 ; 1i
pand mm3,mm7 ; 1j
pand mm1,mm5 ; 1k
psllq mm2,16 ; 2c
por mm1,mm3 ; 1l: This is R1
paddb mm2,mm4 ; 2d
psubb mm1,mm6 ; 1A: R1+1
psrlq mm2,9 ; 2e
pand mm2,mm5 ; 2f
paddb mm0,mm1 ; 1B: R0+R1+1
paddb mm2,mm4 ; 2g
psrlq mm0,1 ; 1C: (R0+R1+1)/2 (dirty)
pand mm0,[ebx] ; 1D: (R0+R1+1)/2 (clean)
psubb mm2,mm6 ; 2h
psrlq mm2,1 ; 2i
pand mm4,mm7 ; 2j
movq mm3,[esi+PITCH*3] ; 3a
pand mm2,mm5 ; 2k
por mm2,mm4 ; 2l: This is R2.
movq mm4,mm3 ; 3b
paddb mm1,mm2 ; 1E & 2B: R1+R2+1
psllq mm3,16 ; 3c
psrlq mm1,1 ; 1F & 2C: (R1+R2+1)/2 (dirty)
paddb mm3,mm4 ; 3d
pand mm1,[ebx] ; 1G & 2D: (R1+R2+1)/2 (clean)
psrlq mm3,9 ; 3e
paddb mm0,mm1 ; 1H: (R0+2R1+R2+2)/2
pand mm3,mm5 ; 3f
psrlq mm0,1 ; 1I: (R0+2R1+R2+2)/4 (dirty)
paddb mm3,mm4 ; 3g
pand mm0,[ebx] ; 1J: (R0+2R1+R2+2)/4 (clean)
psubb mm3,mm6 ; 3h
psrlq mm3,1 ; 3i
pand mm4,mm7 ; 3j
movq [edi+ebp*1],mm0 ; 1K: Store line 1 of filtered block.
pand mm3,mm5 ; 3k
movq mm0,[eax] ; 4a
por mm3,mm4 ; 3l
psubb mm3,mm6 ; 3A: R3+1
movq mm4,mm0 ; 4b
paddb mm2,mm3 ; 2E & 3B: R2+R3+1
psllq mm0,16 ; 4c
psrlq mm2,1 ; 2F & 3C: (R2+R3+1)/2 (dirty)
paddb mm0,mm4 ; 4d
pand mm2,[ebx] ; 2G & 3D: (R2+R3+1)/2 (clean)
psrlq mm0,9 ; 4e
paddb mm1,mm2 ; 2H: (R1+2R2+R3+2)/2
pand mm0,mm5 ; 4f
psrlq mm1,1 ; 2I: (R1+2R2+R3+2)/4 (dirty)
paddb mm0,mm4 ; 4g
pand mm1,[ebx] ; 2J: (R1+2R2+R3+2)/4 (clean)
psubb mm0,mm6 ; 4h
psrlq mm0,1 ; 4i
pand mm4,mm7 ; 4j
movq [edi+ebp*2],mm1 ; 2K: Store line 2 of filtered block.
pand mm0,mm5 ; 4k
movq mm1,[eax+ebp*1] ; 5a
por mm0,mm4 ; 4l
movq mm4,mm1 ; 5b
psllq mm1,16 ; 5c
paddb mm3,mm0 ; 3E & 4B: R3+R4+1
paddb mm1,mm4 ; 5d
add esi,8 ; Advance ref cursor to the block to the right.
psrlq mm3,1 ; 3F & 4C: (R3+R4+1)/2 (dirty)
pand mm3,[ebx] ; 3G & 4D: (R3+R4+1)/2 (clean)
psrlq mm1,9 ; 5e
paddb mm2,mm3 ; 3H: (R2+2R3+R4+2)/2
pand mm1,mm5 ; 5f
psrlq mm2,1 ; 3I: (R2+2R3+R4+2)/4 (dirty)
paddb mm1,mm4 ; 5g
pand mm2,[ebx] ; 3J: (R2+2R3+R4+2)/4 (clean)
psubb mm1,mm6 ; 5h
psrlq mm1,1 ; 5i
pand mm4,mm7 ; 5j
movq [edi+PITCH*3],mm2 ; 3K: Store line 3 of filtered block.
pand mm1,mm5 ; 5k
movq mm2,[eax+ebp*2] ; 6a
por mm1,mm4 ; 5l
psubb mm1,mm6 ; 5A: R5+1
movq mm4,mm2 ; 6b
paddb mm0,mm1 ; 4E & 5B: R4+R5+1
psllq mm2,16 ; 6c
psrlq mm0,1 ; 4F & 5C: (R4+R5+1)/2 (dirty)
paddb mm2,mm4 ; 6d
pand mm0,[ebx] ; 4G & 5D: (R4+R5+1)/2 (clean)
psrlq mm2,9 ; 6e
paddb mm3,mm0 ; 4H: (R3+2R4+R5+2)/2
pand mm2,mm5 ; 6f
psrlq mm3,1 ; 4I: (R3+2R4+R5+2)/4 (dirty)
paddb mm2,mm4 ; 6g
pand mm3,[ebx] ; 4J: (R3+2R4+R5+2)/4 (clean)
psubb mm2,mm6 ; 6h
psrlq mm2,1 ; 6i
sub cl,2 ; Loop control. Flags are consumed by jg/jl at loop bottom.
movq [edi+ebp*4],mm3 ; 4K: Store line 4 of filtered block.
pand mm4,mm7 ; 6j
movq mm3,[eax+PITCH*3] ; 7a
pand mm2,mm5 ; 6k
por mm2,mm4 ; 6l
movq mm4,mm3 ; 7b
paddb mm1,mm2 ; 5E & 6B: R5+R6+1
psllq mm3,16 ; 7c
psrlq mm1,1 ; 5F & 6C: (R5+R6+1)/2 (dirty)
paddb mm3,mm4 ; 7d
pand mm1,[ebx] ; 5G & 6D: (R5+R6+1)/2 (clean)
psrlq mm3,9 ; 7e
paddb mm0,mm1 ; 5H: (R4+2R5+R6+2)/2
pand mm3,mm5 ; 7f
psrlq mm0,1 ; 5I: (R4+2R5+R6+2)/4 (dirty)
paddb mm3,mm4 ; 7g
pand mm0,[ebx] ; 5J: (R4+2R5+R6+2)/4 (clean)
psubb mm3,mm6 ; 7h
psrlq mm3,1 ; 7i
pand mm4,mm7 ; 7j
movq [edi+PITCH*5],mm0 ; 5K: Store line 5 of filtered block.
pand mm3,mm5 ; 7k
psubb mm2,mm6 ; 6A: R6+1
por mm3,mm4 ; 7l
paddb mm2,mm3 ; 6E: R6+R7+1
lea eax,[esi+ebp*4] ; eax -- line 4 of the next block (lea keeps flags).
movq mm0,[esi] ; 0a: for next iteration
psrlq mm2,1 ; 6F: (R6+R7+1)/2 (dirty)
pand mm2,[ebx] ; 6G: (R6+R7+1)/2 (clean)
movq mm4,mm0 ; 0b: for next iteration
movq [edi+PITCH*7],mm3 ; 7m: Store line 7 of filtered block.
paddb mm1,mm2 ; 6H: (R5+2R6+R7+2)/2
lea edi,[edi+8] ; Advance output cursor.
psrlq mm1,1 ; 6I: (R5+2R6+R7+2)/4 (dirty)
pand mm1,[ebx] ; 6J: (R5+2R6+R7+2)/4 (clean)
psllq mm0,16 ; 0c: for next iteration
movq mm3,[esi+ebp*1] ; 1a: for next iteration
paddb mm0,mm4 ; 0d: for next iteration
movq [edi+PITCH*6-8],mm1 ; 6K: Store line 6 of filtered block.
movq mm1,mm3 ; 1b: for next iteration
psrlq mm0,9 ; 0e: for next iteration
jg SpatialFilterLoop_BlockToRight ; Counter still positive: block to the right.
lea esi,[esi+ebp*8-16] ; Move cursors to the left block of the next row
lea eax,[eax+ebp*8-16] ; of blocks (8 lines down, 16 pels left).
lea edi,[edi+ebp*8-16]
mov cl,4 ; Reload counter; flags still come from "sub cl,2".
jl SpatialFilterLoop ; Counter went negative: do next row of blocks.
SpatialFilterDone:
; The filter is shared between luma and chroma. The sign of ch selects the
; return path: positive -> U, negative -> V, zero -> luma (falls through).
; Both branches test the flags from the single "test ch,ch"; the movq, pxor
; and mov in between do not alter them.
mov edi,TargetMacroBlockBaseAddr
mov esi,SpatiallyFilteredMB
test ch,ch
jg ReturnFromSpatialFilterForU
; Registers at this point:
; ebp -- PITCH
; esi -- Address of upper left block of spatially filtered candidate ref area.
; edi -- Address of upper left block of target.
; edx -- MBlockActionStream
; ecx -- Scratch
; ebx -- Scratch
; eax -- Loop control
; mm0-mm4 -- Scratch
; mm5,mm6 -- SWD for each block
; mm7 -- SWD for macroblock
;
movq mm0,[esi+ebp*1] ; Ref line 1 of upper left block (used if luma).
pxor mm7,mm7 ; Zero the macroblock SWD accumulator.
mov al,3 ; Loop control for the SWD loop.
jl ReturnFromSpatialFilterForV
; Compute the SWD of the spatially filtered reference against the target for
; the four luma blocks, visited in the order 1 (UL), 2 (UR), 3 (LL), 4 (LR).
; Loop control is in al (3 at entry): "sub al,2" sets the flags used by both
; the jg (go to block on the right) and the jl (go to next row of blocks).
; Register state at loop exit, as established by the code below:
; mm7 -- sum of the four block SWDs.
; mm6 -- <blk 3 SWD, blk 4 SWD> (blk 4 in the low dword).
; mm5 -- copy of mm6, i.e. <blk 3 SWD, blk 4 SWD>.
; mm4 -- <blk 1 SWD, blk 2 SWD> (blk 2 in the low dword); it is the copy of
; mm5 taken while mm5 still held the first row's result.
; esi, edi -- advanced by 16*PITCH from their values at entry.
ComputeSWDforSLFBlock:
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
ComputeSWDforSLFBlock_BlkToRight:
movq mm1,[esi+PITCH*3] ; Ref MB, Line 3.
psllw mm0,8 ; Extract diffs for line 1 even pels.
psubw mm1,[edi+PITCH*3] ; Diff for line 3.
pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
movq mm2,[esi+PITCH*5]
psllw mm1,8
psubw mm2,[edi+PITCH*5]
pmaddwd mm1,mm1
movq mm3,[esi+PITCH*7]
psllw mm2,8
psubw mm3,[edi+PITCH*7]
pmaddwd mm2,mm2
movq mm4,[esi] ; Ref MB, upper left blk, Line 0.
psllw mm3,8
psubw mm4,[edi] ; Diff for line 0.
paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3).
movq mm1,[esi+ebp*2]
pmaddwd mm3,mm3
psubw mm1,[edi+ebp*2]
paddusw mm0,mm2 ; Add line 5.
movq mm2,[esi+ebp*4]
pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
psubw mm2,[edi+ebp*4]
paddusw mm0,mm3 ; Add line 7.
movq mm3,[esi+PITCH*6]
pmaddwd mm1,mm1
psubw mm3,[edi+PITCH*6]
pmaddwd mm2,mm2
paddusw mm4,mm0 ; Lines 0, 1, 3, 5, 7.
pmaddwd mm3,mm3
paddusw mm4,mm1 ; Add line 2.
add esi,8 ; Advance ref cursor to block on the right.
paddusw mm4,mm2 ; Add line 4.
add edi,8 ; Advance target cursor to block on the right.
movq mm0,[esi+ebp*1]
paddusw mm4,mm3 ; Add line 6.
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
punpckldq mm1,mm4 ; Get low order SWD accum to high order of mm1.
paddusw mm1,mm4 ; mm1[48:63] is SWD for block.
psllq mm6,32 ; Shift previous block's SWD left.
psrlq mm1,48 ; mm1 is SWD for block.
sub al,2 ; Loop control.
paddusw mm7,mm1 ; Accumulate macroblock SWD.
por mm6,mm1 ; Save current block's SWD.
movq mm4,mm5 ; Keep a copy of the previous row's pair of SWDs.
jg ComputeSWDforSLFBlock_BlkToRight
movq mm0,[esi+PITCH*9-16] ; Ref line 1 of the left block in the next row.
movq mm5,mm6 ; mm5 -- SWDs for the row of blocks just finished.
lea edi,[edi+ebp*8-16]
lea esi,[esi+ebp*8-16]
mov al,4 ; Reload counter; flags still come from "sub al,2".
jl ComputeSWDforSLFBlock
; Decide whether the spatially filtered prediction is better. It must beat
; the unfiltered SWD by more than SpatialFiltDifferential.
; On arrival from the SWD loop:
; mm7 -- total SLF SWD for the macroblock.
; mm5 -- <blk 3 SWD, blk 4 SWD> (blk 4 in the low dword).
; mm4 -- <blk 1 SWD, blk 2 SWD> (blk 2 in the low dword).
; mm6 -- same contents as mm5, so it must NOT be used for blocks 1 and 2.
; esi -- SpatiallyFilteredMB + 16*PITCH.
mov ebx,BestMBFullPelSWD ; Restore non-SLF SWD for macroblock.
mov eax,SpatialFiltDifferential
sub ebx,eax ; ebx -- SWD the filtered prediction has to beat.
; NOTE(review): the SWD loop leaves edi at target+16*PITCH, so this yields
; target-16 rather than the target base; the code that follows reloads edi
; from TargetMacroBlockBaseAddr before using it. Confirm nothing relies on
; this value.
sub edi,PITCH*16+16
movdf eax,mm7 ; SLF SWD for macroblock.
cmp eax,ebx
jge SpatialFilterNotAsGood
movdf SWDULandLR+4,mm5 ; Blk 4 (lower right) SWD.
psrlq mm5,32
movdf SWDURandLL+4,mm5 ; Blk 3 (lower left) SWD.
; Blocks 1 and 2 come from mm4. Taking them from mm6 would store the blk 4
; and blk 3 values again, because mm6 holds the last row of blocks.
movdf SWDURandLL,mm4 ; Blk 2 (upper right) SWD.
psrlq mm4,32
movdf SWDULandLR,mm4 ; Blk 1 (upper left) SWD.
mov al,INTERSLF
mov ebx,SpatiallyFilteredMB
mov [edx].BlockType,al ; Mark MB as inter-coded with spatial loop filter.
sub esi,PITCH*8-8 ; esi -- filtered MB + 8*PITCH + 8 (blk 4).
mov [edx].BlkY4.PastRef,esi
mov [edx].BlkY1.PastRef,ebx
sub esi,8 ; esi -- filtered MB + 8*PITCH (blk 3).
add ebx,8 ; ebx -- filtered MB + 8 (blk 2).
mov [edx].BlkY3.PastRef,esi
mov [edx].BlkY2.PastRef,ebx
SkipSpatialFiltering:
SpatialFilterNotAsGood:
ENDIF ; H261
;===============================================================================
; We've settled on the motion vector that will be used if we do indeed code the
; macroblock with inter-coding. We need to determine if some or all of the
; blocks can be forced as empty (copy). If all the blocks can be forced
; empty, we force the whole macroblock to be empty.
;
; For each luma block whose SWD is at or below EMPTYTHRESHOLD, the block's bit
; in CodedBlocks is cleared and its SWD is left out of the sum built in ebx.
mov esi,EMPTYTHRESHOLD ; Get threshold for forcing block empty?
mov ebx,SWDULandLR ; Get SWD for block 1.
mov al,[edx].CodedBlocks
cmp ebx,esi ; Is SWD > threshold?
jg @f
and al,0FEH ; If not, indicate block 1 is NOT coded.
xor ebx,ebx
@@:
mov ecx,SWDURandLL ; Get SWD for block 2.
cmp ecx,esi
jg @f
and al,0FDH ; Block 2 is NOT coded.
xor ecx,ecx
@@:
add ebx,ecx
mov ecx,SWDURandLL+4 ; Get SWD for block 3.
cmp ecx,esi
jg @f
and al,0FBH ; Block 3 is NOT coded.
xor ecx,ecx
@@:
add ebx,ecx
mov ecx,SWDULandLR+4 ; Get SWD for block 4.
cmp ecx,esi
jg @f
and al,0F7H ; Block 4 is NOT coded.
xor ecx,ecx
@@:
mov [edx].CodedBlocks,al ; Store coded block pattern.
and al,00FH ; Keep only the four luma bits in al.
add ebx,ecx ; ebx -- sum of SWDs of the non-empty luma blocks.
cmp al,00FH ; Are any blks marked empty?
jne InterBest ; If some blks are empty, can't code as Intra
mov edi,TargetMacroBlockBaseAddr
mov [edx].SWD,ebx
cmp ebx,INTERCODINGTHRESHOLD ; Is InterSWD below inter-coding thresh?
jae CalculateIntraSWD ; No (unsigned compare): consider intra coding.
InterBestX:
mov ebx,[edx].SWD
InterBest:
; ebx -- SWD for this macroblock; al[0:3] -- luma coded block pattern.
mov ecx,SWDTotal ; Add to total for this macroblock class.
add ecx,ebx
IFDEF H261
mov SWDTotal,ecx
ELSE ;H263
mov bl,DoAdvancedPrediction
mov SWDTotal,ecx
test bl,bl
jne OBMCDifferencing ; Advanced Prediction: use the OBMC path.
ENDIF
;============================================================================
; Perform differencing for the non-empty luma blocks of an Inter-coded
; macroblock. This is the non-OBMC case; i.e. Advanced Prediction is
; not selected.
;
; ebp -- PITCH
; esi -- Address of reference block.
; edi -- Address of target block.
; edx -- MBlockActionStream. Used as cursor over luma blocks.
; ecx -- Not in use.
; ebx -- Scratch. Used to test half pel MV resolution.
; eax[0:3] -- Coded block pattern for luma blocks.
;
; The same sequence is repeated for each of the four luma blocks:
; 1. DoNonOBMCDifferencing stores difference lines 0..3 and returns the
; (partly finished) predictions for lines 4..7 in mm0..mm3.
; 2. The code here finishes lines 4..7: mm1 still needs a shift, mm2/mm3 need
; mask and shift. Lines 4..6 are stored to PelDiffsLine4..6; the line 7
; difference is left in mm1 (no store is visible here -- presumably it is
; consumed by MMxDoForwardDCTx; confirm).
; 3. MMxDoForwardDCTx is called with one extra dword on the stack (the push /
; pop pair only adjusts the stack pointer). Its result in bl is shifted to
; the block's bit position and subtracted from CodedBlocks (presumably bl
; is 1 when the block turns out to be empty -- confirm in the DCT routine).
mov cl,INTER1MV
mov ebx,TargetMacroBlockBaseAddr
mov StashBlockType,cl
test al,1 ; Don't diff block 1 if marked empty.
mov edi,ebx ; edi -- target block 1.
je @f
mov ebx,[edx].BlkY1.MVs
mov esi,[edx].BlkY1.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.)
movq mm4,[edi+ebp*4] ; T4
psrlq mm1,1
movq mm5,[edi+PITCH*5]
psubb mm4,mm0 ; D4 = T4 - P4
movq mm0,[edi+PITCH*6]
psubb mm5,mm1
movq mm1,[edi+PITCH*7]
pand mm2,mm6
pand mm3,mm6
psrlq mm2,1
movq PelDiffsLine4,mm4 ; Store D4.
psubb mm0,mm2
movq PelDiffsLine5,mm5
psrlq mm3,1
movq PelDiffsLine6,mm0
psubb mm1,mm3
push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
mov al,[edx].CodedBlocks
sub al,bl ; Block 1 is bit 0: no shift needed.
mov ebx,TargetMacroBlockBaseAddr
mov [edx].CodedBlocks,al
pop edi ; Adjust stack pointer
StackOffset TEXTEQU <0>
@@:
lea edi,[ebx+8] ; Get address of next block to do.
test al,2 ; Don't diff block 2 if marked empty.
je @f
mov ebx,[edx].BlkY2.MVs
mov esi,[edx].BlkY2.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.)
movq mm4,[edi+ebp*4] ; T4
psrlq mm1,1
movq mm5,[edi+PITCH*5]
psubb mm4,mm0 ; D4 = T4 - P4
movq mm0,[edi+PITCH*6]
psubb mm5,mm1
movq mm1,[edi+PITCH*7]
pand mm2,mm6
pand mm3,mm6
psrlq mm2,1
movq PelDiffsLine4,mm4 ; Store D4.
psubb mm0,mm2
movq PelDiffsLine5,mm5
psrlq mm3,1
movq PelDiffsLine6,mm0
psubb mm1,mm3
push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
shl bl,1 ; Move result to block 2's bit.
mov al,[edx].CodedBlocks
sub al,bl
mov ebx,TargetMacroBlockBaseAddr
mov [edx].CodedBlocks,al
pop edi ; Adjust stack pointer
StackOffset TEXTEQU <0>
@@:
lea edi,[ebx+ebp*8] ; Get address of next block to do.
test al,4 ; Don't diff block 3 if marked empty.
je @f
mov ebx,[edx].BlkY3.MVs
mov esi,[edx].BlkY3.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.)
movq mm4,[edi+ebp*4] ; T4
psrlq mm1,1
movq mm5,[edi+PITCH*5]
psubb mm4,mm0 ; D4 = T4 - P4
movq mm0,[edi+PITCH*6]
psubb mm5,mm1
movq mm1,[edi+PITCH*7]
pand mm2,mm6
pand mm3,mm6
psrlq mm2,1
movq PelDiffsLine4,mm4 ; Store D4.
psubb mm0,mm2
movq PelDiffsLine5,mm5
psrlq mm3,1
movq PelDiffsLine6,mm0
psubb mm1,mm3
push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
shl bl,2 ; Move result to block 3's bit.
mov al,[edx].CodedBlocks
sub al,bl
mov ebx,TargetMacroBlockBaseAddr
mov [edx].CodedBlocks,al
pop edi ; Adjust stack pointer
StackOffset TEXTEQU <0>
@@:
lea edi,[ebx+ebp*8+8] ; Get address of next block to do.
test al,8 ; Don't diff block 4 if marked empty.
je NonOBMCDifferencingDone
mov ebx,[edx].BlkY4.MVs
mov esi,[edx].BlkY4.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.)
movq mm4,[edi+ebp*4] ; T4
psrlq mm1,1
movq mm5,[edi+PITCH*5]
psubb mm4,mm0 ; D4 = T4 - P4
movq mm0,[edi+PITCH*6]
psubb mm5,mm1
movq mm1,[edi+PITCH*7]
pand mm2,mm6
pand mm3,mm6
psrlq mm2,1
movq PelDiffsLine4,mm4 ; Store D4.
psubb mm0,mm2
movq PelDiffsLine5,mm5
psrlq mm3,1
movq PelDiffsLine6,mm0
psubb mm1,mm3
push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
shl bl,3 ; Move result to block 4's bit.
mov al,[edx].CodedBlocks
sub al,bl
pop edi ; Adjust stack pointer
mov [edx].CodedBlocks,al
StackOffset TEXTEQU <0>
NonOBMCDifferencingDone:
IFDEF H261
ELSE
; H263 only: unless this is a plain P frame, go on to the B-frame luma blocks.
mov al,IsPlainPFrame
test al,al
jne NextMacroBlock
movq mm6,C0101010101010101
pxor mm7,mm7 ; Initialize SWD accumulator
call MMxDoBFrameLumaBlocks
ENDIF
jmp NextMacroBlock
;============================================================================
; Register usage in the following internal function. This function does
; half pel motion estimation for whole macroblocks, or individual blocks.
;
; ebp -- PITCH
; esi -- Address of best full pel reference macroblock. For MBME unchanged
; at exit. For BlkME, adjusted by -8-8*PITCH.
; edi -- Address of target macroblock. For MBME unchanged at exit. For BlkME,
; adjusted by -8-8*PITCH.
; edx -- MBlockActionStream
; ecx -- Reserved.
; ebx -- For MBME: 240 + Flags to indicate which half pel ME to do:
; 1 --> right; 2 --> left; 4 --> down; 8 --> up
; For BlkME: Garbage
; eax -- Count from -4 to -1 for blocks of macroblock. 0 for single block.
; On return eax holds the low 4 bits of the bl value passed in.
; mm7 -- Initialized to zero.
; mm6 -- Initialized to zero.
; mm0:mm7 -- Scratch
; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
; mm3[16:31] -- SWD for ref 1/2 pel leftward
; mm3[32:47] -- SWD for ref 1/2 pel downward
; mm3[48:63] -- SWD for ref 1/2 pel upward
;
; Loop control: the upper bits of bl count 2-line passes. "add bl,080H"
; carries out on every second pass (half a block) and the following
; "add bl,040H" sets the sign bit after a whole block (4 passes, 8 lines).
; The flags from those adds are tested several instructions later; only MMX
; operations lie in between, so they are preserved. Likewise the flags of
; "add eax,2" drive both the jl (lower block) and je (second column) below.
; The packed SWDs for each block are also stored to
; HalfPelMBMESWDAccum[eax*8+32].
StackOffset TEXTEQU <4>
HalfPelMotionEstimation:
and bl,15 ; Keep only the direction flags; clear the pass counter bits.
HalfPelMBMEForUpperBlock:
HalfPelMEForFirst2LinesOfBlock:
movq mm0,[esi-PITCH] ; <P^7 P^6 P^5 P^4 P^3 P^2 P^1 P^0>
movq mm1,[esi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
movq mm4,[edi+ebp*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
paddb mm0,mm1 ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>
HalfPelMEForNext2LinesOfBlock:
movq mm2,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
psrlw mm0,1 ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
movq mm5,mm1 ; <P07 P06 P05 P04 P03 P02 P01 P00>
psllw mm4,8 ; <T16 0 T14 0 T12 0 T10 0>
HalfPelMBMEForLowerBlock:
psubw mm0,[edi] ; <(P^7+P07)/2-T07 junk (P^5+P05)/2-T05 junk ...>
paddb mm5,mm2 ; <P07+P17 P06+P16 P05+P15 P04+P14 ...>
pmullw mm1,C0101010101010101 ; <(P07+P06)*256+P06 ...>
psllw mm5,8 ; <(P06+P16) 0 (P04+P14) 0 ...>
pmaddwd mm0,mm0 ; Square diff for line 0 odd pels, upward ref.
psrlw mm5,1 ; <(P06+P16)/2 0 (P04+P14)/2 0 ...>
movq mm3,[edi] ; <T07 T06 T05 T04 T03 T02 T01 T00>
psubw mm4,mm5 ; <T16-(P06+P16)/2 junk ...>
pmaddwd mm4,mm4 ; Square diff for line 1 even pels, upward ref.
psrlw mm1,1 ; <(P07+P06)*128+P06/2 ...>
psllw mm3,8 ; <T06 0 T04 0 T02 0 T00 0>
lea edi,[edi+ebp*2] ; Advance Target cursor
psubw mm3,mm1 ; <T06-(P07+P06)/2 junk T04-(P05+P04)/2 junk ...>
lea esi,[esi+ebp*2] ; Advance Reference cursor
psubw mm1,[edi-PITCH*2] ; <(P07+P06)/2-T07 junk (P05+P04)/2-T05 junk ...>
pmaddwd mm3,mm3 ; Square diff for line 0 even pels, rightwrd ref.
pmaddwd mm1,mm1 ; Square diff for line 0 odd pels, leftward ref.
paddusw mm0,mm4 ; SSD for line 0 and 1, upward ref.
pand mm0,CFFFF0000FFFF0000 ; Extract SSD for line 0 and 1, upward ref.
movq mm4,mm2 ; <P17 P16 P15 P14 P13 P12 P11 P10>
paddusw mm6,mm0 ; Accumulate SSD for line 0 and 1, upward ref.
psrlq mm4,8 ; < 0 P17 P16 P15 P14 P13 P12 P11>
pand mm1,CFFFF0000FFFF0000 ; Extract SSD for line 0, leftward ref.
psrld mm3,16 ; Extract SSD for line 0, rightward ref.
pmullw mm4,C0200010101010101 ; <P17*256*2 (P16+P15)*256+P15 ...>
paddw mm3,mm1 ; SSD for line 0, leftward and rightward refs.
movq mm1,[esi] ; <P27 P26 P25 P24 P23 P22 P21 P20>
movq mm0,mm2 ; <P17 P16 P15 P14 P13 P12 P11 P10>
paddusw mm7,mm3 ; Accumulate SSD for line 0, left and right refs.
paddb mm2,mm1 ; <P17+P27 P16+P26 P15+P25 P14+P24 ...>
movq mm3,mm0 ; <P17 P16 P15 P14 P13 P12 P11 P10>
psrlw mm4,1 ; <P17 (P16+P15)*128+P15/2 ...>
psubw mm4,[edi-PITCH*1] ; <P17-T17 junk (P16+P15)/2-T15 junk ...>
psllq mm3,8 ; <P16 P15 P14 P13 P12 P11 P10 0>
pmullw mm3,C0101010101010002 ; <(P16+P15)*256+P15 ... P10*256*2>
psrlw mm2,1 ; <(P17+P27)/2 junk (P15+P25)/2 junk ...>
movq StashMM6,mm6 ; Free up mm6; the accumulator is reloaded from here below.
pmaddwd mm4,mm4 ; Square diff for line 1 odd pels, rightward ref.
movq mm6,[edi-PITCH*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
psrlw mm3,1 ; <(P16+P15)*128+P15/2 ... P10*256>
psubw mm2,[edi-PITCH*1] ; <(P17+P27)/2-T17 junk (P15+P25)/2-T15 junk ...>
psllw mm6,8 ; <T16 0 T14 0 T12 0 T10 0>
psubw mm3,mm6 ; <(P16+P15)/2-T16 junk ... P10-T10>
psrld mm4,16 ; Extract SSD for line 1, rightward ref.
movq mm6,[edi-PITCH*2] ; <T07 T06 T05 T04 T03 T02 T01 T00>
pmaddwd mm3,mm3 ; Square diff for line 1 even pels, leftward ref.
pmaddwd mm2,mm2 ; Square diff for line 1 odd pels, downward ref.
psllw mm6,8 ; <T06 0 T04 0 T02 0 T00 0>
paddusw mm7,mm4 ; Accumulate SSD for line 1, rightward ref.
psubw mm6,mm5 ; <T06-(P06+P16)/2 junk ...>
pand mm3,CFFFF0000FFFF0000 ; Extract SSD for line 1, leftward ref.
pmaddwd mm6,mm6 ; Square diff for line 0 even pels, downward ref.
add bl,080H ; Pass counter; carry is tested by the jnc below.
psrld mm2,16 ; Extract SSD for line 1, downward ref.
paddusw mm2,StashMM6 ; Accumulate SSD for line 1, downward ref.
paddusw mm7,mm3 ; Accumulate SSD for line 1, leftward ref.
movq mm4,[edi+ebp*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
psrld mm6,16 ; Extract SSD for line 0, downward ref.
paddusw mm6,mm2 ; Accumulate SSD for line 0, downward ref.
paddb mm0,mm1 ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>
punpckldq mm5,mm6 ; Speculatively start to accum partial SWDs.
jnc HalfPelMEForNext2LinesOfBlock ; Iterate twice, for half a block.
punpckldq mm3,mm7
add bl,040H ; Half-block counter; sign is tested by the jns below.
paddusw mm5,mm6
jns HalfPelMEForNext2LinesOfBlock ; Iterate twice, for a whole block.
paddusw mm3,mm7
psrlw mm0,1 ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
movq mm2,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
punpckhdq mm3,mm5 ; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
; ; mm3[16:31] -- SWD for ref 1/2 pel leftward
; ; mm3[32:47] -- SWD for ref 1/2 pel downward
; ; mm3[48:63] -- SWD for ref 1/2 pel upward
movq mm5,mm1 ; <P07 P06 P05 P04 P03 P02 P01 P00>
sub bl,080H ; Reset the pass counter bits for the next block.
movq HalfPelMBMESWDAccum[eax*8+32],mm3
psllw mm4,8 ; <T16 0 T14 0 T12 0 T10 0>
add eax,2 ; Block counter; flags are used by the jl and je below.
jl HalfPelMBMEForLowerBlock ; Iterate twice for 2 blocks.
lea edi,[edi-PITCH*16+8] ; Back to the top, one block column to the right.
lea esi,[esi-PITCH*16+8]
lea eax,[eax-3] ; lea leaves the flags from "add eax,2" intact.
je HalfPelMBMEForUpperBlock ; Iterate twice for macroblock.
sub edi,16
xor eax,eax
sub esi,16
mov al,bl ; Return the direction flags in eax.
ret
StackOffset TEXTEQU <0>
;============================================================================
; Register usage in the following internal function. This function does
; half pel motion estimation in both directions for whole macroblocks, or
; individual blocks.
;
; ebp -- PITCH
; esi -- Address of best full pel reference macroblock. For MBME unchanged
; at exit. For BlkME, adjusted by -8-8*PITCH.
; edi -- Address of target macroblock. For MBME unchanged at exit. For BlkME,
; adjusted by -8-8*PITCH.
; edx -- MBlockActionStream
; ecx -- Reserved. Contains motion vectors.
; ebx -- Returns SWD for this reference block or macroblock.
; al -- Count from 4 to 1 for blocks of macroblock. 1 for blk only.
; mm0:mm6 -- Scratch
; mm7 -- Reserved. Contains SWDs for four 1/2 pel refs at main compass points.
; mm4 -- Returns SWD for this reference block or macroblock.
;
; Loop control: the upper bits of al count 2-line passes. "add al,080H"
; carries out on every second pass and "add al,040H" sets the sign bit after
; a whole block (4 passes). "sub al,082H" then clears those bits and steps
; the block count down by 2; its flags drive both the jg and the je below.
; mm6 is zeroed once at entry and never reset, so the SWD accumulates over
; all the blocks processed in one call.
StackOffset TEXTEQU <4>
HalfPelMotionEstimationBothWays:
movq mm3,C0101010101010101
pxor mm6,mm6 ; Zero out SSD accumulator.
HalfPelMBMEForUpperBlockBothWays:
HalfPelMEForFirst2LinesOfBlockBothWays:
movq mm0,[esi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
HalfPelMEForNext2LinesOfBlockBothWays:
HalfPelMBMEForLowerBlockBothWays:
movq mm1,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
pmullw mm0,mm3 ; <(P07+P06)*256+P06 ...>
movq mm2,[esi+ebp*2] ; <P27 P26 P25 P24 P23 P22 P21 P20>
pmullw mm3,mm1 ; <(P17+P16)*256+P16 ...>
movq mm4,mm2 ; <P27 P26 P25 P24 P23 P22 P21 P20>
psrlq mm2,8 ; < 0 P27 P26 P25 P24 P23 P22 P21>
pmullw mm2,C0200010101010101 ; <P27*256*2 (P26+P25)*256+P25 ...>
psrlq mm1,8 ; < 0 P17 P16 P15 P14 P13 P12 P11>
pmullw mm1,C0200010101010101 ; <P17*256*2 (P16+P15)*256+P15 ...>
psrlw mm3,2 ; <(P17+P16)/4 junk ...> (w /2 frac bits)
movq mm5,[edi] ; <T07 T06 T05 T04 T03 T02 T01 T00>
psrlw mm0,2 ; <(P07+P06)/4 junk ...> (w/ 2 frac bits)
paddw mm3,mm0 ; <(P07+P06+P17+P16)/4 junk ...>
psrlw mm2,2 ; <P27/2 junk (P26+P25)/4 junk ...>
psubw mm2,[edi+ebp*1] ; <P27/2-T17 junk (P26+P25)/4-T15 junk ...>
psrlw mm1,2 ; <P17/2 junk (P16+P15)/4 junk ...>
paddw mm2,mm1 ; <(P17+P27)/2-T17 junk (P16+P15+P26+P25)/4-T15 junk ...>
psllw mm5,8 ; <T06 0 T04 0 T02 0 T00 0>
psubw mm5,mm3 ; <T06-(P07+P06+P17+P16)/4 junk ...>
pmaddwd mm2,mm2 ; Square diffs for odd pels of line 1.
pmaddwd mm5,mm5 ; Square diffs for even pels of line 0.
movq mm0,mm4 ; <P27 P26 P25 P24 P23 P22 P21 P20>
lea edi,[edi+ebp*2] ; Advance target cursor.
lea esi,[esi+ebp*2] ; Advance reference cursor.
paddusw mm6,mm2 ; Accumulate SSD for odd pels of line 1.
add al,080H ; Pass counter; carry is tested by the jnc below.
movq mm3,C0101010101010101
paddusw mm6,mm5 ; Accumulate SSD for even pels of line 0.
punpckldq mm4,mm6 ; Speculatively start to accum partial SWDs.
jnc HalfPelMEForNext2LinesOfBlockBothWays ; Twice, for half a block.
add al,040H ; Half-block counter; sign is tested by the jns below.
paddusw mm4,mm6 ; After whole block, SSD is in mm4[48:63].
psrlq mm4,48
jns HalfPelMEForNext2LinesOfBlockBothWays ; Twice, for a whole block.
movdf ebx,mm4 ; ebx -- SWD accumulated so far.
sub al,082H ; Clear counter bits, count down 2 blocks; sets flags.
jg HalfPelMBMEForLowerBlockBothWays ; Iterate twice for 2 blocks.
lea edi,[edi-PITCH*16+8] ; Back to the top, one block column to the right.
lea esi,[esi-PITCH*16+8]
mov al,3 ; Count for the second column; flags are unchanged.
je HalfPelMBMEForUpperBlockBothWays ; Iterate twice for macroblock.
sub edi,16
sub esi,16
ret
StackOffset TEXTEQU <0>
;============================================================================
; Register usage in the following internal function. This function is also
; called to do frame differencing for chroma blocks.
;
; ebp -- PITCH
; esi -- Address of reference block.
; edi -- Address of target block.
; edx -- Unavailable. In use by caller.
; ecx -- Not in use.
; ebx -- Motion vectors for the block. bl[0] indicates whether half-pel
; horizontal interpolation is required; bh[0] same for vertical.
; This register is then used for scratch purposes.
; eax -- Unavailable. In use by caller.
; mm0-mm5 -- Scratch
; mm6 -- 8 bytes of 0xFE
; mm7 -- 8 bytes of -1
;
; The routine stores difference lines 0..3 to PelDiffsLine0..3 and returns
; with the prediction for lines 4..7 in mm0..mm3 in a common "unfinished"
; form that the caller completes: mm0 is final, mm1 needs a right shift by
; 1, and mm2/mm3 need "pand mm6" followed by a right shift by 1. For H261
; only the integer pel path below is assembled.
StackOffset TEXTEQU <4>
DoNonOBMCDifferencing: ; Internal Function
pcmpeqb mm7,mm7 ; mm7 -- 8 bytes of -1.
pcmpeqb mm6,mm6
IFDEF H261
ELSE ;H263
shr bl,1 ; Carry -- half pel horizontal interpolation needed?
jc NonOBMCDiff_Horz
ENDIF
movq mm1,[esi+ebp*1] ; BC . . . R0Dn
paddb mm6,mm6 ; mm6 -- 8 bytes of 0xFE.
IFDEF H261
ELSE ;H263
shr bh,1 ; Carry -- half pel vertical interpolation needed?
jc NonOBMCDiff_Vert
ENDIF
; Integer pel motion vector: the prediction is the reference block itself.
psubb mm1,[edi+ebp*1] ; P1 - T1
pxor mm4,mm4
movq mm0,[edi] ; T0
psubb mm4,mm1 ; D1 = T1 - P1
psubb mm0,[esi] ; D0 = T0 - P0
movq mm2,[edi+ebp*2] ; T2
movq mm3,[edi+PITCH*3] ; T3
psubb mm2,[esi+ebp*2] ; D2 = T2 - P2
psubb mm3,[esi+PITCH*3] ; D3 = T3 - P3
movq PelDiffsLine0,mm0 ; Store D0.
movq PelDiffsLine1,mm4 ; Store D1.
movq PelDiffsLine2,mm2 ; Store D2.
movq PelDiffsLine3,mm3 ; Store D3.
; Lines 5..7 are doubled so that the caller's halving step restores them.
; (Doubling a byte drops its high bit, so this presumably relies on pels
; having the high bit clear -- confirm.)
movq mm3,[esi+PITCH*7] ; P7
movq mm2,[esi+PITCH*6] ; P6
paddb mm3,mm3 ; Double so that return will fix it.
movq mm1,[esi+PITCH*5] ; P5
paddb mm2,mm2 ; Double so that return will fix it.
movq mm0,[esi+ebp*4] ; P4
paddb mm1,mm1 ; Double so that return will fix it.
ret
IFDEF H261
ELSE ;H263
; Half pel vertical interpolation only. On entry mm1 holds ref line 1
; (loaded by DoNonOBMCDifferencing), mm6 = 8 bytes of 0xFE, mm7 = 8 bytes
; of -1. Lines 0..3 are differenced and stored here; the code then falls
; through into Get4MoreLinesOfPred_InterpVert, whose ret returns to the
; caller of DoNonOBMCDifferencing with the predictions for lines 4..7.
NonOBMCDiff_Vert: ; 0123 Detail for 0
movq mm0,[esi] ; C. . R0Up
psubb mm1,mm7 ; DD . R0Dn+1
call Get4LinesOfPred_InterpVert
movq mm5,[edi] ; T0
psrlq mm1,1 ; O .
movq mm7,[edi+ebp*1]
psubb mm5,mm0 ; D0 = T0 - P0
movq mm0,mm4 ; Ref line 4 is "R0Up" for the next 4 lines.
psubb mm7,mm1
movq mm1,[edi+ebp*2]
pand mm2,mm6 ; .N.
movq mm4,[edi+PITCH*3]
pand mm3,mm6 ; . N
psrlq mm2,1 ; .O.
movq PelDiffsLine0,mm5 ; Store D0.
psubb mm1,mm2
movq PelDiffsLine1,mm7 ; Store D1.
psrlq mm3,1 ; . O
movq PelDiffsLine2,mm1 ; Store D2.
psubb mm4,mm3
movq mm1,[esi+ebp*1] ; BC . . . R0Dn
pcmpeqb mm7,mm7 ; Restore mm7 to 8 bytes of -1.
movq PelDiffsLine3,mm4 ; Store D3.
psubb mm1,mm7 ; DD . . . R0Dn+1
; jmp Get4MoreLinesOfPred_InterpVert
;===========================================================================
; Internal function to get 4 lines of prediction, interpolating in the
; vertical direction. The first 3 lines of the function are scheduled into
; the caller's space, and so are commented out here. For 8 lines of prediction,
; a second call, to the second entry point, is called after consuming the
; outputs of the first function call. Certain registers must remain intact
; to convey information from the first call to the second.
;
; ebp -- PITCH
; edi -- Points to target block.
; esi -- Points to Upper left corner of 8 column, 9 row block that will be
; interpolated vertically to generate prediction.
; edx -- Reserved (MBlockActionStream)
; ecx -- Not in use.
; ebx -- Will be used.
; eax -- Reserved.
; mm6 -- 8 bytes of 0xFE.
; mm7 -- 8 bytes of -1.
; mm0-mm5 -- Scratch.
;
; At the ret: mm0 is the finished prediction for line 0; mm1 has been masked
; and still needs the shift; mm2 and mm3 still need mask and shift; mm4 holds
; ref line 4 unmodified; esi has been advanced by 4 lines.
StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
Get4LinesOfPred_InterpVert: ; 0123 Details for line 0
; movq mm1,[esi+ebp*1] ; BC . R0Dn
; movq mm0,[esi] ; C. . R0Up
; psubb mm1,mm7 ; DD . R0Dn+1
Get4MoreLinesOfPred_InterpVert:
movq mm2,[esi+ebp*2] ; BC.
paddb mm0,mm1 ; E. . R0Up+R0Dn+1
movq mm3,[esi+PITCH*3] ; .BC
paddb mm1,mm2 ; E .
movq mm4,[esi+ebp*4] ; . BC
psubb mm3,mm7 ; .DD
paddb mm2,mm3 ; .E.
pand mm0,mm6 ; F. . Pre-clean
paddb mm3,mm4 ; E
pand mm1,mm6 ; F .
lea esi,[esi+ebp*4] ; Advance to next four lines.
psrlq mm0,1 ; G. . P0 = (R0Up + R0Dn + 1) / 2
; pand mm2,mm6 ; G.
; psrlq mm1,1 ; H .
; pand mm3,mm6 ; G
; psrlq mm2,1 ; H.
; psrlq mm3,1 ; H
ret
StackOffset TEXTEQU <4>
;===========================================================================
; Half pel horizontal interpolation (and dispatch to the both-ways case).
; Reached from DoNonOBMCDifferencing before mm6 has been doubled, so mm6 is
; turned into 8 bytes of 0xFE here. Lines 0..3 are differenced and stored;
; the code then falls through into Get4MoreLinesOfPred_InterpHorz, whose ret
; returns to the caller of DoNonOBMCDifferencing with the predictions for
; lines 4..7.
NonOBMCDiff_Horz:
movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
paddb mm6,mm6 ; . . 8 bytes of 0xFE
shr bh,1 ; Carry -- vertical interpolation needed as well?
jc NonOBMCDiff_Both
; NOTE(review): mm7 is loaded here but is not read again in this path within
; the code shown -- possibly only a cache-warming load; confirm.
movq mm7,[edi+PITCH*3] ; T3
call Get4LinesOfPred_InterpHorz
movq mm4,[edi] ; T0
psrlq mm1,1 ; O .
movq mm5,[edi+ebp*1]
psubb mm4,mm0 ; D0 = T0 - P0
movq mm0,[edi+ebp*2]
psubb mm5,mm1
movq mm1,[edi+PITCH*3]
pand mm2,mm6 ; .N.
pand mm3,mm6 ; . N
psrlq mm2,1 ; .O.
movq PelDiffsLine0,mm4 ; Store D0.
psubb mm0,mm2
movq PelDiffsLine1,mm5 ; Store D1.
psrlq mm3,1 ; . O
movq PelDiffsLine2,mm0 ; Store D2.
psubb mm1,mm3
movq mm5,[esi+1] ; <R48 R47 R46 R45 R44 R43 R42 R41>
;
movq PelDiffsLine3,mm1 ; Store D3.
;
;===========================================================================
; Internal function to get 4 lines of prediction, interpolating in the
; horizontal direction. The first line of the function is scheduled into
; the caller's space, and so is commented out here. For 8 lines of prediction,
; a second call, to the second entry point, is called after consuming the
; outputs of the first function call. Certain registers must remain intact
; to convey information from the first call to the second.
;
; ebp -- PITCH
; edi -- Points to target block.
; esi -- Points to Upper left corner of 9 column, 8 row block that will be
; interpolated horizontally to generate prediction.
; edx -- Reserved (MBlockActionStream)
; ecx -- Not in use.
; ebx -- Will be used.
; eax -- Reserved.
; mm6 -- 8 bytes of 0xFE.
; mm0-mm5 -- Will be used.
;
; For each line the 8 pels at offset 1 are added to the same pels shifted up
; by one byte; the Pel_Rnd entry indexed by the pel at offset 0 is then
; added (per the step F comment it supplies the +1 rounding for every byte
; plus the missing leftmost pel in byte 0).
StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
Get4LinesOfPred_InterpHorz:
Get4MoreLinesOfPred_InterpHorz:
; movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
xor ebx,ebx ; . . Clear ebx so that bl can index Pel_Rnd.
movq mm0,mm5 ; B. . <R08 R07 R06 R05 R04 R03 R02 R01>
mov bl,[esi] ; C. . R00
psllq mm5,8 ; D. . <R07 R06 R05 R04 R03 R02 R01 0>
movq mm1,[esi+ebp*1+1] ; A .
paddb mm0,mm5 ; E. . <R08+R07 ... R02+R01 R01 >
paddb mm0,Pel_Rnd[ebx*8] ; F. . <R08+R07+1 ... R02+R01+1 R01+R00+1>
movq mm4,mm1 ; B .
mov bl,[esi+ebp*1] ; C .
psllq mm4,8 ; D .
movq mm2,[esi+ebp*2+1] ; A.
paddb mm1,mm4 ; E .
paddb mm1,Pel_Rnd[ebx*8] ; F .
movq mm5,mm2 ; B.
mov bl,[esi+ebp*2] ; C.
psllq mm5,8 ; D.
movq mm3,[esi+PITCH*3+1] ; A
paddb mm2,mm5 ; E.
paddb mm2,Pel_Rnd[ebx*8] ; F.
movq mm4,mm3 ; B
mov bl,[esi+PITCH*3] ; C
psllq mm4,8 ; D
paddb mm3,mm4 ; E
pand mm0,mm6 ; G. . pre-cleaned
paddb mm3,Pel_Rnd[ebx*8] ; F
psrlq mm0,1 ; H. . P0=<(R08+R07+1)/2 ... (R01+R00+1)/2>
lea esi,[esi+ebp*4] ; Advance to next four lines.
pand mm1,mm6 ; G .
; pand mm2,mm6 ; G.
; psrlq mm1,1 ; H .
; pand mm3,mm6 ; G
; psrlq mm2,1 ; H.
; psrlq mm3,1 ; H
ret
StackOffset TEXTEQU <4>
; The steps commented out above are scheduled into the mem-ops the caller has
; to do at the point of return. As though these ops were done, the registers
; look as follows:
; mm0 -- Prediction for line 0.
; mm1 -- Prediction for line 1.
; mm2 -- Prediction for line 2.
; mm3 -- Prediction for line 3.
; mm6 -- 8 bytes of 0xFE. Must be this when computing pred for next 4 lines.
NonOBMCDiff_Both:
; Differencing for the first four lines of a block whose prediction is
; interpolated in both directions (non-OBMC case).
;
; Calls Get4LinesOfPred_InterpBoth, completes the mask/shift steps that routine
; leaves to its caller (steps N and O for lines 1-3), subtracts the four
; predictions from the target lines at edi, and stores the differences in
; PelDiffsLine0..PelDiffsLine3. It then prepares mm0, mm5 and mm7 for the
; second pass and JUMPS (not calls) to Get4MoreLinesOfPred_InterpBoth, so that
; routine's ret returns to whoever called NonOBMCDiff_Both, with the
; predictions for lines 4-7 pending in mm0-mm3.
;
; NOTE(review): Get4LinesOfPred_InterpBoth expects mm5 to hold [esi+1] on
; entry; that load is not done here, so it is presumably done by the code that
; transfers control to this label -- confirm against the caller.
 call Get4LinesOfPred_InterpBoth
 movq mm7,[edi] ; T0
 psrlq mm1,1 ; O .
 psubb mm7,mm0 ; D0 = T0 - P0
 pand mm2,mm6 ; .N.
 movq mm0,[edi+ebp*1] ; T1
 psrlq mm2,1 ; .O.
 movq PelDiffsLine0,mm7 ; Store D0.
 psubb mm0,mm1 ; D1 = T1 - P1
 movq mm7,[edi+ebp*2] ; T2
 pand mm3,mm6 ; . N
 movq PelDiffsLine1,mm0 ; Store D1.
 psrlq mm3,1 ; . O
 movq mm1,[edi+PITCH*3] ; T3
 psubb mm7,mm2 ; D2 = T2 - P2
 psubb mm1,mm3 ; D3 = T3 - P3
 movq mm0,mm4 ; mm4 must be moved to mm0 for the next 4 lines.
 movq PelDiffsLine2,mm7 ; Store D2.
 paddb mm5,mm5 ; . . Prepare for use for next 4 lines.
 movq PelDiffsLine3,mm1 ; Store D3.
 pcmpeqb mm7,mm7 ; mm7 was used as scratch above; restore 8 bytes of -1.
 jmp Get4MoreLinesOfPred_InterpBoth
;===========================================================================
; Internal function to get 4 lines of prediction, interpolating in both
; directions. The first instruction of the function is scheduled into the
; caller's space, and so is commented out here. For 8 lines of prediction,
; a second call, to the second entry point, is made after consuming the
; outputs of the first function call. Certain registers must remain intact
; to convey information from the first call to the second.
;
; ebp -- PITCH
; edi -- Points to target block.
; esi -- Points to Upper left corner of 9*9 block that will be interpolated
; horizontally and vertically to generate prediction.
; edx -- Reserved (MBlockActionStream)
; ecx -- Not in use
; ebx -- Will be used.
; eax -- Reserved.
; mm6 -- 8 bytes of 0xFE.
; mm7 -- 8 bytes of -1.
; mm0-mm5 -- Scratch
;
; First entry (Get4LinesOfPred_InterpBoth): mm5 must already hold the 8 bytes
; at [esi+1] (step A of line 0, performed by the caller).
;
; Second entry (Get4MoreLinesOfPred_InterpBoth): skips steps B..I of line 0,
; because the horizontally interpolated fifth line from the previous pass is
; reused as line 0. The caller must have moved mm4 to mm0 and doubled mm5
; (see the register summary after the ret).
;
; Step letters (A..O) name the same operation applied to each line; the dot
; pattern in each comment shows which line the step belongs to.
StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
Get4LinesOfPred_InterpBoth: ; 01234 Details for line 0
; movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
 movq mm1,mm5 ; B. . <R08 R07 R06 R05 R04 R03 R02 R01>
 xor ebx,ebx ; . .
 mov bl,[esi] ; C. . R00
 psllq mm5,8 ; D. . <R07 R06 R05 R04 R03 R02 R01 0>
 paddb mm5,mm1 ; E. . <R08+R07 ... R02+R01 R01>
 paddb mm5,Pel_Rnd[ebx*8] ; F. . <R08+R07+1 ... R02+R01+1 R01+R00+1>
 movq mm0,mm6 ; G. . Mask to extract each pel's frac bit.
 pandn mm0,mm5 ; H. . <(R08+R07+1)&1 ...>
 pand mm5,mm6 ; I. . Pre-clean
Get4MoreLinesOfPred_InterpBoth: ; . .
 movq mm2,[esi+ebp*1+1] ; A .
 psrlq mm5,1 ; J. . <(R08+R07+1)/2 ... (R01+R00+1)/2)>
 xor ebx,ebx ; . .
 movq mm1,mm2 ; B .
 mov bl,[esi+ebp*1] ; C .
 psllq mm2,8 ; D .
 movq mm3,[esi+ebp*2+1] ; .A.
 paddb mm2,mm1 ; E .
 paddb mm2,Pel_Rnd[ebx*8] ; F .
 movq mm1,mm3 ; .B.
 mov bl,[esi+ebp*2] ; .C.
 psllq mm3,8 ; .D.
 movq mm4,[esi+PITCH*3+1] ; . A
 paddb mm3,mm1 ; .E.
 paddb mm3,Pel_Rnd[ebx*8] ; .F.
 movq mm1,mm4 ; . B
 mov bl,[esi+PITCH*3] ; . C
 pand mm0,mm2 ; K. . <(R08+R07+1)&(R18+R17+1)&1 ...>
 paddb mm0,mm5 ; L. . <(R08+R07+1+((R18+R17+1)&1))/2 ...>
 psllq mm4,8 ; . D
 movq mm5,[esi+ebp*4+1] ; . .A
 paddb mm4,mm1 ; . E
 paddb mm4,Pel_Rnd[ebx*8] ; . F
 movq mm1,mm5 ; . .B
 mov bl,[esi+ebp*4] ; . .C
 psllq mm5,8 ; . .D
 paddb mm5,mm1 ; . .E
 movq mm1,mm6 ; G .
 pandn mm1,mm2 ; H .
 pand mm2,mm6 ; I .
 paddb mm5,Pel_Rnd[ebx*8] ; . .F
 psrlq mm2,1 ; J .
 paddb mm0,mm2 ; M. . <(R08+R07+R18+R17+2)/2 ...>
 pand mm1,mm3 ; K .
 paddb mm1,mm2 ; L .
 movq mm2,mm6 ; .G.
 pandn mm2,mm3 ; .H.
 pand mm3,mm6 ; .I.
 pand mm0,mm6 ; N. . Pre-clean
 psrlq mm3,1 ; .J.
 paddb mm1,mm3 ; M .
 pand mm2,mm4 ; .K.
 paddb mm2,mm3 ; .L.
 movq mm3,mm6 ; . G
 pandn mm3,mm4 ; . H
 pand mm4,mm6 ; . I
 pand mm3,mm5 ; . K
 psrlq mm4,1 ; . J
 paddb mm2,mm4 ; .M.
 paddb mm3,mm4 ; . L
 movq mm4,mm6 ; . .G
 psrlq mm0,1 ; O. . P0 = <(R08+R07+R18+R17+2)/4 ...>
 pandn mm4,mm5 ; . .H
 pand mm5,mm6 ; . .I
 pand mm1,mm6 ; N .
 psrlq mm5,1 ; . .J
 paddb mm3,mm5 ; . M
 lea esi,[esi+ebp*4] ; Advance to next four lines.
; The remaining steps are left for the caller to perform:
; pand mm2,mm6 ; .N.
; psrlq mm1,1 ; O .
; pand mm3,mm6 ; . N
; psrlq mm2,1 ; .O.
; paddb mm5,mm5 ; . . Prepare for use for next 4 lines.
; psrlq mm3,1 ; . O
 ret
StackOffset TEXTEQU <4>
; The steps commented out above are scheduled into the mem-ops the caller has
; to do at the point of return. As though these ops were done, the registers
; look as follows:
; mm0 -- Prediction for line 0.
; mm1 -- Prediction for line 1.
; mm2 -- Prediction for line 2.
; mm3 -- Prediction for line 3.
; mm4 -- Must be moved to mm0 before computing prediction for next 4 lines.
; mm5 -- Must be doubled before computing prediction for next 4 lines.
; mm6 -- 8 bytes of 0xFE. Must be this when computing pred for next 4 lines.
;        (It is the pre-clean mask; its complement extracts the frac bit.)
; mm7 -- Not referenced by this function. The register list at the top gives
;        it as 8 bytes of -1.
;
; At the actual point of return: mm0 is complete, mm1 has been masked but not
; yet shifted, and mm2 and mm3 have been neither masked nor shifted.
;=============================================================================
ENDIF
StackOffset TEXTEQU <0>
IFDEF H261
ELSE ;H263
; OBMC differencing for a macroblock is deferred until the next macroblock
; has been reached. Arriving here marks the current macroblock as pending and
; finishes the previous one if it was pending.
OBMCDifferencing:
 mov bl,1
 mov al,PendingOBMC ; al = pending flag left by the previous macroblock.
 mov cl,INTER1MV
 mov PendingOBMC,bl ; This macroblock is now the pending one.
 test al,al
 je NextMacroBlock ; Previous macroblock had nothing pending.
 mov StashBlockType,cl
 call DoPendingOBMCDiff ; Finish the previous macroblock's luma blocks.
 mov al,IsPlainPFrame
 test al,al
 jne NextMacroBlock ; Plain P frame: no B-frame work to do.
 pxor mm7,mm7 ; Initialize SWD accumulator
 movq mm6,C0101010101010101
 add edx,-SIZEOF T_MacroBlockActionDescr ; Point at the previous macroblock.
 call MMxDoBFrameLumaBlocks
 sub edx,-SIZEOF T_MacroBlockActionDescr ; Back to the current macroblock.
 jmp NextMacroBlock
ENDIF
;============================================================================
; Calculate the IntraSWD
;
; ebp -- PITCH
; esi -- Accumulation for IntraSWD
; edi -- Address of target macroblock.
; edx -- MBlockActionStream
; ecx -- Scratch
; ebx -- Amount IntraSWD has to be less than to be the winner.
; eax -- Reserved. Holds coded blk pattern, (except undef when IntraByDecree).
; mm7 -- SWD total for macroblock.
; mm6 -- Average pel value for block 1.
; mm5 -- Average pel value for block 2.
; mm4 -- Average pel value for block 3.
; mm3 -- Average pel value for block 4.
; mm0-mm2 Scratch
;
; NOTE(review): the mm3-mm7 roles listed above do not match the loop below,
; which loads target lines into mm3-mm7 on every pass; the list looks stale --
; confirm.
;
; Two entry points:
;   IntraByDecree     -- forces ebx to 000080000H first, then falls through.
;   CalculateIntraSWD -- uses the ebx supplied by the caller.
;
; The loop runs once per luma block. cl tracks the block being measured, and
; the blocks are visited in the order 1, 4, 2, 3. After each block its measure
; is subtracted from ebx; as soon as ebx is no longer positive the code leaves
; via InterBestX. If all four blocks pass, control falls through to IntraBest
; with the accumulated measure in esi.
;
; Per block: 32 pels (alternating columns on alternating lines) are summed and
; scaled by 8, which leaves the average in the high byte of each word with a
; fraction in the low byte. Each line is then differenced against that value,
; the differences are squared with pmaddwd, accumulated with saturating word
; adds, and the top 16 bits of the total are taken as the block's measure.
;
IntraByDecree:
 mov ebx,000080000H ; Set Inter SWD artificially high.
CalculateIntraSWD:
 sub ebx,INTRACODINGDIFFERENTIAL
 mov cl,1 ; Start with block 1.
 movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
 pcmpeqb mm5,mm5 ; All ones; becomes the 0x00FF word mask below.
ComputeIntraSWDForNextBlock:
 movq mm2,[edi+ebp*2] ; <P27 P26 P25 P24 P23 P22 P21 P20>
 psrlw mm5,8 ; Four words of 0x00FF.
 movq mm4,[edi+ebp*4]
 paddw mm0,mm2 ; <junk P06+P26 junk P04+P24 ...>
 movq mm6,[edi+PITCH*6]
 pand mm0,mm5 ; <P06+P26 P04+P24 P02+P22 P00+P20>
 movq mm1,[edi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
 paddw mm4,mm6
 movq mm3,[edi+PITCH*3] ; <P37 P36 P35 P34 P33 P32 P31 P30>
 pand mm4,mm5
 movq mm5,[edi+PITCH*5]
 paddw mm1,mm3 ; <P17+P37 junk P15+P35 junk ...>
 movq mm7,[edi+PITCH*7]
 psrlw mm1,8 ; <P17+P37 P15+P35 P13+P33 P11+P31>
 paddw mm0,mm1
 paddw mm5,mm7
 paddw mm0,mm4
 psrlw mm5,8
 paddw mm0,mm5 ; Four partial sums covering all eight lines.
 pcmpeqw mm5,mm5 ; Get words of -1
 movq mm4,[edi+ebp*4]
 pmaddwd mm0,mm5 ; <SumHi = Sum3+Sum2 | SumLo = Sum1+Sum0>
 pcmpeqw mm1,mm1
 psllw mm3,8 ; <P36 0 P34 0 P32 0 P30 0>
 movq mm5,[edi+PITCH*5]
 psllw mm1,3 ; 4 words of 0xFFF8
 packssdw mm0,mm0 ; <SumHi | SumLo | SumHi | SumLo>
 mov al,[edx].CodedBlocks ; Fetch coded block pattern.
 pmaddwd mm0,mm1 ; <Sum = SumHi+SumLo | Sum = SumHi+SumLo>
 psllw mm5,8
 movq mm1,[edi+ebp*1]
 psllw mm7,8
 ;
 psllw mm1,8
 ;
 packssdw mm0,mm0 ; <Sum | Sum | Sum | Sum>
 psubw mm1,mm0 ; <P16-Avg frac P14-Avg frac ...>
 psubw mm2,mm0 ; <P27-Avg frac P25-Avg frac ...>
 pmaddwd mm1,mm1 ; Square of diff
 psubw mm3,mm0
 pmaddwd mm2,mm2
 psubw mm4,mm0
 pmaddwd mm3,mm3
 psubw mm5,mm0
 pmaddwd mm4,mm4
 psubw mm6,mm0
 psubw mm7,mm0
 paddusw mm1,mm2
 psubw mm0,[edi]
 pmaddwd mm5,mm5
 pmaddwd mm6,mm6
 paddusw mm1,mm3
 pmaddwd mm7,mm7
 paddusw mm1,mm4
 pmaddwd mm0,mm0
 paddusw mm1,mm5
 paddusw mm1,mm6
 cmp cl,2 ; Flags are consumed by the jg and je below; the
 ; ; intervening MMX, lea and mov ops leave them intact.
 paddusw mm1,mm7
 ;
 paddusw mm0,mm1
 ;
 punpckldq mm1,mm0
 ;
 paddusw mm0,mm1 ; Upper half now holds the sum of both halves.
 jg LowerBlkIntraDone ; cl is 3 or 4.
 psrlq mm0,48 ; Keep only the top word of the total.
 lea edi,[edi+ebp*8+8] ; Speculate going from blk 1 to blk 4
 mov cl,4
 je Blk2IntraDone ; cl was 2.
Blk1IntraDone:
 movdf esi,mm0 ; First block: this starts the accumulation.
 sub ebx,esi
 jle InterBestX
 movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
 pcmpeqb mm5,mm5
 jmp ComputeIntraSWDForNextBlock
LowerBlkIntraDone:
 psrlq mm0,48 ; Keep only the top word of the total.
 sub edi,PITCH*8 ; Speculate going from blk 4 to blk 2
 cmp cl,3
 je Blk3IntraDone
Blk4IntraDone:
 movdf ecx,mm0
 add esi,ecx ; Accumulate IntraSWD
 sub ebx,ecx
 jle InterBestX
 movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
 pcmpeqb mm5,mm5
 mov cl,2 ; Next: block 2.
 jmp ComputeIntraSWDForNextBlock
Blk2IntraDone:
 movdf ecx,mm0
 add esi,ecx ; Accumulate IntraSWD
 sub edi,16 ; Get to blk 3.
 sub ebx,ecx
 jle InterBestX
 movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
 pcmpeqb mm5,mm5
 mov cl,3 ; Next: block 3.
 jmp ComputeIntraSWDForNextBlock
Blk3IntraDone:
 movdf ecx,mm0
 add esi,ecx ; Accumulate IntraSWD
 sub ebx,ecx
 jle InterBestX
IntraBest:
; Reached by fall-through when all four blocks passed the intra test above.
; Records the intra decision in the macroblock action descriptor at edx
; (SWD, BlockType, zeroed luma MVs, CodedBlocks with bit 7 cleared), adds the
; SWD in esi to SWDTotal, finishes any pending OBMC work for the previous
; macroblock (H263 only), and then runs the forward DCT on the four luma
; blocks of the target macroblock.
 mov ecx,SWDTotal
 and al,07FH ; Turn off FORCE-INTRA bit.
 mov [edx].SWD,esi
 add ecx,esi ; Add to total.
 mov SWDTotal,ecx
 mov cl,INTRA
 mov [edx].BlockType,cl ; Indicate macroblock handling decision.
 xor ecx,ecx
 mov [edx].BlkY1.MVs,ecx ; Zero the MVs of all four luma blocks.
 mov [edx].BlkY2.MVs,ecx
 mov [edx].BlkY3.MVs,ecx
 mov [edx].BlkY4.MVs,ecx
 mov [edx].CodedBlocks,al
IFDEF H261
ELSE ;H263
 mov al,PendingOBMC ; Do Prev MB if it needs to be OBMC'ed.
 mov [edx].BestFullPelMBHMV,cl ; Kill MVs so extended EMV of other
 ; ; blocks will work right.
 dec al ; Zero (and ZF set) only if PendingOBMC was 1.
 mov [edx].BestFullPelMBVMV,cl ; mov leaves the flags from dec intact.
 jne @f
 mov PendingOBMC,al ; Go on to next MB, unless the prev MB
 ; ; needs to be finished (OBMC).
 mov cl,INTER1MV
 mov StashBlockType,cl
 call DoPendingOBMCDiff
 mov al,IsPlainPFrame
 test al,al
 jne @f
 add edx,-SIZEOF T_MacroBlockActionDescr ; Point at the previous macroblock.
 movq mm6,C0101010101010101
 pxor mm7,mm7 ; Initialize SWD accumulator
 call MMxDoBFrameLumaBlocks
 sub edx,-SIZEOF T_MacroBlockActionDescr ; Back to the current macroblock.
@@:
ENDIF
; Forward DCT of the four luma blocks, at offsets 0, 8, PITCH*8 and PITCH*8+8
; from TargetMacroBlockBaseAddr. After each call, bl shifted to the block's
; bit position is subtracted from CodedBlocks.
; NOTE(review): bl is presumably a flag returned by MMxDoForwardDCT (e.g. 1
; when the block turned out empty) -- confirm against that routine.
 mov cl,INTRA
 mov esi,TargetMacroBlockBaseAddr
 mov StashBlockType,cl
 push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>
 call MMxDoForwardDCT ; Block 1.
 mov al,[edx].CodedBlocks
 mov esi,TargetMacroBlockBaseAddr
 sub al,bl
 add esi,8
 mov [edx].CodedBlocks,al
 call MMxDoForwardDCT ; Block 2.
 shl bl,1
 mov al,[edx].CodedBlocks
 sub al,bl
 mov esi,TargetMacroBlockBaseAddr
 mov [edx].CodedBlocks,al
 add esi,PITCH*8
 call MMxDoForwardDCT ; Block 3.
 shl bl,2
 mov al,[edx].CodedBlocks
 sub al,bl
 mov esi,TargetMacroBlockBaseAddr
 mov [edx].CodedBlocks,al
 add esi,PITCH*8+8
 call MMxDoForwardDCT ; Block 4.
 shl bl,3
 mov al,[edx].CodedBlocks
 sub al,bl
 pop edi ; Adjust stack pointer
StackOffset TEXTEQU <0>
 mov [edx].CodedBlocks,al
IFDEF H261
ELSE
 mov al,IsPlainPFrame
 test al,al
 jne NextMacroBlock
 movq mm6,C0101010101010101
 pxor mm7,mm7 ; Initialize SWD accumulator
 call MMxDoBFrameLumaBlocks ; B-frame luma work for this macroblock.
ENDIF
 jmp NextMacroBlock
IFDEF H261
ELSE; H263
StackOffset TEXTEQU <4>
DoPendingOBMCDiff: ; Internal function
;============================================================================
; Perform differencing for the non-empty luma blocks of an Inter-coded
; macroblock. This is the OBMC case; i.e. Advanced Prediction is selected.
;
; Operates on the macroblock BEFORE the one edx points to (see PrevMBAD).
; For each of the four luma blocks whose bit is set in CodedBlocks, this
; routine sets up the inputs of DoOBMCForBlock:
;   esi -- the block's motion vectors,
;   ecx -- distance to the block action descriptor (BAD) of the left neighbor,
;   eax -- distance to the BAD of the right neighbor,
;   DistToBADforBlockAbove / DistToBADforBlockBelow -- same, for the vertical
;   neighbors (a distance of 0 below means "this block itself"),
; calls it, and then subtracts bl, shifted to the block's bit position, from
; CodedBlocks.
; NOTE(review): bl is presumably a flag left by the DCT that DoOBMCForBlock
; jumps to on exit -- confirm.
;
; edx is advanced by SIZEOF T_Blk after each block so that PrevMBAD.BlkY1
; addresses the block being processed; fields of the macroblock itself are
; then reached with a compensating negative offset. edx is restored on exit.
; mm6 and mm7 are set to 8 bytes of 0xFE and 8 bytes of -1 respectively.
PrevMBAD EQU [edx-SIZEOF T_MacroBlockActionDescr]
 pcmpeqb mm6,mm6
 pcmpeqb mm7,mm7 ; 8 bytes of -1
 paddb mm6,mm6 ; 8 bytes of 0xFE
 mov al,PrevMBAD.CodedBlocks ; Bits 0- 3 set for non-empty Y blks.
 test al,1 ; Check if block 1 empty.
 je OBMCDoneForBlock1
 xor ebx,ebx
 mov eax,SIZEOF T_Blk ; Blk to right is blk 2 of this MB.
 mov bl,PrevMBAD.MBEdgeType
 mov ecx,1 ; Mask to extract left edge indicator.
 and ecx,ebx ; Extract left edge indicator.
 and ebx,4 ; Extract top edge indicator.
 mov esi,PrevMBAD.BlkY1.MVs
 lea edi,[eax*2] ; Blk below is blk 3 of this MB.
 mov DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.
 mov edi,BlockAbove[ebx] ; Blk above is blk 3 of mb above, or off
 ; ; upper edge.
 mov ecx,BlockToLeft[ecx*4] ; Blk to left is blk 2 of mb to the
 ; ; left, or off left edge.
 mov DistToBADforBlockAbove,edi
 call DoOBMCForBlock
 mov al,PrevMBAD.CodedBlocks ; Bits 0- 3 set for non-empty Y blks.
 sub al,bl
 mov PrevMBAD.CodedBlocks,al
OBMCDoneForBlock1:
 add edx,SIZEOF T_Blk ; PrevMBAD.BlkY1 now addresses block 2.
 test al,2 ; Check if block 2 empty.
 je OBMCDoneForBlock2
 xor ebx,ebx
 mov eax,2 ; Mask to extract right edge indicator.
 mov bl,PrevMBAD[-SIZEOF T_Blk].MBEdgeType
 mov edi,2*SIZEOF T_Blk ; Blk below is blk 4 of this MB.
 and eax,ebx ; Extract right edge indicator.
 and ebx,4 ; Extract top edge indicator.
 mov DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.
 lea ecx,[edi-3*SIZEOF T_Blk] ; Blk to left is blk 1 of this MB.
 mov eax,BlockToRight[eax*2] ; Blk to right is blk 1 of mb to the
 ; ; right, or off right edge.
 mov edi,BlockAbove[ebx] ; Blk above is blk 4 of mb above, or off
 ; ; upper edge.
 mov esi,PrevMBAD.BlkY1.MVs
 mov DistToBADforBlockAbove,edi
 call DoOBMCForBlock
 shl bl,1
 mov al,PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks
 sub al,bl
 mov PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks,al
OBMCDoneForBlock2:
 add edx,SIZEOF T_Blk ; PrevMBAD.BlkY1 now addresses block 3.
 test al,4 ; Check if block 3 empty.
 je OBMCDoneForBlock3
 xor ecx,ecx
 xor ebx,ebx ; Blk below is this block.
 mov cl,PrevMBAD[-2*SIZEOF T_Blk].MBEdgeType
 mov eax,SIZEOF T_Blk ; Blk to right is blk 4 of this MB.
 and ecx,1 ; Extract left edge indicator.
 mov DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.
 lea edi,[eax-3*SIZEOF T_Blk] ; Blk above is blk 1 of this MB.
 mov esi,PrevMBAD.BlkY1.MVs
 mov DistToBADforBlockAbove,edi
 mov ecx,BlockToLeft[ecx*4] ; Blk to left is presumably blk 4 of mb to
 ; ; the left (same table offset as blk 1 uses
 ; ; to reach blk 2), or off left edge.
 ; ; NOTE(review): confirm against table.
 call DoOBMCForBlock
 shl bl,2
 mov al,PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks
 sub al,bl
 mov PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks,al
OBMCDoneForBlock3:
 add edx,SIZEOF T_Blk ; PrevMBAD.BlkY1 now addresses block 4.
 test al,8 ; Check if block 4 empty.
 je OBMCDoneForBlock4
 xor eax,eax
 xor ebx,ebx ; Blk below is this block.
 mov al,PrevMBAD[-3*SIZEOF T_Blk].MBEdgeType
 mov ecx,-SIZEOF T_Blk ; Blk to left is blk 3 of this MB.
 and eax,2 ; Extract right edge indicator.
 mov DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.
 lea edi,[ecx*2] ; Blk above is blk 2 of this MB.
 mov esi,PrevMBAD.BlkY1.MVs
 mov DistToBADforBlockAbove,edi
 mov eax,BlockToRight[eax*2] ; Blk to right is presumably blk 3 of mb to
 ; ; the right (same table offset as blk 2 uses
 ; ; to reach blk 1), or off right edge.
 ; ; NOTE(review): confirm against table.
 call DoOBMCForBlock
 shl bl,3
 mov al,PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks
 sub al,bl
 mov PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks,al
OBMCDoneForBlock4:
 sub edx,3*SIZEOF T_Blk ; Get back to MacroBlock Action Descriptor
 ret
StackOffset TEXTEQU <8>
DoOBMCForBlock: ; Internal Function
; Present register contents.
; ebp -- PITCH
; esi -- Motion vectors for current block.
; ecx -- Distance from BAD of blk we're doing to BAD for block that provides
; remote MV from left.
; eax -- Distance from BAD of blk we're doing to BAD for block that provides
; remote MV from right.
; edx -- MBlockActionStream, adjusted to reach the BAD of the blk to which we
; are doing OBMC.
; mm7 -- 8 bytes of -1.
; mm6 -- 8 bytes of 0xFE.
;
; In the body of this code:
;
; edx -- Unchanged.
; edi -- Saved to memory. Then used for address of destination for storing
; remote prediction blocks.
; ebp -- PITCH.
; esi -- Pointer to 8*8, 8*9, 9*8, or 9*9 remote reference areas, which are
; then interpolated and stored at edi.
; ecx, eax -- Inputs are used, then these are scratch.
; ebx -- Scratch
; mm7 -- 8 bytes of -1
; mm6 -- 8 bytes of 0xFE
; mm0-mm5 -- Scratch
; Compute left remote prediction block.
lea edi,PrevMBAD[ecx]
and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to left.
lea ebx,CentralPred
mov AddrOfLeftPred,ebx ; Speculate that left remote MV == center MV.
mov AddrOfRightPred,ebx ; Speculate that right remote MV == center MV.
mov bl,[edi].BlockType
cmp bl,INTRA
je LeftEqCtr ; Jump if INTRA. (Use central)
mov ebx,PrevMBAD[ecx].BlkY1.MVs
and ebx,00000FFFFH ; Blk to left may have B MVs set. Clear them.
cmp esi,ebx
je LeftEqCtr
mov edi,PrevMBAD[ecx].BlkY1.BlkOffset
mov esi,PrevMBAD[ecx].BlkY1.PastRef ; Get ref addr using left remote.
sub esi,edi
mov edi,PrevMBAD.BlkY1.BlkOffset
add esi,edi
lea edi,LeftPred
call GetPredForCenterLeftOrRight
pand mm2,mm6
psrlq mm1,1
movq [edi+32],mm0
psrlq mm2,1
movq [edi+40],mm1
pand mm3,mm6
movq [edi+48],mm2
psrlq mm3,1
lea ecx,PrevMBAD[eax]
and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.
mov esi,PrevMBAD.BlkY1.MVs
movq [edi+56],mm3
pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
; Compute right remote prediction block.
mov AddrOfLeftPred,edi
mov bl,[ecx].BlockType
cmp bl,INTRA
je RightEqCtrButLeftNeCtr ; Jump if INTRA.(Use central)
mov ebx,PrevMBAD[eax].BlkY1.MVs
cmp esi,ebx
je RightEqCtrButLeftNeCtr
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote.
mov edi,PrevMBAD[eax].BlkY1.BlkOffset
RightNeCtr:
sub esi,edi
mov edi,PrevMBAD.BlkY1.BlkOffset
add esi,edi
lea edi,RightPred
call GetPredForCenterLeftOrRight
pand mm2,mm6
psrlq mm1,1
movq [edi+32],mm0
psrlq mm2,1
movq [edi+40],mm1
pand mm3,mm6
movq [edi+48],mm2
psrlq mm3,1
mov AddrOfRightPred,edi
;
movq [edi+56],mm3
pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
RightEqCtrButLeftNeCtr:
; Compute central prediction block.
mov ebx,PrevMBAD.BlkY1.MVs
mov esi,PrevMBAD.BlkY1.PastRef
lea edi,CentralPred
mov eax,DistToBADforBlockBelow
call GetPredForCenterLeftOrRight
pand mm2,mm6
psrlq mm1,1
movq [edi+32],mm0
psrlq mm2,1
movq [edi+40],mm1
pand mm3,mm6
movq [edi+48],mm2
psrlq mm3,1
lea ecx,PrevMBAD[eax]
and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.
mov esi,PrevMBAD.BlkY1.MVs
movq [edi+56],mm3
pcmpeqb mm7,mm7
mov bl,[ecx].BlockType
mov ecx,PrevMBAD.BlkY1.BlkOffset
cmp bl,INTRA
je BelowEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
; Compute bottom remote prediction block.
mov ebx,PrevMBAD[eax].BlkY1.MVs
mov edi,AddrOfLeftPred
cmp esi,ebx
jne BelowNeCtr
BelowEqCtrButSidesDiffer:
paddb mm1,mm1 ; Prep mm0-3, which have ctr, for reuse below.
paddb mm2,mm2
paddb mm3,mm3
mov edi,AddrOfLeftPred
jmp BelowEqCtr
BelowNeCtr:
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
mov eax,PrevMBAD[eax].BlkY1.BlkOffset
sub esi,eax
lea eax,[ecx+ebp*4]
call GetPredForAboveOrBelow
BelowEqCtr:
; Compute difference for lines 4 thru 7.
; Lines 4 and 5: Cols 0,1,6, and 7 treated same. Cols 2-5 treated same.
mov esi,AddrOfRightPred
mov ebx,TargetFrameBaseAddress
movdt mm5,[edi+48] ; 6B: < 0 0 0 0 R63 R62 R61 R60>
pand mm2,mm6
punpckldq mm5,[esi+48+4] ; 6C: <L67 L66 L65 L64 R63 R62 R61 R60>
pand mm3,mm6
movq mm4,CFFFF00000000FFFF ; 6D: < FF FF 00 00 00 00 FF FF>
psrlq mm2,1 ; 6A: <B67 B66 B65 B64 B63 B62 B61 B60>
pand mm4,mm5 ; 6E: <L67 L66 00 00 00 00 R61 R60>
paddb mm5,mm2 ; 6F: <B67+L67 ... B65+L65 ...>
pand mm2,C0000FFFFFFFF0000 ; 6G: < 00 00 B65 B64 B63 B62 00 00>
psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>
paddb mm2,mm4 ; 6H: <L67 L66 B65 B64 B63 B62 R61 R60>
add ecx,ebx ; Address of target block.
movdt mm4,[edi+56] ; 7B: < 0 0 0 0 R73 R72 R71 R70>
psubb mm5,mm2 ; 6I: <B67 B66 L65 L64 R63 R62 B61 B60>
paddb mm5,CentralPred+48 ; 6J: <C67+B67 ... C65+L65 ...>
psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>
punpckldq mm4,[esi+56+4] ; 7C: <L77 L76 L75 L74 R73 R72 R71 R70>
pand mm5,mm6 ; 6K: <C67+B67 ... C65+L65 ...> pre-cleaned
mov eax,DistToBADforBlockAbove
psrlq mm5,1 ; 6L: <(C67+B67)/2 ... (C65+L65)/2 ...>
paddb mm2,mm5 ; 6M: <(C67+B67+2L67)/2 ...
; ; (C65+2B65+L65)/2 ...>
lea ebx,PelDiffs
movq mm5,CFF000000000000FF ; 7D: < FF 00 00 00 00 00 00 FF>
pand mm2,mm6 ; 6N: pre-cleaned
pandn mm5,CentralPred+56 ; 7E: < 00 C76 C75 C74 C73 C72 C71 00>
psrlq mm2,1 ; 6O: <(C67+B67+2L67)/4 ...
; ; (C65+2B65+L65)/4 ...>
paddb mm2,CentralPred+48 ; 6P: <(5C67+B67+2L67)/4 ...
; ; (5C65+2B65+L65)/4 ...>
paddb mm5,mm4 ; 7F: <L77 C76+L76 ...>
pand mm4,CFF000000000000FF ; 7G: <L77 00 00 00 00 00 00 L70>
psubb mm2,mm7 ; 6Q: <(5C67+B67+2L67+4)/4 ...
; ; (5C65+2B65+L65+4)/4 ...>
paddb mm4,mm5 ; 7H: <2L77 C76+L76 ...>
pand mm2,mm6 ; 6R: pre-cleaned
movq mm5,[ecx+PITCH*6] ; 6T: T6
psrlq mm2,1 ; 6S: P6 = <(5C67+B67+2L67+4)/8 ...
; ; (5C65+2B65+L65+4)/8 ...>
psubb mm5,mm2 ; 6U: D6 = T6 - P6
;
; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0
movdt mm2,[edi+32] ; 4B: < 0 0 0 0 R43 R42 R41 R40>
pand mm4,mm6 ; 7I: <2L77 C76+L76 ...> pre-cleaned
movq [ebx+6*16],mm5 ; 6V: Store D6.
psrlq mm4,1 ; 7J: <2L77/2 (C76+L76)/2 ...>
punpckldq mm2,[esi+32+4] ; 4C: <L47 L46 L45 L44 R43 R42 R41 R40>
paddb mm3,mm4 ; 7K: <(2B77+2L77)/2 (C76+2B76+L76)/2 ...>
movq mm5,CFFFF00000000FFFF ; 4D: < FF FF 00 00 00 00 FF FF>
pand mm3,mm6 ; 7L: pre-cleaned
movq mm4,CentralPred+32 ; 4E: <C47 C46 C45 C44 C43 C42 C41 C40>
psrlq mm3,1 ; 7M: <(2B77+2L77)/4 (C76+2B76+L76)/4 ...>
paddb mm3,CentralPred+56 ; 7N: <(4C77+2B77+2L77)/4
; ; (5C76+2B76+L76)/4 ...>
pand mm5,mm4 ; 4F: <C47 C46 00 00 00 00 C41 C40>
psubb mm3,mm7 ; 7O: <(4C77+2B77+2L77+4)/4
; ; (5C76+2B76+L76+4)/4 ...>
paddb mm4,mm2 ; 4G: <C47+L47 ... C45+L45 ...>
pand mm2,C0000FFFFFFFF0000 ; 4H: < 00 00 L45 L44 R43 R42 00 00>
pand mm3,mm6 ; 7P: <(4C77+2B77+2L77+4)/4
; ; (5C76+2B76+L76+4)/4 ...> pre-cleaned
paddb mm2,mm5 ; 4I: <C47 C46 L45 L44 R43 R42 C41 C40>
psrlq mm3,1 ; 7Q: P7 = <(4C77+2B77+2L77+4)/8
; ; (5C76+2B76+L76+4)/8 ...>
movdt mm5,[edi+40] ; 5B: < 0 0 0 0 R53 R52 R51 R50>
psubb mm4,mm2 ; 4J: <L47 L46 C45 C44 C43 C42 R41 R40>
punpckldq mm5,[esi+40+4] ; 5C: <L57 L56 L55 L54 R53 R52 R51 R50>
paddb mm0,mm2 ; 4K: <C47+B47 ... B45+L45 ...>
movq mm2,[ecx+PITCH*7] ; 7R: T7
pand mm0,mm6 ; 4L: <C47+B47 ... B45+L45 ...> pre-cleaned
psubb mm2,mm3 ; 7S: D7 = T7 - P7
psrlq mm0,1 ; 4M: <(C47+B47)/2 ... (B45+L45)/2 ...>
movq mm3,CFFFF00000000FFFF ; 5D: < FF FF 00 00 00 00 FF FF>
paddb mm0,mm4 ; 4N: <(C47+B47+2L47)/2 ...
; ; (2C45+B45+L45)/2 ...>
movq mm4,CentralPred+40 ; 5E: <C57 C56 C55 C54 C53 C52 C51 C50>
pand mm0,mm6 ; 4O: pre-cleaned
pand mm3,mm4 ; 5F: <C57 C56 00 00 00 00 C51 C50>
paddb mm4,mm5 ; 5G: <C57+L57 ... C55+L55 ...>
pand mm5,C0000FFFFFFFF0000 ; 5H: < 00 00 L55 L54 R53 R52 00 00>
psrlq mm0,1 ; 4P: <(C47+B47+2L47)/4 ...
; ; (2C45+B45+L45)/4 ...>
paddb mm0,CentralPred+32 ; 4Q: <(5C47+B47+2L47)/4 ...
; ; (6C45+B45+L45)/4 ...>
paddb mm5,mm3 ; 5I: <C57 C56 L55 L54 R53 R52 C51 C50>
psubb mm4,mm5 ; 5J: <L57 L56 C55 C54 C53 C52 R51 R50>
paddb mm1,mm5 ; 5K: <C57+B57 ... B55+L55 ...>
pand mm1,mm6 ; 5L: <C57+B57 ... B55+L55 ...> pre-cleaned
psubb mm0,mm7 ; 4R: <(5C47+B47+2L47+4)/4 ...
; ; (6C45+B45+L45+4)/4 ...>
pand mm0,mm6 ; 4S: pre-cleaned
psrlq mm1,1 ; 5M: <(C57+B57)/2 ... (B55+L55)/2 ...>
paddb mm1,mm4 ; 5N: <(C57+B57+2L57)/2 ...
; ; (2C55+B55+L55)/2 ...>
psrlq mm0,1 ; 4T: P4 = <(5C47+B47+2L47+4)/8 ...
; ; (6C45+B45+L45+4)/8 ...>
movq mm3,[ecx+PITCH*5] ; 5U: T5
pand mm1,mm6 ; 5O: pre-cleaned
movq mm4,[ecx+ebp*4] ; 4U: T4
psrlq mm1,1 ; 5P: <(C57+B57+2L57)/4 ...
; ; (2C55+B55+L55)/4 ...>
paddb mm1,CentralPred+40 ; 5Q: <(5C57+B57+2L57)/4 ...
; ; (6C55+B55+L55)/4 ...>
psubb mm4,mm0 ; 4V: D4 = T4 - P4
lea esi,PrevMBAD[eax]
psubb mm1,mm7 ; 5R: <(5C57+B57+2L57+4)/4 ...
; ; (6C55+B55+L55+4)/4 ...>
and esi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.
pand mm1,mm6 ; 5S: pre-cleaned
movq [ebx+7*16],mm2 ; 7T
psrlq mm1,1 ; 5T: P5 = <(5C57+B57+2L57+4)/8 ...
; ; (6C55+B55+L55+4)/8 ...>
movq [ebx+4*16],mm4 ; 4W: Store D4.
psubb mm3,mm1 ; 5V: D5 = T5 - P5
mov cl,[esi].BlockType ; Bottom bit set if above neighbor is INTRA.
mov esi,PrevMBAD.BlkY1.MVs
movq [ebx+5*16],mm3 ; 5W: Store D5.
cmp cl,INTRA
je AboveEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
; Compute top remote prediction block.
mov ebx,PrevMBAD[eax].BlkY1.MVs
and ebx,00000FFFFH ; Blk above may have B MVs set. Clear them.
mov ecx,PrevMBAD.BlkY1.BlkOffset
cmp esi,ebx
jne AboveNeCtr
AboveEqCtrButSidesDiffer:
movq mm3,CentralPred+24 ; Prep mm0-3, which have ctr, for reuse below.
movq mm2,CentralPred+16
paddb mm3,mm3
movq mm1,CentralPred+8
paddb mm2,mm2
movq mm0,CentralPred
paddb mm1,mm1
mov ecx,PrevMBAD.BlkY1.BlkOffset
jmp AboveEqCtr
AboveNeCtr:
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
mov eax,PrevMBAD[eax].BlkY1.BlkOffset
sub esi,eax
mov eax,ecx
call GetPredForAboveOrBelow
AboveEqCtr:
; Compute difference for lines 0 thru 3.
mov esi,AddrOfRightPred
mov ebx,TargetFrameBaseAddress
movdt mm5,[edi+8] ; 1B: < 0 0 0 0 R13 R12 R11 R10>
psrlq mm1,1 ; 1A: <A17 A16 A15 A14 A13 A12 A11 A10>
punpckldq mm5,[esi+8+4] ; 1C: <L17 L16 L15 L14 R13 R12 R11 R10>
pand mm3,mm6
movq mm4,CFFFF00000000FFFF ; 1D: < FF FF 00 00 00 00 FF FF>
psrlq mm3,1 ; 3A: <A37 A36 A35 A34 A33 A32 A31 A30>: mm0
pand mm4,mm5 ; 1E: <L17 L16 00 00 00 00 R11 R10>
paddb mm5,mm1 ; 1F: <A17+L17 ... A15+L15 ...>
pand mm1,C0000FFFFFFFF0000 ; 1G: < 00 00 A15 A14 A13 A12 00 00>
pand mm2,mm6
paddb mm5,CentralPred+8 ; 1H: <C17+A17+L17 ... C15+A15+L15 ...>
paddb mm1,mm4 ; 1I: <L17 L16 A15 A14 A13 A12 R11 R10>
; 0A: <A07 A06 A05 A04 A03 A02 A01 A00>:mm0
movdt mm4,[edi] ; 0B: < 0 0 0 0 R03 R02 R01 R00>
psubb mm5,mm1 ; 1J: <C17+A17 ... C15+L15 ...>
punpckldq mm4,[esi+4] ; 0C: <L07 L06 L05 L04 R03 R02 R01 R00>
pand mm5,mm6 ; 1K: <C17+A17 ... C15+L15 ...> pre-cleaned
add ecx,ebx ; Address of target block.
psrlq mm5,1 ; 1L: <(C17+A17)/2 ... (C15+L15)/2 ...>
paddb mm1,mm5 ; 1M: <(C17+A17+2L17)/2 ...
; ; (C15+2A15+L15)/2 ...>
psrlq mm2,1 ; 2A: <A27 A26 A25 A24 A23 A22 A21 A20>
movq mm5,CFF000000000000FF ; 0D: < FF 00 00 00 00 00 00 FF>
pand mm1,mm6 ; 1N: pre-cleaned
pandn mm5,CentralPred ; 0E: < 00 C06 C05 C04 C03 C02 C01 00>
psrlq mm1,1 ; 1O: <(C17+A17+2L17)/4 ...
; ; (C15+2A15+L15)/4 ...>
paddb mm1,CentralPred+8 ; 1P: <(5C17+A17+2L17)/4 ...
; ; (5C15+2A15+L15)/4 ...>
paddb mm5,mm4 ; 0F: <L07 C06+L06 ...>
pand mm4,CFF000000000000FF ; 0G: <L07 00 00 00 00 00 00 L00>
psubb mm1,mm7 ; 1Q: <(5C17+A17+2L17+4)/4 ...
; ; (5C15+2A15+L15+4)/4 ...>
paddb mm4,mm5 ; 0H: <2L07 C06+L06 ...>
pand mm1,mm6 ; 1R: pre-cleaned
movq mm5,[ecx+ebp*1] ; 1T: T1
psrlq mm1,1 ; 1S: P1 = <(5C17+A17+2L17+4)/8 ...
; ; (5C15+2A15+L15+4)/8 ...>
psubb mm5,mm1 ; 1U: D1 = T1 - P1
;
movdt mm1,[edi+24] ; 3B: < 0 0 0 0 R33 R32 R31 R30>
pand mm4,mm6 ; 0I: <2L07 C06+L06 ...> pre-cleaned
movq PelDiffsLine1,mm5 ; 1V: Store D1.
psrlq mm4,1 ; 0J: <2L07/2 (C06+L06)/2 ...>
punpckldq mm1,[esi+24+4] ; 3C: <L37 L36 L35 L34 R33 R32 R31 R30>
paddb mm0,mm4 ; 0K: <(2A07+2L07)/2 (C06+2A06+L06)/2 ...>
movq mm5,CFFFF00000000FFFF ; 3D: < FF FF 00 00 00 00 FF FF>
pand mm0,mm6 ; 0L: pre-cleaned
movq mm4,CentralPred+24 ; 3E: <C37 C36 C35 C34 C33 C32 C31 C30>
psrlq mm0,1 ; 0M: <(2A07+2L07)/4 (C06+2A06+L06)/4 ...>
paddb mm0,CentralPred ; 0N: <(4C07+2A07+2L07)/4
; ; (5C06+2A06+L06)/4 ...>
pand mm5,mm4 ; 3F: <C37 C36 00 00 00 00 C31 C30>
psubb mm0,mm7 ; 0O: <(4C07+2A07+2L07+4)/4
; ; (5C06+2A06+L06+4)/4 ...>
paddb mm4,mm1 ; 3G: <C37+L37 ... C35+L35 ...>
pand mm1,C0000FFFFFFFF0000 ; 3H: < 00 00 L35 L34 R33 R32 00 00>
pand mm0,mm6 ; 0P: <(4C07+2A07+2L07+4)/4
; ; (5C06+2A06+L06+4)/4 ...> pre-cleaned
paddb mm1,mm5 ; 3I: <C37 C36 L35 L34 R33 R32 C31 C30>
psrlq mm0,1 ; 0Q: P0 = <(4C07+2A07+2L07+4)/8
; ; (5C06+2A06+L06+4)/8 ...>
movdt mm5,[edi+16] ; 2B: < 0 0 0 0 R23 R22 R21 R20>
psubb mm4,mm1 ; 3J: <L37 L36 C35 C34 C33 C32 R31 R30>
punpckldq mm5,[esi+16+4] ; 2C: <L27 L26 L25 L24 R23 R22 R21 R20>
paddb mm3,mm1 ; 3K: <C37+A37 ... A35+L35 ...>
movq mm1,[ecx] ; 0R: T0
pand mm3,mm6 ; 3L: <C37+A37 ... A35+L35 ...> pre-cleaned
psubb mm1,mm0 ; 0S: D0 = T0 - P0
psrlq mm3,1 ; 3M: <(C37+A37)/2 ... (A35+L35)/2 ...>
movq mm0,CFFFF00000000FFFF ; 2D: < FF FF 00 00 00 00 FF FF>
paddb mm3,mm4 ; 3N: <(C37+A37+2L37)/2 ...
; ; (2C35+A35+L35)/2 ...>
movq mm4,CentralPred+16 ; 2E: <C27 C26 C25 C24 C23 C22 C21 C20>
pand mm3,mm6 ; 3O: pre-cleaned
pand mm0,mm4 ; 2F: <C27 C26 00 00 00 00 C21 C20>
paddb mm4,mm5 ; 2G: <C27+L27 ... C25+L25 ...>
pand mm5,C0000FFFFFFFF0000 ; 2H: < 00 00 L25 L24 R23 R22 00 00>
psrlq mm3,1 ; 3P: <(C37+A37+2L37)/4 ...
; ; (2C35+A35+L35)/4 ...>
paddb mm3,CentralPred+24 ; 3Q: <(5C37+A37+2L37)/4 ...
; ; (6C35+A35+L35)/4 ...>
paddb mm5,mm0 ; 2I: <C27 C26 L25 L24 R23 R22 C21 C20>
psubb mm4,mm5 ; 2J: <L27 L26 C25 C24 C23 C22 R21 R20>
paddb mm2,mm5 ; 2K: <C27+A27 ... A25+L25 ...>
pand mm2,mm6 ; 2L: <C27+A27 ... A25+L25 ...> pre-cleaned
psubb mm3,mm7 ; 3R: <(5C37+A37+2L37+4)/4 ...
; ; (6C35+A35+L35+4)/4 ...>
pand mm3,mm6 ; 3S: pre-cleaned
psrlq mm2,1 ; 2M: <(C27+A27)/2 ... (A25+L25)/2 ...>
paddb mm2,mm4 ; 2N: <(C27+A27+2L27)/2 ...
; ; (2C25+A25+L25)/2 ...>
psrlq mm3,1 ; 3T: P3 = <(5C37+A37+2L37+4)/8 ...
; ; (6C35+A35+L35+4)/8 ...>
movq mm0,[ecx+ebp*2] ; 2U: T2
pand mm2,mm6 ; 2O: pre-cleaned
movq mm4,[ecx+PITCH*3] ; 3U: T3
psrlq mm2,1 ; 2P: <(C27+A27+2L27)/4 ...
; ; (2C25+A25+L25)/4 ...>
paddb mm2,CentralPred+16 ; 2Q: <(5C27+A27+2L27)/4 ...
; ; (6C25+A25+L25)/4 ...>
psubb mm4,mm3 ; 3V: D3 = T3 - P3
movq PelDiffsLine0,mm1 ; 0T
psubb mm2,mm7 ; 2R: <(5C27+A27+2L27+4)/4 ...
; ; (6C25+A25+L25+4)/4 ...>
movq PelDiffsLine3,mm4 ; 3W: Store D3.
pand mm2,mm6 ; 2S: pre-cleaned
psrlq mm2,1 ; 2T: P2 = <(5C27+A27+2L27+4)/8 ...
; ; (6C25+A25+L25+4)/8 ...>
;
psubb mm0,mm2 ; 2V: D2 = T2 - P2
;
;
;
movq PelDiffsLine2,mm0 ; 2W: Store D2.
;
jmp MMxDoForwardDCTy ; Block is in PelDiffs block; Pitch is 16
LeftEqCtr:
; Left remote motion vector was same as center.
; Compute right remote prediction block.
lea edi,PrevMBAD[eax]
and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.
mov esi,PrevMBAD.BlkY1.MVs
;
;
mov cl,[edi].BlockType
mov ebx,PrevMBAD[eax].BlkY1.MVs
cmp cl,INTRA
je LeftEqCtrAndRightEqCtr ; Jump if INTRA. (Use central)
cmp esi,ebx
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote.
mov edi,PrevMBAD[eax].BlkY1.BlkOffset
jne RightNeCtr
; Left and right remote motion vectors were same as center.
; Compute central prediction block.
LeftEqCtrAndRightEqCtr:
mov ebx,PrevMBAD.BlkY1.MVs
mov esi,PrevMBAD.BlkY1.PastRef
lea edi,CentralPred
mov eax,DistToBADforBlockBelow
call GetPredForCenterLeftOrRight
pand mm2,mm6
psrlq mm1,1
movq [edi+32],mm0
psrlq mm2,1
movq [edi+40],mm1
pand mm3,mm6
movq [edi+48],mm2
psrlq mm3,1
lea ecx,PrevMBAD[eax]
and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.
mov esi,PrevMBAD.BlkY1.MVs
movq [edi+56],mm3
pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
mov bl,[ecx].BlockType
mov ecx,PrevMBAD.BlkY1.BlkOffset
cmp bl,INTRA
mov edi,AddrOfLeftPred
mov ebx,PrevMBAD[eax].BlkY1.MVs
je BottomHalfAllSame ; Jump if INTRA. (Use central)
; Compute bottom remote prediction block.
cmp esi,ebx
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
mov eax,PrevMBAD[eax].BlkY1.BlkOffset
je BottomHalfAllSame
sub esi,eax
lea eax,[ecx+ebp*4]
call GetPredForAboveOrBelow
; Compute difference for lines 4 thru 7. Only the remote motion vector below
; was different than the central motion vector.
; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0
movq mm5,CentralPred+48 ; 6b: <C67 C66 C65 C64 C63 C62 C61 C60>
pand mm2,mm6
movq mm4,CentralPred+32 ; 4B: <C47 C46 C45 C44 C43 C42 C41 C40>
psrlq mm2,1 ; 6a: <B67 B66 B65 B64 B63 B62 B61 B60>
paddb mm2,mm5 ; 6c: <C67+B67 ... C65+B65 ...>
paddb mm0,mm4 ; 4C: <C47+B47>
pand mm0,mm6 ; 4D: <C47+B47> pre-cleaned
psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>
pand mm2,mm6 ; 6d: <C67+B67 ... C65+B65 ...> pre-cleaned
psrlq mm0,1 ; 4E: <(C47+B47)/2 ...>
paddb mm0,mm4 ; 4F: <(3C47+B47)/2 ...>
psrlq mm2,1 ; 6e: <(C67+B67)/2 ... (C65+B65)/2 ...>
pmullw mm2,C0001000200020001 ; 6f: <(C67+B67)/2 ... (2C65+2B65)/2 ...>
pand mm0,mm6 ; 4G: <(3C47+B47)/2 ...> pre-cleaned
pand mm3,mm6
psrlq mm0,1 ; 4H: <(3C47+B47)/4 ...>
paddb mm0,mm4 ; 4I: <(7C47+B47)/4 ...>
psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>
movq mm4,C0000FFFFFFFF0000 ; 6g: < 00 00 FF FF FF FF 00 00>
psubb mm0,mm7 ; 4J: <(7C47+B47+4)/4 ...>
pandn mm4,mm5 ; 6h: <C67 C66 00 00 00 00 C61 C60>
psubb mm5,mm7 ; 6i: <C67+1 ... C65+1 ...>
paddb mm2,mm4 ; 6j: <(3C67+B67)/2 ... (2C65+2B65)/2 ...>
pand mm0,mm6 ; 4K: <(7C47+B47+4)/4 ...> pre-cleaned
movq mm4,CentralPred+40 ; 5B
pand mm2,mm6 ; 6k: pre-cleaned
paddb mm1,mm4 ; 5C
psrlq mm0,1 ; 4L: <(7C47+B47+4)/8 ...>
pand mm1,mm6 ; 5D
psrlq mm2,1 ; 6l: <(3C67+B67)/4 ... (2C65+2B65)/4 ...>
paddb mm2,mm5 ; 6m: <(7C67+B67+4)/4 ... (6C65+2B65+4)/4...>
psrlq mm1,1 ; 5E
movq mm5,CentralPred+56 ; 7B: <C77 C76 C75 C74 C73 C72 C71 C70>
paddb mm1,mm4 ; 5F
paddb mm3,mm5 ; 7C: <C77+B47>
pand mm1,mm6 ; 5G
pand mm3,mm6 ; 7D: <C77+B47> pre-cleaned
psrlq mm1,1 ; 5H
paddb mm1,mm4 ; 5I
psrlq mm3,1 ; 7E: <(C77+B47)/2 ...>
psubb mm1,mm7 ; 5J
paddb mm3,mm5 ; 7F: <(3C77+B47)/2 ...>
pand mm1,mm6 ; 5K
psubb mm3,mm7 ; 7G: <(3C77+B47+2)/2 ...>
pand mm2,mm6 ; 6n: pre-cleaned
psrlq mm1,1 ; 5L
pand mm3,mm6 ; 7H: <(3C77+B47+2)/2 ...> pre-cleaned
psrlq mm2,1 ; 6o: <(7C67+B67+4)/8 ... (6C65+2B65+4)/8...>
psrlq mm3,1 ; 7I: <(3C77+B47+2)/4 ...>
; BottomHalfAllSame: finish lines 4 thru 7 of the block, then decide how the
; top half (lines 0 thru 3) is to be predicted.
;
; On entry: mm0, mm1, mm2, mm3 hold the finished predictions (P4..P7) for
; lines 4, 5, 6, 7; ecx is the offset of the target block from
; TargetFrameBaseAddress; ebp is PITCH.
;
; The pel differences D = T - P for lines 4 thru 7 are written to
; PelDiffsLine4..PelDiffsLine7. Interleaved with that, the descriptor of the
; block above is examined:
;   - block above is INTRA, or its MVs equal PrevMBAD.BlkY1.MVs: continue at
;     SidesEqCtrAndAboveEqCtr (central prediction alone is used);
;   - otherwise: go to SidesEqCtrButAboveNeCtr with ebx = MVs of the block
;     above (upper 16 bits cleared), esi = its PastRef, eax = its BlkOffset,
;     and ecx = PrevMBAD.BlkY1.BlkOffset.
BottomHalfAllSame:
mov ebx,TargetFrameBaseAddress
mov eax,DistToBADforBlockAbove ; Distance to descriptor of block above.
mov esi,PrevMBAD.BlkY1.MVs ; MVs of this block, for the compare below.
movq mm5,[ecx+ebx+PITCH*5] ; 5M: T5
add ecx,ebx ; Address of target block.
lea ebx,PrevMBAD[eax]
and ebx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.
                                        ; NOTE(review): rounding down like this
                                        ; presumes descriptors are aligned to a
                                        ; power-of-two size -- confirm.
psubb mm5,mm1 ; 5N: D5 = T5 - P5
movq mm4,[ecx+ebp*4] ; 4M: T4
movq mm1,[ecx+PITCH*7] ; 7J: T7
psubb mm4,mm0 ; 4N: D4 = T4 - P4
movq mm0,[ecx+PITCH*6] ; 6p: T6
psubb mm1,mm3 ; 7K: D7 = T7 - P7
movq PelDiffsLine4,mm4 ; 4O: Store D4.
psubb mm0,mm2 ; 6q: D6 = T6 - P6
movq PelDiffsLine5,mm5 ; 5O: Store D5.
movq PelDiffsLine6,mm0 ; 6r: Store D6.
movq PelDiffsLine7,mm1 ; 7L: Store D7.
mov cl,[ebx].BlockType ; Block type from the MBD of the block above.
cmp cl,INTRA ; Flags from this cmp survive the two MOVs below.
mov ecx,PrevMBAD.BlkY1.BlkOffset ; Offset of this block (cl no longer needed).
mov ebx,PrevMBAD[eax].BlkY1.MVs ; MVs of the block above.
je SidesEqCtrAndAboveEqCtr ; Jump if INTRA. (Use central)
; Compute top remote prediction block.
and ebx,00000FFFFH ; Blk above may have B MVs set. Clear them.
cmp esi,ebx ; Same MVs as this block?  (Flags survive the MOVs.)
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
mov eax,PrevMBAD[eax].BlkY1.BlkOffset ; Offset of the block above.
jne SidesEqCtrButAboveNeCtr ; Different MVs: need the remote prediction.
; SidesEqCtrAndAboveEqCtr: top half needs no remote contribution, so the
; prediction for lines 0 thru 3 is just the central prediction.
;
; The registers are loaded in the format TopHalfAllSame expects: mm0, mm1 and
; mm2 hold twice the prediction for lines 0, 1 and 2 (TopHalfAllSame still
; applies the final mask and/or shift-right-by-1 to them), while mm3 holds the
; line 3 prediction as-is.
; NOTE(review): doubling with PADDB and halving again only round-trips if the
; pels are below 128 -- presumably pels are kept to 7 bits; confirm.
SidesEqCtrAndAboveEqCtr:
movq mm0,CentralPred ; <C07 C06 C05 C04 C03 C02 C01 C00>
movq mm1,CentralPred+8 ; <C17 C16 C15 C14 C13 C12 C11 C10>
paddb mm0,mm0 ; 2*C0.  Halved again at 0H/0I.
movq mm2,CentralPred+16 ; <C27 C26 C25 C24 C23 C22 C21 C20>
paddb mm1,mm1 ; 2*C1.  Halved again at 1n/1o.
movq mm3,CentralPred+24 ; <C37 ...>  Used directly as P3.
paddb mm2,mm2 ; 2*C2.  Halved again at 2L.
jmp TopHalfAllSame
; SidesEqCtrButAboveNeCtr: top half where only the block above has a motion
; vector different from this block's.
;
; On entry: esi = PastRef of the block above, eax = BlkOffset of the block
; above, ecx = BlkOffset of this block, ebx = MVs of the block above,
; mm6 = 8 bytes of 0xFE, mm7 = 8 bytes of -1.
;
; The remote prediction for lines 0 thru 3 is fetched by
; GetPredForAboveOrBelow and blended with CentralPred.  In the comments below
; Cxy is the central prediction and Axy the above-remote prediction for line
; x, pel y.  "pre-cleaned" means the low bit of every byte has been masked off
; (mm6) so that the following PSRLQ by 1 acts as a per-byte divide by 2.
; PSUBB of mm7 adds 1 to every byte.  Results on exit (to TopHalfAllSame):
;   line 3: (7C+A+4)/8                          finished in mm3;
;   line 2: (7C+A+4)/4, pre-cleaned             in mm2 (still needs shift);
;   line 1: (7C+A+4)/4 outer pels 7,6,1,0 and
;           (6C+2A+4)/4 middle pels 5..2        in mm1 (needs mask, shift);
;   line 0: (3C+A+2)/2                          in mm0 (needs mask, shift).
SidesEqCtrButAboveNeCtr:
sub esi,eax ; Ref addr of blk above, less that block's own offset.
mov eax,ecx ; Offset of this block; the callee adds it to esi.
call GetPredForAboveOrBelow
; Compute difference for lines 0 thru 3. Only the remote motion vector above
; was different than the central motion vector.
; As used below, the callee leaves line 0 in mm0 ready for use, and lines 1
; thru 3 in mm1..mm3 still needing a shift right by 1 (mm2, mm3 are masked
; here first).
movq mm5,CentralPred+8 ; 1b: <C17 C16 C15 C14 C13 C12 C11 C10>
pand mm3,mm6
movq mm4,CentralPred+24 ; 3B: <C37 C36 C35 C34 C33 C32 C31 C30>
psrlq mm3,1 ; 3A: <A37 A36 A35 A34 A33 A32 A31 A30>
paddb mm3,mm4 ; 3C: <C37+A37 ...>
psrlq mm1,1 ; 1A: <A17 A16 A15 A14 A13 A12 A11 A10>
paddb mm1,mm5 ; 1c: <C17+A17 ... C15+A15 ...>
pand mm3,mm6 ; 3D: <C37+A37 ...> pre-cleaned
pand mm1,mm6 ; 1d: <C17+A17 ... C15+A15 ...> pre-cleaned
psrlq mm3,1 ; 3E: <(C37+A37)/2 ...>
paddb mm3,mm4 ; 3F: <(3C37+A37)/2 ...>
psrlq mm1,1 ; 1e: <(C17+A17)/2 ... (C15+A15)/2 ...>
pmullw mm1,C0001000200020001 ; 1f: <(C17+A17)/2 ... (2C15+2A15)/2 ...>
pand mm3,mm6 ; 3G: <(3C37+A37)/2 ...> pre-cleaned
pand mm2,mm6
psrlq mm3,1 ; 3H: <(3C37+A37)/4 ...>
paddb mm3,mm4 ; 3I: <(7C37+A37)/4 ...>
psrlq mm2,1 ; 2a: <A27 A26 A25 A24 A23 A22 A21 A20>
movq mm4,C0000FFFFFFFF0000 ; 1g: < 00 00 FF FF FF FF 00 00>
psubb mm3,mm7 ; 3J: <(7C37+A37+4)/4 ...>
pandn mm4,mm5 ; 1h: <C17 C16 00 00 00 00 C11 C10>
psubb mm5,mm7 ; 1i: <C17+1 ... C15+1 ...>
paddb mm1,mm4 ; 1j: <(3C17+A17)/2 ... (2C15+2A15)/2 ...>
pand mm3,mm6 ; 3K: <(7C37+A37+4)/4 ...> pre-cleaned
movq mm4,CentralPred+16 ; 2B: <C27 C26 C25 C24 C23 C22 C21 C20>
pand mm1,mm6 ; 1k: pre-cleaned
paddb mm2,mm4 ; 2C: <C27+A27 ...>
psrlq mm3,1 ; 3L: <(7C37+A37+4)/8 ...>  Line 3 finished.
pand mm2,mm6 ; 2D: <C27+A27 ...> pre-cleaned
psrlq mm1,1 ; 1l: <(3C17+A17)/4 ... (2C15+2A15)/4 ...>
paddb mm1,mm5 ; 1m: <(7C17+A17+4)/4 ... (6C15+2A15+4)/4 ...>
psrlq mm2,1 ; 2E: <(C27+A27)/2 ...>
movq mm5,CentralPred ; 0B: <C07 C06 C05 C04 C03 C02 C01 C00>
paddb mm2,mm4 ; 2F: <(3C27+A27)/2 ...>
paddb mm0,mm5 ; 0C: <C07+A07 ...>
pand mm2,mm6 ; 2G: <(3C27+A27)/2 ...> pre-cleaned
pand mm0,mm6 ; 0D: <C07+A07 ...> pre-cleaned
psrlq mm2,1 ; 2H: <(3C27+A27)/4 ...>
paddb mm2,mm4 ; 2I: <(7C27+A27)/4 ...>
psrlq mm0,1 ; 0E: <(C07+A07)/2 ...>
psubb mm2,mm7 ; 2J: <(7C27+A27+4)/4 ...>
paddb mm0,mm5 ; 0F: <(3C07+A07)/2 ...>
pand mm2,mm6 ; 2K: <(7C27+A27+4)/4 ...> pre-cleaned
psubb mm0,mm7 ; 0G: <(3C07+A07+2)/2 ...>
; TopHalfAllSame: finish lines 0 thru 3 of the block and hand the complete
; 8-line difference block to the forward DCT.
;
; On entry: ecx is the offset of the target block from
; TargetFrameBaseAddress; ebp is PITCH; mm6 = 8 bytes of 0xFE;
;   mm3 = finished prediction for line 3 (P3);
;   mm2 = twice the prediction for line 2 (needs only the shift);
;   mm1, mm0 = twice the prediction for lines 1, 0 (need mask and shift).
; The differences D = T - P are stored to PelDiffsLine0..PelDiffsLine3.
; mm7 is reused for target line 0, so it no longer holds -1 on exit; edi is
; left holding the address of the target block.
TopHalfAllSame:
mov ebx,TargetFrameBaseAddress
lea edi,[ecx+ebx] ; Address of target block.
pand mm1,mm6 ; 1n: pre-clean line 1 for the shift at 1o.
movq mm7,[ecx+ebx] ; 0J: T0
pand mm0,mm6 ; 0H: pre-clean line 0 for the shift at 0I.
movq mm5,[edi+PITCH*3] ; 3M: T3
psrlq mm2,1 ; 2L: P2
movq mm4,[edi+ebp*2] ; 2M: T2
psubb mm5,mm3 ; 3N: D3 = T3 - P3
psubb mm4,mm2 ; 2N: D2 = T2 - P2
psrlq mm1,1 ; 1o: P1
movq mm3,[edi+ebp*1] ; 1p: T1
psubb mm3,mm1 ; 1q: D1 = T1 - P1
movq PelDiffsLine3,mm5 ; 3O: Store D3.
psrlq mm0,1 ; 0I: P0
movq PelDiffsLine2,mm4 ; 2O: Store D2.
psubb mm7,mm0 ; 0K: D0 = T0 - P0
movq PelDiffsLine1,mm3 ; 1r: Store D1.
movq PelDiffsLine0,mm7 ; 0L: Store D0.
jmp MMxDoForwardDCTy ; Block is in PelDiffs block; Pitch is 16
;=============================================================================
; This internal function computes the OBMC contribution for the reference
; block that uses the left, central, or right remote motion vector.
;
; ebp -- PITCH
; edi -- Address of where to put the contribution.  Lines 0 thru 3 are
;        written here at 8-byte intervals ([edi+0] .. [edi+24]).
; esi -- Address of reference block.
; edx -- Reserved. MBlockActionStream
; ecx -- Unavailable.
; ebx -- Scratch. Initially the horizontal and vertical motion vectors.
;        Bit 0 selects horizontal interpolation and bit 8 selects vertical
;        interpolation (see the SHR / AND tests below).
; eax -- Unavailable.
; mm7 -- 8 bytes of -1
; mm6 -- 8 bytes of 0xFE
; mm0-mm5 -- Scratch
;
; Lines 0 thru 3 are stored by this function.  Lines 4 thru 7 are left in
; registers for the caller:  the interpolating paths finish by jumping to the
; matching Get4MoreLinesOfPred_* routine (not shown here), whose RET returns
; to our caller; the no-interpolation path returns with line 4 in mm0 and
; twice lines 5, 6, 7 in mm1, mm2, mm3.
; NOTE(review): presumably the Get4MoreLinesOfPred_* routines return lines
; 4 thru 7 in a compatible register format -- confirm against them.
;
; StackOffset is set to a text that cannot form a valid address while the
; internal functions are assembled.
; NOTE(review): presumably so that any accidental reference to a stack-based
; local variable in here fails to assemble -- confirm.
StackOffset TEXTEQU <12_ButAccessToLocalVariablesShouldNotBeNeeded>
GetPredForCenterLeftOrRight:
shr ebx,1 ; CF = bit 0 of MVs:  horizontal interpolation wanted.
jc HorzInterpInCLRPred
movq mm1,[esi+ebp*1] ; Reference line 1 (wanted on both paths below).
and bl,080H ; Was bit 8 of MVs:  vertical interpolation wanted.
je NoInterpInCLRPred
VertInterpInCLRPred:
movq mm0,[esi] ; Reference line 0.
psubb mm1,mm7 ; Line 1, plus 1 in every byte.
call Get4LinesOfPred_InterpVert
; As consumed below, the helper returns line 0 finished in mm0, line 1 in mm1
; needing only the shift, and lines 2, 3 in mm2, mm3 needing mask and shift.
pand mm2,mm6
psrlq mm1,1
movq [edi+0],mm0 ; Store line 0 of the contribution.
pand mm3,mm6
movq [edi+8],mm1 ; Store line 1.
psrlq mm2,1
movq mm1,[esi+ebp*1] ; Set up for the next 4 lines.  NOTE(review):
                     ; presumably the helper advanced esi by 4 lines -- confirm.
psrlq mm3,1
movq [edi+16],mm2 ; Store line 2.
movq mm0,mm4 ; mm4 is handed on from the helper as the next mm0.
movq [edi+24],mm3 ; Store line 3.
psubb mm1,mm7 ; Plus 1 in every byte, as was done before the first call.
jmp Get4MoreLinesOfPred_InterpVert ; Tail jump; its RET returns to our caller.
HorzInterpInCLRPred:
movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
and bl,080H ; Was bit 8 of MVs:  vertical interpolation wanted too?
jne BothInterpInCLRPred
call Get4LinesOfPred_InterpHorz
pand mm2,mm6
psrlq mm1,1
movq [edi+0],mm0 ; Store line 0 of the contribution.
pand mm3,mm6
movq [edi+8],mm1 ; Store line 1.
psrlq mm2,1
movq mm5,[esi+1] ; <R48 R47 R46 R45 R44 R43 R42 R41>
psrlq mm3,1
movq [edi+16],mm2 ; Store line 2.
;
movq [edi+24],mm3 ; Store line 3.
;
jmp Get4MoreLinesOfPred_InterpHorz ; Tail jump; its RET returns to our caller.
BothInterpInCLRPred:
call Get4LinesOfPred_InterpBoth
pand mm2,mm6
psrlq mm1,1
movq [edi+0],mm0 ; Store line 0 of the contribution.
pand mm3,mm6
movq [edi+8],mm1 ; Store line 1.
psrlq mm2,1
movq mm1,[esi+ebp*1] ; Set up for the next 4 lines.
psrlq mm3,1
movq [edi+16],mm2 ; Store line 2.
movq mm0,mm4 ; mm4 is handed on from the helper as the next mm0.
movq [edi+24],mm3 ; Store line 3.
psubb mm1,mm7 ; Plus 1 in every byte.
paddb mm5,mm5 ; Double mm5 as left by the helper.  NOTE(review): its
              ; meaning is defined by Get4LinesOfPred_InterpBoth -- confirm.
jmp Get4MoreLinesOfPred_InterpBoth ; Tail jump; its RET returns to our caller.
NoInterpInCLRPred:
; Full-pel motion vector:  the reference lines are copied unmodified.
; mm1 already holds reference line 1.
movq mm0,[esi] ; Line 0.
movq mm2,[esi+ebp*2] ; Line 2.
movq mm3,[esi+PITCH*3] ; Line 3.
movq [edi+0],mm0 ; Store lines 0 thru 3 of the contribution.
movq [edi+8],mm1
movq [edi+16],mm2
movq [edi+24],mm3
movq mm3,[esi+PITCH*7] ; Line 7.
movq mm2,[esi+PITCH*6] ; Line 6.
paddb mm3,mm3 ; Twice line 7.
movq mm1,[esi+PITCH*5] ; Line 5.
paddb mm2,mm2 ; Twice line 6.
movq mm0,[esi+ebp*4] ; Line 4, not doubled.
paddb mm1,mm1 ; Twice line 5.
ret
;=============================================================================
; This internal function computes the OBMC contribution for the reference
; block that uses the remote motion vector from block above or below.
; Only four lines are produced, and they are returned in registers.
;
; ebp -- PITCH
; edi -- Not used.
; esi -- Address of reference block, less the offset passed in eax.  (The
;        first thing done here is to add eax back in.)
; edx -- Reserved. MBlockActionStream
; ecx -- Unavailable. Must not be changed.
; ebx -- Scratch. Initially the horizontal and vertical motion vectors.
;        Bit 0 selects horizontal interpolation and bit 8 selects vertical
;        interpolation.
; eax -- Offset within frame of the first of the four lines wanted.  (The
;        callers in this file pass the block offset for lines 0 thru 3, or
;        the block offset plus 4*PITCH for lines 4 thru 7.)
; mm7 -- 8 bytes of -1
; mm6 -- 8 bytes of 0xFE
; mm0-mm5 -- Scratch
;
; On return from the no-interpolation path: mm0 holds the first line, and
; mm1, mm2, mm3 hold twice the second, third and fourth lines.  The
; interpolating paths jump to the Get4LinesOfPred_* routines (not shown
; here), whose RET returns to our caller.
; NOTE(review): presumably those routines return the lines in a compatible
; register format -- confirm against them.
GetPredForAboveOrBelow:
shr ebx,1 ; CF = bit 0 of MVs:  horizontal interpolation wanted.
lea esi,[esi+eax] ; Address of first reference line.  (LEA keeps CF.)
jc HorzInterpInABPred
movq mm1,[esi+ebp*1] ; Second reference line.
movq mm0,[esi] ; First reference line.
psubb mm1,mm7 ; Plus 1 in every byte, for the vertical interpolator.
and bl,080H ; Was bit 8 of MVs:  vertical interpolation wanted.
jne Get4LinesOfPred_InterpVert ; Tail jump; its RET returns to our caller.
; Full-pel motion vector:  no interpolation.
movq mm2,[esi+ebp*2] ; Third line.
paddb mm1,mm7 ; Undo the plus 1 applied above.
movq mm3,[esi+PITCH*3] ; Fourth line.
paddb mm1,mm1 ; Twice the second line.
paddb mm2,mm2 ; Twice the third line.
paddb mm3,mm3 ; Twice the fourth line.
ret
HorzInterpInABPred:
movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
and bl,080H ; Was bit 8 of MVs:  vertical interpolation wanted too?
jne Get4LinesOfPred_InterpBoth ; Tail jump; its RET returns to our caller.
jmp Get4LinesOfPred_InterpHorz ; Tail jump; its RET returns to our caller.
; End of the internal functions.  StackOffset goes back to 0.
StackOffset TEXTEQU <0>
;=============================================================================
ENDIF
; Done / TrulyDone -- common exit path of MMxEDTQ.
;
; Done (H263 builds only do work here; in H261 builds it is the same place as
; TrulyDone):  if an OBMC difference is still pending, it is flushed with
; DoPendingOBMCDiff using block type INTER1MV, edx is stepped back by one
; T_MacroBlockActionDescr, and, unless IsPlainPFrame is set, the B-frame luma
; blocks are processed by MMxDoBFrameLumaBlocks.
;
; TrulyDone:  leave MMX state, hand SWDTotal (and, for H263, BSWDTotal) back
; through the caller-supplied pointers PSWDTotal / PBSWDTotal, restore the
; saved registers, and return.
Done:
IFNDEF H261 ; H263 only.
mov bl,PendingOBMC
mov cl,INTER1MV
test bl,bl
jz TrulyDone ; No OBMC difference is pending.
mov StashBlockType,cl
call DoPendingOBMCDiff
mov al,IsPlainPFrame
add edx,-SIZEOF T_MacroBlockActionDescr ; Back up one macroblock descriptor.
test al,al
jnz TrulyDone ; IsPlainPFrame set:  skip the B-frame luma work.
movq mm6,C0101010101010101
pxor mm7,mm7 ; Initialize SWD accumulator
call MMxDoBFrameLumaBlocks
ENDIF
TrulyDone:
emms ; Done with MMX; release the floating point registers.
; The totals are fetched before esp is switched back to StashESP; the
; pointers to store through are then fetched relative to the restored esp.
mov eax,SWDTotal
IFNDEF H261
mov ebx,BSWDTotal
ENDIF
mov esp,StashESP
mov edi,[esp+PSWDTotal]
IFNDEF H261
mov esi,[esp+PBSWDTotal]
ENDIF
mov [edi],eax ; *PSWDTotal = SWDTotal
IFNDEF H261
mov [esi],ebx ; *PBSWDTotal = BSWDTotal
ENDIF
pop ebx
pop ebp
pop edi
pop esi
rturn
MMxEDTQ endp END