|
|
;//////////////////////////////////////////////////////////////////////////// ;// ;// INTEL CORPORATION PROPRIETARY INFORMATION ;// ;// This software is supplied under the terms of a license ;// agreement or nondisclosure agreement with Intel Corporation ;// and may not be copied or disclosed except in accordance ;// with the terms of that agreement. ;// ;//////////////////////////////////////////////////////////////////////////// ;// ;// $Header: S:\h26x\src\enc\exmme.asv 1.37 13 Dec 1996 17:19:38 MBODART $ ;// ;// $Log: S:\h26x\src\enc\exmme.asv $ ;// ;// Rev 1.37 13 Dec 1996 17:19:38 MBODART ;// Tuned the ME parameters for H.261. ;// ;// Rev 1.36 06 Nov 1996 16:18:24 BNICKERS ;// Improve performance. ;// ;// Rev 1.35 30 Oct 1996 17:30:36 BNICKERS ;// Fix UMV table for right edge macroblocks. ;// ;// Rev 1.34 30 Oct 1996 14:49:20 KLILLEVO ;// zero motion vectors for intra blocks in PB-frame mode. ;// This is necesseary in the Extended Motion Vector mode ;// ;// Rev 1.33 18 Oct 1996 16:57:16 BNICKERS ;// Fixes for EMV ;// ;// Rev 1.32 15 Oct 1996 17:53:04 BNICKERS ;// ;// Fix major bug w.r.t. EMV ME. ;// ;// Rev 1.31 14 Oct 1996 13:10:14 BNICKERS ;// ;// Correct several problems wrt H261 ME. ;// ;// Rev 1.30 11 Oct 1996 16:53:12 KLILLEVO ;// ;// Fix threshold ;// ;// Rev 1.29 11 Oct 1996 16:52:18 KLILLEVO ;// Another EMV fix. ;// ;// Rev 1.28 11 Oct 1996 15:43:16 KLILLEVO ;// Really fix the handling of the top row of MBs for EMV ME. ;// ;// Rev 1.27 11 Oct 1996 15:24:38 BNICKERS ;// Special handling of top row of MBs for EMV ME. ;// ;// Rev 1.26 11 Oct 1996 14:47:42 KLILLEVO ;// Kill full pel MV for Intra blocks so that EMV of adjacent blocks will work. ;// ;// Rev 1.25 10 Oct 1996 16:42:56 BNICKERS ;// Initial debugging of Extended Motion Vectors. ;// ;// Rev 1.24 04 Oct 1996 08:48:02 BNICKERS ;// Add EMV. ;// ;// Rev 1.23 24 Sep 1996 10:42:24 BNICKERS ;// For H261, zero out motion vectors when classifying MB as intra. 
;// ;// Rev 1.22 12 Sep 1996 10:56:24 BNICKERS ;// Add arguments for thresholds and differentials. ;// ;// Rev 1.21 22 Jul 1996 15:23:24 BNICKERS ;// Reduce code size. Implement H261 spatial filter. ;// ;// Rev 1.20 18 Jul 1996 16:54:26 KLILLEVO ;// changed emptythreshold to 40 instead of 128 to remove some blockiness ;// from the still frame mode on MMX ;// ;// Rev 1.19 26 Jun 1996 12:49:02 KLILLEVO ;// Fix minor booboo left in by Brian. ;// ;// Rev 1.18 26 Jun 1996 12:21:50 BNICKERS ;// Make heuristic ME work without unrestricted motion vectors. ;// ;// Rev 1.17 25 Jun 1996 14:24:58 BNICKERS ;// Implement heuristic motion estimation for MMX, AP mode. ;// ;// Rev 1.16 15 May 1996 16:57:14 BNICKERS ;// Fix SWD tabulation (again)! @#$%!% ;// ;// Rev 1.15 15 May 1996 16:53:24 BNICKERS ;// ;// Fix SWD tabulation. ;// ;// Rev 1.14 15 May 1996 11:33:28 BNICKERS ;// Bug fix for calc of total SWD. ;// ;// Rev 1.13 14 May 1996 12:18:58 BNICKERS ;// Initial debugging of MMx B-Frame ME. ;// ;// Rev 1.12 03 May 1996 14:03:50 BNICKERS ;// ;// Minor bug fixes and integration refinements. ;// ;// Rev 1.11 02 May 1996 12:00:32 BNICKERS ;// Initial integration of B Frame ME, MMX version. ;// ;// Rev 1.10 16 Apr 1996 16:40:14 BNICKERS ;// Fix some important but simple bugs. Start adding table inits for B frm ME. ;// ;// Rev 1.9 10 Apr 1996 13:13:44 BNICKERS ;// Recoding of Motion Estimation, Advanced Prediction. ;// ;// Rev 1.8 05 Apr 1996 12:28:10 BNICKERS ;// Improvements to baseline half pel ME. ;// ;// Rev 1.7 26 Mar 1996 12:00:22 BNICKERS ;// Did some tuning for MMx encode. ;// ;// Rev 1.6 20 Mar 1996 17:01:44 KLILLEVO ;// fixed bug in new quant code ;// ;// Rev 1.5 20 Mar 1996 15:26:40 KLILLEVO ;// changed quantization to match IA quantization ;// ;// Rev 1.3 15 Mar 1996 15:51:16 BECHOLS ;// Completed monolithic - Brian ;// ;// Rev 1.0 16 Feb 1996 17:12:12 BNICKERS ;// Initial revision. 
;// ;//////////////////////////////////////////////////////////////////////////// ; ; MMxMotionEstimation -- This function performs motion estimation for the ; macroblocks identified in the input list. This is ; the MMx version. Conditional assembly selects either ; the H263 or H261 version. ; ; Arguments: See ex5me.asm. ; ; Other assumptions: See ex5me.asm. Most of the read-only tables needed in ; ex5me.asm are not needed here. ;
; Assembler options for this module (MASM OPTION directive).
OPTION PROLOGUE:None                            ; No assembler-generated procedure entry code.
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro   ; RET is expanded through this macro; it is not defined in this section of the file.
OPTION M510                                     ; MASM 5.10 compatibility mode.
OPTION CASEMAP:NONE                             ; Identifiers are case sensitive.
; Motion estimation threshold and differential constants.  Conditional
; assembly on the H261 symbol selects the H.261 values; otherwise the H.263
; values are used.

; Values that are the same for both builds.
BLOCKMOTIONTHRESHOLD    = 1152
BLOCKMVDIFFERENTIAL     = 768
EMPTYTHRESHOLD          = 40

; Values that differ between the two builds.
IFDEF H261
ZEROVECTORTHRESHOLD     = 600
NONZEROMVDIFFERENTIAL   = 256
INTERCODINGTHRESHOLD    = 300
INTRACODINGDIFFERENTIAL = 200
ELSE
ZEROVECTORTHRESHOLD     = 450
NONZEROMVDIFFERENTIAL   = 375
INTERCODINGTHRESHOLD    = 1152
INTRACODINGDIFFERENTIAL = 1000
ENDIF
include iammx.inc include e3inst.inc include e3mbad.inc
.xlist include memmodel.inc .list
include exEDTQ.inc
MMXMEDATA SEGMENT PAGE ALIGN 16
; Storage for Target and Reference frames can interleave into 8K of the 16K
; cache.  Pitch must be 384.
;
;   C# -- Stands for row number "#" of target macroblock in *C*urrent P frame.
;   B# -- Stands for row number "#" of target macroblock in current *B* frame.
;   R# -- Stands for row number "#" of 0MV *R*ef macroblock in past frame.
;   v  -- Stands for a row below 0MV, reference macroblock.
;         These same cache lines would hit reference lines >8 above the 0MV.
;   ^  -- Stands for a row above 0MV, reference macroblock.
;         These same cache lines would hit reference lines >8 below the 0MV.
;   +-+-+
;   |   | -- A cache line (32 bytes).  Position of letters, v, and ^ indicate
;   +-+-+    which 16 bytes may be used in the cache line.
;
;   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;   |C0 |   |  v|   |Cb |   |  ^|   |B6 |   | R6|   |
;   |C1 |   |  v|   |Cc |   |  ^|   |B7 |   | R7|   |
;   |C2 |   |  v|   |Cd |   |  ^|   |B8 |   | R8|   |
;   |C3 |   |  v|   |Ce |   |  ^|   |B9 |   | R9|   |
;   |C4 |   |  v|   |Cf |   |  ^|   |Ba |   | Ra|   |
;   |C5 |   |  v|   |B0 |   | R0|   |Bb |   | Rb|   |
;   |C6 |   |  v|   |B1 |   | R1|   |Bc |   | Rc|   |
;   |C7 |   |  v|   |B2 |   | R2|   |Bd |   | Rd|   |
;   |C8 |   |  ^|   |B3 |   | R3|   |Be |   | Re|   |
;   |C9 |   |  ^|   |B4 |   | R4|   |Bf |   | Rf|   |
;   |Ca |   |  ^|   |B5 |   | R5|   +-+-+-+-+-+-+-+-+
;   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
;
; The static storage space used for read-only tables, and the stack usage
; are coordinated such that they mesh in the data cache, and use only one
; 4K way of the 4-way, 16K cache.
;
; The first 32 bytes of the static storage space are unallocated, because
; the top of stack ranges in this area.  As local procedure calls are made
; within this function, return addresses get pushed into these 32 bytes.
; (32 bytes; 0: 31)
        DB 32 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
;
; The next 608 bytes of the static storage space are unallocated, because
; the local stack frame is made to hit cache at these addresses.  More of
; the local stack frame is allocated after a gap of 64 bytes.
; (608 bytes; 32: 639)
;
; LocalStorage labels offset 32 of this segment, the first byte of the
; 608-byte place-holder.
LocalStorage LABEL DWORD
        DB 608 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
; Motion Estimation State Engine adjustments to reference block address to get
; to next candidate reference block.
; (64 bytes; 640: 703)
;
; The table holds 16 DWORDs, one per value of the high nibble of an adjustment
; code.  Each adjustment code (the EQUs below) is built so that
;       table[code SHR 4] + code
; equals the byte displacement the symbol names.  Example: VM8HP8 = 028H and
; entry 2 is -8*PITCH-020H, so the sum is -8*PITCH+8.
; NOTE(review): the code that consumes the table is not in this section;
; confirm that it forms the sum this way.
;
; Symbol naming, as can be read from the values: V = vertical (rows, scaled by
; PITCH), H = horizontal (bytes), M = minus, P = plus, G = 16.  NOADJ names a
; displacement of zero.
FullPelMotionVectorAdjustment LABEL DWORD
        DD -16*PITCH-8                  ; Entry 0: codes 000H..00FH.
VMG     EQU 000H+0+8
VMGHM8  EQU 000H-8+8
        DD -8*PITCH-8-010H              ; Entry 1: codes 010H..01FH.
VM8HM8  EQU 010H
        DD -8*PITCH-020H                ; Entry 2: codes 020H..02FH.
VM8     EQU 020H
VM8HP8  EQU 020H+8
        DD -4*PITCH-8-030H              ; Entry 3: codes 030H..03FH.
VM4HM8  EQU 030H-8+8
VM4HM4  EQU 030H-4+8
VM4     EQU 030H+0+8
VM4HP4  EQU 030H+4+8
        DD -4*PITCH+8-040H              ; Entry 4: codes 040H..04FH.
VM4HP8  EQU 040H+8-8
VM4HPG  EQU 040H+16-8
        DD -2*PITCH-4-050H              ; Entry 5: codes 050H..05FH.
VM2HM4  EQU 050H-4+4
VM2HM2  EQU 050H-2+4
VM2HM1  EQU 050H-1+4
VM2     EQU 050H+0+4
VM2HP1  EQU 050H+1+4
VM2HP2  EQU 050H+2+4
VM2HP4  EQU 050H+4+4
VM2HP8  EQU 050H+8+4
        DD -1*PITCH-2-060H              ; Entry 6: codes 060H..06FH.
VM1HM2  EQU 060H-2+2
VM1HM1  EQU 060H-1+2
VM1     EQU 060H+0+2
VM1HP1  EQU 060H+1+2
VM1HP2  EQU 060H+2+2
VM1HP4  EQU 060H+4+2
        DD -16-070H                     ; Entry 7: codes 070H..07FH.
HMG     EQU 070H-16+16
HM8     EQU 070H-8+16
HM4     EQU 070H-4+16
HM3     EQU 070H-3+16
HM2     EQU 070H-2+16
HM1     EQU 070H-1+16
        DD -080H                        ; Entry 8: codes 080H..08FH.
NOADJ   EQU 080H
HP1     EQU 080H+1
HP2     EQU 080H+2
HP4     EQU 080H+4
HP8     EQU 080H+8
        DD 1*PITCH-2-090H               ; Entry 9: codes 090H..09FH.
VP1HM2  EQU 090H-2+2
VP1HM1  EQU 090H-1+2
VP1     EQU 090H+0+2
VP1HP1  EQU 090H+1+2
VP1HP2  EQU 090H+2+2
VP1HP4  EQU 090H+4+2
        DD 2*PITCH-4-0A0H               ; Entry 10: codes 0A0H..0AFH.
VP2HM4  EQU 0A0H-4+4
VP2HM2  EQU 0A0H-2+4
VP2HM1  EQU 0A0H-1+4
VP2     EQU 0A0H+0+4
VP2HP1  EQU 0A0H+1+4
VP2HP2  EQU 0A0H+2+4
VP2HP4  EQU 0A0H+4+4
VP2HP8  EQU 0A0H+8+4
        DD 4*PITCH-8-0B0H               ; Entry 11: codes 0B0H..0BFH.
VP4HM8  EQU 0B0H-8+8
VP4HM4  EQU 0B0H-4+8
VP4HM2  EQU 0B0H-2+8
VP4     EQU 0B0H+0+8
VP4HP2  EQU 0B0H+2+8
VP4HP4  EQU 0B0H+4+8
        DD 4*PITCH+8-0C0H               ; Entry 12: codes 0C0H..0CFH.
VP4HP8  EQU 0C0H+8-8
VP4HPG  EQU 0C0H+16-8
        DD 8*PITCH-8-0D0H               ; Entry 13: codes 0D0H..0DFH.
VP8HM8  EQU 0D0H-8+8
VP8HM4  EQU 0D0H-4+8
        DD 8*PITCH-0E0H                 ; Entry 14: codes 0E0H..0EFH.
VP8     EQU 0E0H+0
VP8HP4  EQU 0E0H+4
VP8HP8  EQU 0E0H+8
        DD 16*PITCH-0F0H                ; Entry 15: codes 0F0H..0FFH.
VPG     EQU 0F0H+0
VPGHP8  EQU 0F0H+8
; Additional space reserved for stack variables.  If more space is needed,
; it should go here.
; (160 bytes; 704: 863)
        DB 160 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
; QWORD Constants used by motion estimation, frame differencing, and FDCT.
; (144 bytes; 864:1007)
;
; Each constant is stored as two DWORDs, low DWORD first, so the 16 hex digits
; in a mask's name read as <second DD><first DD>.
C0101010101010101 DD 001010101H, 001010101H
CFFFF0000FFFF0000 DD 0FFFF0000H, 0FFFF0000H
C0200010101010101 DD 001010101H, 002000101H
C0001000200020001 DD 000020001H, 000010002H
CFFFF00000000FFFF DD 00000FFFFH, 0FFFF0000H
C0000FFFFFFFF0000 DD 0FFFF0000H, 00000FFFFH
CFF000000000000FF DD 0000000FFH, 0FF000000H
C0101010101010002 DD 001010002H, 001010101H
C0100010001000100 DD 001000100H, 001000100H
C0001000100010001 DD 000010001H, 000010001H
C7F7F7F7F7F7F7F7F DD 07F7F7F7FH, 07F7F7F7FH
; C1..C7 each hold one 16-bit value replicated four times.  The values are
; approximately cos(k*pi/16) * 32768 for k = 1..7 (e.g. C4 = 05A82H = 23170);
; presumably these are the FDCT coefficients -- confirm against the FDCT code.
C1 DD 07D8A7D8AH, 07D8A7D8AH
C2 DD 076417641H, 076417641H
C3 DD 06A6D6A6DH, 06A6D6A6DH
C4 DD 05A825A82H, 05A825A82H
C5 DD 0471D471DH, 0471D471DH
C6 DD 030FC30FCH, 030FC30FCH
C7 DD 018F818F8H, 018F818F8H
; Distances to Block Action Descriptors for blocks that provide remote vectors
; for OBMC.  Which element accessed depends on edge condition.  Top edge is
; stack based variable, since different instances may have different distances
; to BAD of block above.  Bottom edge is always a constant, regardless of
; edge condition.  This is used in OBMC frame differencing.
; (16 bytes; 1008:1023)
;
; Each table has two DWORDs.  Element 0 is a distance of zero; element 1 is
; one macroblock descriptor minus one block descriptor, negative for the left
; neighbour and positive for the right neighbour.
BlockToLeft  DD 0, -SIZEOF T_MacroBlockActionDescr+SIZEOF T_Blk
BlockToRight DD 0, SIZEOF T_MacroBlockActionDescr-SIZEOF T_Blk
; Table to map linearized motion vector to vertical part, used by motion
; estimation.  (Shift linearized motion vector right by 8 bits, and then
; use result as index into this array to get vertical MV.)
; (96 bytes; 1024:1119)
;
; The label UnlinearizedVertMV sits on index 0; 49 bytes precede it for
; negative indices (-49..-1) and 47 bytes start at it (indices 0..46).
; With a pitch of 384 (1.5 * 256), every two rows span three index values,
; which is why the values come in groups of three.

; Stop the build with a clear message if the pitch this table is built for
; ever changes.
.ERRNZ PITCH-384, <The magic of this table assumes a pitch of 384.>

        DB -64, -64, -62
        DB -60, -60, -58
        DB -56, -56, -54
        DB -52, -52, -50
        DB -48, -48, -46
        DB -44, -44, -42
        DB -40, -40, -38
        DB -36, -36, -34
        DB -32, -32, -30
        DB -28, -28, -26
        DB -24, -24, -22
        DB -20, -20, -18
        DB -16, -16, -14
        DB -12, -12, -10
        DB  -8,  -8,  -6
        DB  -4,  -4,  -2
        DB   0
UnlinearizedVertMV DB 0
        DB   2
        DB   4,   4,   6
        DB   8,   8,  10
        DB  12,  12,  14
        DB  16,  16,  18
        DB  20,  20,  22
        DB  24,  24,  26
        DB  28,  28,  30
        DB  32,  32,  34
        DB  36,  36,  38
        DB  40,  40,  42
        DB  44,  44,  46
        DB  48,  48,  50
        DB  52,  52,  54
        DB  56,  56,  58
        DB  60,  60,  62

; Table to provide index value in low byte, and rounding term of 1 in all bytes.
; Used in frame differencing, when half pel horizontal interpolation is needed.
; (1024 bytes; 1120:2143)
; 128 entries of 8 bytes each.  Entry n is the DWORD pair
; (001010101H + n, 001010101H), for n = 0..127.
Pel_Rnd LABEL DWORD
CNT = 0
WHILE CNT LT 128
        DD 001010101H+CNT, 001010101H
        CNT = CNT + 1
ENDM
; Motion Estimation State Engine Rules.
; (896 bytes;2144:3039)
;
; Layout of the 896 bytes:
;   - two 10-byte tables indexed by macroblock position class (0..9),
;   - 2 unused bytes,
;   - one 4-byte rule per state, for states 10 through 226,
;   - 6 unused bytes.
; StateEngine is biased so that StateEngine + 4*N addresses the rule for
; state N: the rules start 22 bytes after StateEngineFirstRule, and
; (StateEngineFirstRule - 18) + 4*10 is that same address.
StateEngineFirstRule LABEL BYTE ; Rules that govern state engine of estimator.
StateEngine EQU StateEngineFirstRule-20+2

; Starting States:

; Stop the build with a clear message if the pitch this table is built for
; ever changes.
.ERRNZ PITCH-384, <The magic of this table assumes a pitch of 384.>

; First table, indexed by position class.
; NOTE(review): the meaning of the values 3 and 0 is not evident from this
; section of the file; confirm against the code that reads them.
        DB ?    ; 0: not used.
        DB 3    ; 1: Upper left corner.
        DB 3    ; 2: Upper edge.
        DB 3    ; 3: Upper right corner.
        DB 3    ; 4: Left edge.
        DB 3    ; 5: Interior MB, not doing block search.
        DB 0    ; 6: Right edge.
        DB 0    ; 7: Lower left corner.
        DB 0    ; 8: Lower edge.
        DB 0    ; 9: Lower right corner.

; Second table, indexed by position class.  Each value is the number of the
; first rule of the matching group of states below (34 = upper left corner,
; 66 = upper edge, 16 = interior, and so on).
        DB ?    ; 0: not used.
        DB 34   ; 1: Upper left corner.
        DB 66   ; 2: Upper edge.
        DB 42   ; 3: Upper right corner.
        DB 98   ; 4: Left edge.
        DB 16   ; 5: Interior MB, not doing block search.
        DB 114  ; 6: Right edge.
        DB 50   ; 7: Lower left corner.
        DB 82   ; 8: Lower edge.
        DB 58   ; 9: Lower right corner.

        DB ?,?  ; Skip 2 bytes.

LASTINITIALMESTATE EQU 9
; Interior Telescoping States:

; Try +/- 8,4,2,1, vertically first, then horizontally.
;
; Each rule is four bytes, as can be read from the rules themselves:
;   byte 0: adjustment code to apply when the previous best is still best,
;   byte 1: adjustment code to apply when the candidate just tried is better,
;   byte 2: next state when the previous best is still best,
;   byte 3: next state when the candidate just tried is better.
; Adjustment codes are the V../H.. symbols defined with
; FullPelMotionVectorAdjustment.  A next state of 0FFH appears only on rules
; whose comment says "Done".
; The order of the words "better/worse" in the per-rule comments does not
; always follow the byte order above.

FIRSTBLOCKMESTATE EQU 10

        DB VM2, VM2, 12, 11 ; 10: V+1 better/worse than central. Try V-1.
        DB VP2HP1, HP1, 13, 13 ; 11: Accept V+1/V-1 as best. Try H+1.
        DB VP1HP1, HP1, 13, 13 ; 12: Accept central/V-1 as best. Try H+1.
        DB HM2, HM2, 15, 14 ; 13: H+1 better/worse than central. Try H-1.
        DB HP2, NOADJ, 0FFH, 0FFH ; 14: Accept H+1/H-1 as best. Done.
        DB HP1, NOADJ, 0FFH, 0FFH ; 15: Accept central/H-1 as best. Done.

        DB VMG, VMG, 18, 17 ; 16: V+8 better/worse than central. Try V-8.
        DB VPGHP8, HP8, 19, 19 ; 17: Accept V+8/V-8 as best. Try H+8.
        DB VP8HP8, HP8, 19, 19 ; 18: Accept central/V-8 as best. Try H+8.
        DB HMG, HMG, 21, 20 ; 19: H+8 better/worse than central. Try H-8.
        DB VP4HPG, VP4, 22, 22 ; 20: Accept H+8/H-8 as best. Try V+4.
        DB VP4HP8, VP4, 22, 22 ; 21: Accept central/H-8 as best. Try V+4.

        DB VM8, VM8, 24, 23 ; 22: V+4 better/worse than central. Try V-4.
        DB VP8HP4, HP4, 25, 25 ; 23: Accept V+4/V-4 as best. Try H+4.
        DB VP4HP4, HP4, 25, 25 ; 24: Accept central/V-4 as best. Try H+4.
        DB HM8, HM8, 27, 26 ; 25: H+4 better/worse than central. Try H-4.
        DB VP2HP8, VP2, 28, 28 ; 26: Accept H+4/H-4 as best. Try V+2.
        DB VP2HP4, VP2, 28, 28 ; 27: Accept central/H-4 as best. Try V+2.

        DB VM4, VM4, 30, 29 ; 28: V+2 better/worse than central. Try V-2.
        DB VP4HP2, HP2, 31, 31 ; 29: Accept V+2/V-2 as best. Try H+2.
        DB VP2HP2, HP2, 31, 31 ; 30: Accept central/V-2 as best. Try H+2.
        DB HM4, HM4, 33, 32 ; 31: H+2 better/worse than central. Try H-2.
        DB VP1HP4, VP1, 10, 10 ; 32: Accept H+2/H-2 as best. Try V+1.
        DB VP1HP2, VP1, 10, 10 ; 33: Accept central/H-2 as best. Try V+1.
; Boundary States:
;
; Same four-byte rule format as the interior states: adjustment code if the
; previous best is still best, adjustment code if the candidate just tried is
; better, then the next state for each of those two cases.  When a move takes
; the best position off a frame edge or corner, the second next-state value
; hands over to the matching edge or interior state.  Next-state values of
; 0F5H and above appear only on rules whose comment says "Done".
; NOTE(review): what the individual values 0F5H..0FEH tell the caller is not
; evident from this section of the file.

; Upper left corner:

        DB VM8HP8, HP8, 35, 101 ; 34: Accept corner/V+8. Try H+8.
        DB VP4HM8, VP4, 36, 70 ; 35: Accept corner/H+8. Try V+4.
        DB VM4HP4, HP4, 37, 105 ; 36: Accept corner/V+4. Try H+4.
        DB VP2HM4, VP2, 38, 74 ; 37: Accept corner/H+4. Try V+2.
        DB VM2HP2, HP2, 39, 109 ; 38: Accept corner/V+2. Try H+2.
        DB VP1HM2, VP1, 40, 78 ; 39: Accept corner/H+2. Try V+1.
        DB VM1HP1, HP1, 41, 113 ; 40: Accept corner/V+1. Try H+1.
        DB HM1, NOADJ, 0F5H, 0F7H ; 41: Accept corner/H+1. Done.

; Upper right corner:

        DB VM8HM8, HM8, 43, 117 ; 42: Accept corner/V+8. Try H-8.
        DB VP4HP8, VP4, 44, 70 ; 43: Accept corner/H-8. Try V+4.
        DB VM4HM4, HM4, 45, 121 ; 44: Accept corner/V+4. Try H-4.
        DB VP2HP4, VP2, 46, 74 ; 45: Accept corner/H-4. Try V+2.
        DB VM2HM2, HM2, 47, 125 ; 46: Accept corner/V+2. Try H-2.
        DB VP1HP2, VP1, 48, 78 ; 47: Accept corner/H-2. Try V+1.
        DB VM1HM1, HM1, 49, 129 ; 48: Accept corner/V+1. Try H-1.
        DB HP1, NOADJ, 0F6H, 0F7H ; 49: Accept corner/H-1. Done

; Lower left corner:

        DB VP8HP8, HP8, 51, 101 ; 50: Accept corner/V-8. Try H+8.
        DB VM4HM8, VM4, 52, 86 ; 51: Accept corner/H+8. Try V-4.
        DB VP4HP4, HP4, 53, 105 ; 52: Accept corner/V-4. Try H+4.
        DB VM2HM4, VM2, 54, 90 ; 53: Accept corner/H+4. Try V-2.
        DB VP2HP2, HP2, 55, 109 ; 54: Accept corner/V-2. Try H+2.
        DB VM1HM2, VM1, 56, 94 ; 55: Accept corner/H+2. Try V-1.
        DB VP1HP1, HP1, 57, 113 ; 56: Accept corner/V-1. Try H+1.
        DB HM1, NOADJ, 0F9H, 0FBH ; 57: Accept corner/H+1. Done.

; Lower right corner:

        DB VP8HM8, HM8, 59, 117 ; 58: Accept corner/V-8. Try H-8.
        DB VM4HP8, VM4, 60, 86 ; 59: Accept corner/H-8. Try V-4.
        DB VP4HM4, HM4, 61, 121 ; 60: Accept corner/V-4. Try H-4.
        DB VM2HP4, VM2, 62, 90 ; 61: Accept corner/H-4. Try V-2.
        DB VP2HM2, HM2, 63, 125 ; 62: Accept corner/V-2. Try H-2.
        DB VM1HP2, VM1, 64, 94 ; 63: Accept corner/H-2. Try V-1.
        DB VP1HM1, HM1, 65, 129 ; 64: Accept corner/V-1. Try H-1.
        DB HP1, NOADJ, 0FAH, 0FBH ; 65: Accept corner/H-1. Done.

; Upper edge:

        DB VM8HP8, HP8, 67, 19 ; 66: Accept central/V+8 as best. Try H+8.
        DB HMG, HMG, 69, 68 ; 67: H+8 worse/better than central. Try H-8.
        DB VP4HPG, VP4, 70, 70 ; 68: Accept H+8/H-8 as best. Try V+4.
        DB VP4HP8, VP4, 70, 70 ; 69: Accept central/H-8 as best. Try V+4.
        DB VM4HP4, HP4, 71, 25 ; 70: Accept central/V+4 as best. Try H+4.
        DB HM8, HM8, 73, 72 ; 71: H+4 worse/better than central. Try H-4.
        DB VP2HP8, VP2, 74, 74 ; 72: Accept H+4/H-4 as best. Try V+2.
        DB VP2HP4, VP2, 74, 74 ; 73: Accept central/H-4 as best. Try V+2.
        DB VM2HP2, HP2, 75, 31 ; 74: Accept central/V+2 as best. Try H+2.
        DB HM4, HM4, 77, 76 ; 75: H+2 worse/better than central. Try H-2.
        DB VP1HP4, VP1, 78, 78 ; 76: Accept H+2/H-2 as best. Try V+1.
        DB VP1HP2, VP1, 78, 78 ; 77: Accept central/H-2 as best. Try V+1.
        DB VM1HP1, HP1, 79, 13 ; 78: Accept central/V+1 as best. Try H+1.
        DB HM2, HM2, 81, 80 ; 79: H+1 worse/better than central. Try H-1.
        DB HP2, NOADJ, 0F7H, 0F7H ; 80: Accept H+1/H-1 as best. Done.
        DB HP1, NOADJ, 0F7H, 0F7H ; 81: Accept central/H-1 as best. Done.

; Lower edge:

        DB VP8HP8, HP8, 83, 19 ; 82: Accept central/V-8 as best. Try H+8.
        DB HMG, HMG, 85, 84 ; 83: H+8 worse/better than central. Try H-8.
        DB VM4HPG, VM4, 86, 86 ; 84: Accept H+8/H-8 as best. Try V-4.
        DB VM4HP8, VM4, 86, 86 ; 85: Accept central/H-8 as best. Try V-4.
        DB VP4HP4, HP4, 87, 25 ; 86: Accept central/V-4 as best. Try H+4.
        DB HM8, HM8, 89, 88 ; 87: H+4 worse/better than central. Try H-4.
        DB VM2HP8, VM2, 90, 90 ; 88: Accept H+4/H-4 as best. Try V-2.
        DB VM2HP4, VM2, 90, 90 ; 89: Accept central/H-4 as best. Try V-2.
        DB VP2HP2, HP2, 91, 31 ; 90: Accept central/V-2 as best. Try H+2.
        DB HM4, HM4, 93, 92 ; 91: H+2 worse/better than central. Try H-2.
        DB VM1HP4, VM1, 94, 94 ; 92: Accept H+2/H-2 as best. Try V-1.
        DB VM1HP2, VM1, 94, 94 ; 93: Accept central/H-2 as best. Try V-1.
        DB VP1HP1, HP1, 95, 13 ; 94: Accept central/V-1 as best. Try H+1.
        DB HM2, HM2, 97, 96 ; 95: H+1 worse/better than central. Try H-1.
        DB HP2, NOADJ, 0FBH, 0FBH ; 96: Accept H+1/H-1 as best. Done.
        DB HP1, NOADJ, 0FBH, 0FBH ; 97: Accept central/H-1 as best. Done.

; Left edge:

        DB VMG, VMG, 100, 99 ; 98: V+8 worse/better than central. Try V-8.
        DB VPGHP8, HP8, 101, 101 ; 99: Accept V+8/V-8 as best. Try H+8.
        DB VP8HP8, HP8, 101, 101 ; 100: Accept central/V-8 as best. Try H+8.
        DB VP4HM8, VP4, 102, 22 ; 101: Accept central/H+8 as best. Try V+4.
        DB VM8, VM8, 104, 103 ; 102: V+4 worse/better than central. Try V-4.
        DB VP8HP4, HP4, 105, 105 ; 103: Accept V+4/V-4 as best. Try H+4.
        DB VP4HP4, HP4, 105, 105 ; 104: Accept central/V-4 as best. Try H+4.
        DB VP2HM4, VP2, 106, 28 ; 105: Accept central/H+4 as best. Try V+2.
        DB VM4, VM4, 108, 107 ; 106: V+2 worse/better than central. Try V-2.
        DB VP4HP2, HP2, 109, 109 ; 107: Accept V+2/V-2 as best. Try H+2.
        DB VP2HP2, HP2, 109, 109 ; 108: Accept central/V-2 as best. Try H+2.
        DB VP1HM2, VP1, 110, 10 ; 109: Accept central/H+2 as best. Try V+1.
        DB VM2, VM2, 112, 111 ; 110: V+1 worse/better than central. Try V-1.
        DB VP2HP1, HP1, 113, 113 ; 111: Accept V+1/V-1 as best. Try H+1.
        DB VP1HP1, HP1, 113, 113 ; 112: Accept central/V-1 as best. Try H+1.
        DB HM1, NOADJ, 0FDH, 0FDH ; 113: Accept central/H+1 as best. Done.

; Right edge:

        DB VPG, VPG, 116, 115 ; 114: V-8 worse/better than central. Try V+8.
        DB VMGHM8, HM8, 117, 117 ; 115: Accept V-8/V+8 as best. Try H-8.
        DB VM8HM8, HM8, 117, 117 ; 116: Accept central/V+8 as best. Try H-8.
        DB VP4HP8, VP4, 118, 22 ; 117: Accept central/H-8 as best. Try V+4.
        DB VM8, VM8, 120, 119 ; 118: V+4 worse/better than central. Try V-4.
        DB VP8HM4, HM4, 121, 121 ; 119: Accept V+4/V-4 as best. Try H-4.
        DB VP4HM4, HM4, 121, 121 ; 120: Accept central/V-4 as best. Try H-4.
        DB VP2HP4, VP2, 122, 28 ; 121: Accept central/H-4 as best. Try V+2.
        DB VM4, VM4, 124, 123 ; 122: V+2 worse/better than central. Try V-2.
        DB VP4HM2, HM2, 125, 125 ; 123: Accept V+2/V-2 as best. Try H-2.
        DB VP2HM2, HM2, 125, 125 ; 124: Accept central/V-2 as best. Try H-2.
        DB VP1HP2, VP1, 126, 10 ; 125: Accept central/H-2 as best. Try V+1.
        DB VM2, VM2, 128, 127 ; 126: V+1 worse/better than central. Try V-1.
        DB VP2HM1, HM1, 129, 129 ; 127: Accept V+1/V-1 as best. Try H-1.
        DB VP1HM1, HM1, 129, 129 ; 128: Accept central/V-1 as best. Try H-1.
        DB HP1, NOADJ, 0FEH, 0FEH ; 129: Accept central/H-1 as best. Done.
; Exhaustive search, radius 1 here, reaching out to radius 2 further below.
;   . . . . .
;   . 2 5 3 .     C = center.
;   . 7 C 8 .
;   . 4 6 1 .     # = order to try additional candidates.
;   . . . . .
;
; Each rule is four bytes, as can be read from the rules themselves:
;   byte 0: adjustment code to apply when the previous best is still best,
;   byte 1: adjustment code to apply when the candidate just tried is better,
;   byte 2: next state when the previous best is still best,
;   byte 3: next state when the candidate just tried is better.
; Adjustment codes are the V../H.. symbols defined with
; FullPelMotionVectorAdjustment.  A next state of 0FFH appears only on rules
; whose comment says "quit".
; The order of the words "better/worse" in the per-rule comments does not
; always follow the byte order above.

FIRST_HEURISTIC_EXHAUSTIVE = 130

        DB VM2HM2, VM2HM2, 131, 138 ; 130: #1 worse/better than C. Try #2.
        DB HP2, HP2, 132, 145 ; 131: #2 worse/better than C. Try #3.
        DB VP2HM2, VP2HM2, 133, 151 ; 132: #3 worse/better than C. Try #4.
        DB VM2HP1, VM2HP1, 134, 156 ; 133: #4 worse/better than C. Try #5.
        DB VP2, VP2, 135, 160 ; 134: #5 worse/better than C. Try #6.
        DB VM1HM1, VM1HM1, 136, 163 ; 135: #6 worse/better than C. Try #7.
        DB HP2, HP2, 137, 165 ; 136: #7 worse/better than C. Try #8.
        DB HM1, HP1, 0FFH, 166 ; 137: If C best, quit. If 8 best, keep going.
        DB HP2, HP2, 139, 145 ; 138: #2 worse/better than #1. Try #3.
        DB VP2HM2, VP2HM2, 140, 151 ; 139: #3 worse/better than #1. Try #4.
        DB VM2HP1, VM2HP1, 141, 156 ; 140: #4 worse/better than #1. Try #5.
        DB VP2, VP2, 142, 160 ; 141: #5 worse/better than #1. Try #6.
        DB VM1HM1, VM1HM1, 143, 163 ; 142: #6 worse/better than #1. Try #7.
        DB HP2, HP2, 144, 165 ; 143: #7 worse/better than #1. Try #8.
        DB HP1, HP1, 199, 166 ; 144: #8 worse/better than #1. Take best, go on.
        DB VP2HM2, VP2HM2, 146, 151 ; 145: #3 worse/better than #2. Try #4.
        DB VM2HP1, VM2HP1, 147, 156 ; 146: #4 worse/better than #2. Try #5.
        DB VP2, VP2, 148, 160 ; 147: #5 worse/better than #2. Try #6.
        DB VM1HM1, VM1HM1, 149, 163 ; 148: #6 worse/better than #2. Try #7.
        DB HP2, HP2, 150, 165 ; 149: #7 worse/better than #2. Try #8.
        DB HM3, HP1, 208, 166 ; 150: #8 worse/better than #2. Take best, go on.
        DB VM2HP1, VM2HP1, 152, 156 ; 151: #4 worse/better than #3. Try #5.
        DB VP2, VP2, 153, 160 ; 152: #5 worse/better than #3. Try #6.
        DB VM1HM1, VM1HM1, 154, 163 ; 153: #6 worse/better than #3. Try #7.
        DB HP2, HP2, 155, 165 ; 154: #7 worse/better than #3. Try #8.
        DB HP1, HP1, 217, 166 ; 155: #8 worse/better than #3. Take best, go on.
        DB VP2, VP2, 157, 160 ; 156: #5 worse/better than #4. Try #6.
        DB VM1HM1, VM1HM1, 158, 163 ; 157: #6 worse/better than #4. Try #7.
        DB HP2, HP2, 159, 165 ; 158: #7 worse/better than #4. Try #8.
        DB HM3, HP1, 190, 166 ; 159: #8 worse/better than #4. Take best, go on.

        DB VM1HM1, VM1HM1, 161, 163 ; 160: #6 worse/better than #5. Try #7.
        DB HP2, HP2, 162, 165 ; 161: #7 worse/better than #5. Try #8.
        DB VM2HM1, HP1, 184, 166 ; 162: #8 worse/better than #5. Take best, go on.
        DB HP2, HP2, 164, 165 ; 163: #7 worse/better than #6. Try #8.
; When #6 (directly below C) stays best, VP2HM1 moves to the candidate
; directly below it, which is "1" of the group whose first rule is 178.  The
; next state is therefore 178, in the same way that 162 continues at 184 and
; 165 continues at 172.  (This rule previously named 176, a final rule of the
; left-hand group, which ended the search after that single candidate.)
        DB VP2HM1, HP1, 178, 166 ; 164: #8 worse/better than #6. Take best, go on.
        DB HM3, HP1, 172, 166 ; 165: #8 worse/better than #7. Take best, go on.

;   . . . . .     C = center.
;   . ~ ~ ~ 2     ~ = tried, but not as good.
;   . ~ C X 1     X = best so far.
;   . ~ ~ ~ 3     # = order to try additional candidates.
;   . . . . .

        DB VM1, VM1, 167, 169 ; 166: #1 better/worse than X. Try #2.
        DB VP2, VP2, 168, 171 ; 167: #2 better/worse than X. Try #3.
        DB VM1HM1, NOADJ, 0FFH,0FFH ; 168: #3 better/worse than X. Take best, quit.
        DB VP2, VP2, 170, 171 ; 169: #2 better/worse than #1. Try #3.
        DB VM1, NOADJ, 0FFH,0FFH ; 170: #3 better/worse than #1. Take best, quit.
        DB VM2, NOADJ, 0FFH,0FFH ; 171: #3 better/worse than #2. Take best, quit.

;   . . . . .     C = center.
;   2 ~ ~ ~ .     ~ = tried, but not as good.
;   1 X C ~ .     X = best so far.
;   3 ~ ~ ~ .     # = order to try additional candidates.
;   . . . . .

        DB VM1, VM1, 173, 175 ; 172: #1 better/worse than X. Try #2.
        DB VP2, VP2, 174, 177 ; 173: #2 better/worse than X. Try #3.
        DB VM1HP1, NOADJ, 0FFH,0FFH ; 174: #3 better/worse than X. Take best, quit.
        DB VP2, VP2, 176, 177 ; 175: #2 better/worse than #1. Try #3.
        DB VM1, NOADJ, 0FFH,0FFH ; 176: #3 better/worse than #1. Take best, quit.
        DB VM2, NOADJ, 0FFH,0FFH ; 177: #3 better/worse than #2. Take best, quit.

;   . . . . .     C = center.
;   . ~ ~ ~ .     ~ = tried, but not as good.
;   . ~ C ~ .     X = best so far.
;   . ~ X ~ .     # = order to try additional candidates.
;   . 2 1 3 .

        DB HM1, HM1, 179, 181 ; 178: #1 better/worse than X. Try #2.
        DB HP2, HP2, 180, 183 ; 179: #2 better/worse than X. Try #3.
        DB VM1HM1, NOADJ, 0FFH,0FFH ; 180: #3 better/worse than X. Take best, quit.
        DB HP2, HP2, 182, 183 ; 181: #2 better/worse than #1. Try #3.
        DB HM1, NOADJ, 0FFH,0FFH ; 182: #3 better/worse than #1. Take best, quit.
        DB HM2, NOADJ, 0FFH,0FFH ; 183: #3 better/worse than #2. Take best, quit.

;   . 2 1 3 .     C = center.
;   . ~ X ~ .     ~ = tried, but not as good.
;   . ~ C ~ .     X = best so far.
;   . ~ ~ ~ .     # = order to try additional candidates.
;   . . . . .

        DB HM1, HM1, 185, 187 ; 184: #1 better/worse than X. Try #2.
        DB HP2, HP2, 186, 189 ; 185: #2 better/worse than X. Try #3.
        DB VP1HM1, NOADJ, 0FFH,0FFH ; 186: #3 better/worse than X. Take best, quit.
        DB HP2, HP2, 188, 189 ; 187: #2 better/worse than #1. Try #3.
        DB HM1, NOADJ, 0FFH,0FFH ; 188: #3 better/worse than #1. Take best, quit.
        DB HM2, NOADJ, 0FFH,0FFH ; 189: #3 better/worse than #2. Take best, quit.

;   . . . . .     C = center.
;   . ~ ~ ~ .     ~ = tried, but not as good.
;   1 ~ C ~ .     X = best so far.
;   2 X ~ ~ .     # = order to try additional candidates.
;   4 3 5 . .

        DB VP1, VP1, 191, 195 ; 190: #1 better/worse than X. Try #2.
        DB VP1HP1, VP1HP1, 178, 192 ; 191: #2 better/worse than X. Try #3.
        DB HM1, HM1, 193, 181 ; 192: #3 better/worse than #2. Try #4.
        DB HP2, HP2, 194, 183 ; 193: #4 better/worse than #2. Try #5.
        DB VM1HM2, NOADJ, 0FFH,0FFH ; 194: #5 better/worse than #2. Take best, quit.
        DB VP1HP1, VP1HP1, 196, 192 ; 195: #2 better/worse than #1. Try #3.
        DB HM1, HM1, 197, 181 ; 196: #3 better/worse than #1. Try #4.
        DB HP2, HP2, 198, 183 ; 197: #4 better/worse than #1. Try #5.
        DB VM2HM2, NOADJ, 0FFH,0FFH ; 198: #5 better/worse than #1. Take best, quit.

;   . . . . .     C = center.
;   . ~ ~ ~ .     ~ = tried, but not as good.
;   . ~ C ~ 1     X = best so far.
;   . ~ ~ X 2     # = order to try additional candidates.
;   . . 4 3 5

        DB VP1, VP1, 200, 204 ; 199: #1 better/worse than X. Try #2.
        DB VP1HM1, VP1HM1, 178, 201 ; 200: #2 better/worse than X. Try #3.
        DB HM1, HM1, 202, 181 ; 201: #3 better/worse than #2. Try #4.
        DB HP2, HP2, 203, 183 ; 202: #4 better/worse than #2. Try #5.
        DB VM1, NOADJ, 0FFH,0FFH ; 203: #5 better/worse than #2. Take best, quit.
        DB VP1HM1, VP1HM1, 205, 201 ; 204: #2 better/worse than #1. Try #3.
        DB HM1, HM1, 206, 181 ; 205: #3 better/worse than #1. Try #4.
        DB HP2, HP2, 207, 183 ; 206: #4 better/worse than #1. Try #5.
        DB VM2, NOADJ, 0FFH,0FFH ; 207: #5 better/worse than #1. Take best, quit.

;   4 3 5 . .     C = center.
;   2 X ~ ~ .     ~ = tried, but not as good.
;   1 ~ C ~ .     X = best so far.
;   . ~ ~ ~ .     # = order to try additional candidates.
;   . . . . .

        DB VM1, VM1, 209, 213 ; 208: #1 better/worse than X. Try #2.
        DB VM1HP1, VM1HP1, 184, 210 ; 209: #2 better/worse than X. Try #3.
        DB HM1, HM1, 211, 187 ; 210: #3 better/worse than #2. Try #4.
        DB HP2, HP2, 212, 189 ; 211: #4 better/worse than #2. Try #5.
        DB VP1HM2, NOADJ, 0FFH,0FFH ; 212: #5 better/worse than #2. Take best, quit.
        DB VM1HP1, VM1HP1, 214, 210 ; 213: #2 better/worse than #1. Try #3.
        DB HM1, HM1, 215, 187 ; 214: #3 better/worse than #1. Try #4.
        DB HP2, HP2, 216, 189 ; 215: #4 better/worse than #1. Try #5.
        DB VP2HM2, NOADJ, 0FFH,0FFH ; 216: #5 better/worse than #1. Take best, quit.

;   . . 4 3 5     C = center.
;   . ~ ~ X 2     ~ = tried, but not as good.
;   . ~ C ~ 1     X = best so far.
;   . ~ ~ ~ .     # = order to try additional candidates.
;   . . . . .

        DB VM1, VM1, 218, 222 ; 217: #1 better/worse than X. Try #2.
        DB VM1HM1, VM1HM1, 184, 219 ; 218: #2 better/worse than X. Try #3.
        DB HM1, HM1, 220, 187 ; 219: #3 better/worse than #2. Try #4.
        DB HP2, HP2, 221, 189 ; 220: #4 better/worse than #2. Try #5.
        DB VP1, NOADJ, 0FFH,0FFH ; 221: #5 better/worse than #2. Take best, quit.
        DB VM1HM1, VM1HM1, 223, 219 ; 222: #2 better/worse than #1. Try #3.
        DB HM1, HM1, 224, 187 ; 223: #3 better/worse than #1. Try #4.
        DB HP2, HP2, 225, 189 ; 224: #4 better/worse than #1. Try #5.
        DB VP2, NOADJ, 0FFH,0FFH ; 225: #5 better/worse than #1. Take best, quit.

FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR = 226

        DB VP1HP1, VP1HP1, 130, 130 ; 226: Redoing ctr, away from limiting edge.

; Six unused bytes; they bring the state engine tables to 896 bytes.
        DB ?, ?, ?, ?, ?, ?
; Table of values to add to SWDs for half pel reference macroblocks, to cause
; those that are off the edge of the frame to produce artificially high SWDs.
; (64 bytes;3040:3103)
;
; 16 DWORD entries.  In entry N (0..15), byte K is 000H when bit K of N is set
; and 0FFH when it is clear: entry 0 is all 0FFH bytes, entry 15 is zero.
InvalidateBadHalfPelMVs LABEL DWORD

        DD 0FFFFFFFFH, 0FFFFFF00H, 0FFFF00FFH, 0FFFF0000H
        DD 0FF00FFFFH, 0FF00FF00H, 0FF0000FFH, 0FF000000H
        DD 000FFFFFFH, 000FFFF00H, 000FF00FFH, 000FF0000H
        DD 00000FFFFH, 00000FF00H, 0000000FFH, 000000000H
; Tables (interleaved) to select case from next table (below these) to drive
; the weighting of the future and past predictions in the construction of
; B-frame reference blocks.
; (448 bytes;3104:3551)
;
; The data is 224 byte pairs.  The first byte of each pair belongs to
; VertWtSel and the second to HorzWtSel, so HorzWtSel is VertWtSel + 1 and
; both tables are stepped two bytes per entry.
; Run lengths, in pairs: 50 + 14 + 17 + 14 + 51 + 14 + 1 + 14 + 49 = 224.
VertWtSel LABEL BYTE
        DB 0
HorzWtSel LABEL BYTE
        DB 240
        DB 49 DUP (0, 240)      ; With the pair above: 50 pairs of (0, 240).
        DB  1,   0,  1,   0
        DB  2,  16,  2,  16
        DB  3,  32,  3,  32
        DB  4,  48,  4,  48
        DB  5,  64,  5,  64
        DB  6,  80,  6,  80
        DB  7,  96,  7,  96
        DB 17 DUP (8, 112)
        DB  9, 128,  9, 128
        DB 10, 144, 10, 144
        DB 11, 160, 11, 160
        DB 12, 176, 12, 176
        DB 13, 192, 13, 192
        DB 14, 208, 14, 208
        DB 15, 224, 15, 224
        DB 18 DUP (0, 240)
; Chroma starts here
        DB 31 DUP (0, 240)
; Luma ends here
        DB  2 DUP (0, 240)
        DB  1,   0,  1,   0
        DB  2,  16,  2,  16
        DB  3,  32,  3,  32
        DB  4,  48,  4,  48
        DB  5,  64,  5,  64
        DB  6,  80,  6,  80
        DB  7,  96,  7,  96
        DB  8, 112
        DB  9, 128,  9, 128
        DB 10, 144, 10, 144
        DB 11, 160, 11, 160
        DB 12, 176, 12, 176
        DB 13, 192, 13, 192
        DB 14, 208, 14, 208
        DB 15, 224, 15, 224
        DB 49 DUP (0, 240)
; Table indexed by VertWtSel and HorzWtSel to get index of weight to apply to
; future and past predictions in the construction of B-frame reference blocks
; for frame differencing.
; (264 bytes;3552:3815)
;
; Indexed by VertWtSel[VMV]+HorzWtSel[HMV]+N to get idx of weight for line N.

; Weight indices.  Each is a multiple of 8.
P8F0 =  0*8
F1P7 =  1*8
F2P6 =  2*8
F3P5 =  3*8
F4P4 =  4*8
F5P3 =  5*8
F6P2 =  6*8
F7P1 =  7*8
F8P0 =  8*8
P1F7 =  9*8
P2F6 = 10*8
P3F5 = 11*8
P4F4 = 12*8
P5F3 = 13*8
P6F2 = 14*8
P7F1 = 15*8

; 33 rows of 8 identical bytes.  Rows of P8F0 alternate with one row for each
; of the other fifteen weights, and three rows of P8F0 close the table.
Diff_IdxRefWts LABEL BYTE

        DB 8 DUP (P8F0)
        DB 8 DUP (F1P7)
        DB 8 DUP (P8F0)
        DB 8 DUP (F2P6)
        DB 8 DUP (P8F0)
        DB 8 DUP (F3P5)
        DB 8 DUP (P8F0)
        DB 8 DUP (F4P4)
        DB 8 DUP (P8F0)
        DB 8 DUP (F5P3)
        DB 8 DUP (P8F0)
        DB 8 DUP (F6P2)
        DB 8 DUP (P8F0)
        DB 8 DUP (F7P1)
        DB 8 DUP (P8F0)
        DB 8 DUP (F8P0)
        DB 8 DUP (P8F0)
        DB 8 DUP (P1F7)
        DB 8 DUP (P8F0)
        DB 8 DUP (P2F6)
        DB 8 DUP (P8F0)
        DB 8 DUP (P3F5)
        DB 8 DUP (P8F0)
        DB 8 DUP (P4F4)
        DB 8 DUP (P8F0)
        DB 8 DUP (P5F3)
        DB 8 DUP (P8F0)
        DB 8 DUP (P6F2)
        DB 8 DUP (P8F0)
        DB 8 DUP (P7F1)
        DB 8 DUP (P8F0)
        DB 8 DUP (P8F0)
        DB 8 DUP (P8F0)
BFrmSWDState LABEL BYTE ; State engine rules for finding best motion vector.
; (48 bytes; 3816:3863)
;
; Twelve 4-byte rules.  A state number is the byte offset of its rule within
; this table (0, 4, 8, ... 44).

; 1st number: Horizontal Motion displacement to try, in half pel increments.
; 2nd number: Vertical Motion displacement to try, in half pel increments.
; 3rd number: Next state to enter if previous best is still best.
; 4th number: Next state to enter if this motion is better than previous best.
;
; The displacement is relative to the position tried last: e.g. state 8 is
; entered after (-2,0) won, and steps +4 to reach (2,0).

  DB -2, 0, 4, 8 ; 0 -- ( 0, 0) Try (-2, 0)
  DB 2, 0, 12, 12 ; 4 -- ( 0, 0) Try ( 2, 0)
  DB 4, 0, 12, 12 ; 8 -- (-2, 0) Try ( 2, 0)
  DB 0, -2, 16, 20 ; 12 -- ( N, 0) Try ( N,-2) (N = {-2,0,2})
  DB 0, 2, 24, 24 ; 16 -- ( N, 0) Try ( N, 2)
  DB 0, 4, 24, 24 ; 20 -- ( N,-2) Try ( N, 2)

; States 24..44 repeat the same left / right / up / down pattern with
; displacements of 1 and 2 instead of 2 and 4.
; NOTE(review): both exits of states 40 and 44 are 0; presumably the consumer
; treats that as "search finished" -- confirm against the code that walks
; this table (not visible in this part of the file).
  DB -1, 0, 28, 32 ; 24
  DB 1, 0, 36, 36 ; 28
  DB 2, 0, 36, 36 ; 32
  DB 0, -1, 40, 44 ; 36
  DB 0, 1, 0, 0 ; 40
  DB 0, 2, 0, 0 ; 44
; Table used by Quant RLE to navigate the zigzag order of quantized coeffs. ; Contents of this table are initialized by first entry to MMxEDTQ. In ; unlikely event of race condition, it will just get initialized by more ; than one encoder instance. ; (128 bytes; 3864:3991)
; NextZigZagCoeff[c] gives the coefficient that follows coefficient c in
; zigzag order.  It is assembled as all 0FFH; MMxEDTQ fills it in from
; InitZigZagCoeff the first time it finds NextZigZagCoeff[Q77] non-zero.
NextZigZagCoeff LABEL BYTE

  DB 128 DUP (0FFH)

; Table used to initialize the above table.
; (64 bytes: 3992:4055)
;
; The entries are the zigzag successors in order, starting with the successor
; of index 0.  The final 0 terminates the init loop and becomes the entry
; stored at NextZigZagCoeff[Q77].

InitZigZagCoeff LABEL BYTE

  DB Q01,Q10,Q20,Q11,Q02,Q03,Q12,Q21,Q30,Q40,Q31,Q22,Q13,Q04,Q05,Q14
  DB Q23,Q32,Q41,Q50,Q60,Q51,Q42,Q33,Q24,Q15,Q06,Q07,Q16,Q25,Q34,Q43
  DB Q52,Q61,Q70,Q71,Q62,Q53,Q44,Q35,Q26,Q17,Q27,Q36,Q45,Q54,Q63,Q72
  DB Q73,Q64,Q55,Q46,Q37,Q47,Q56,Q65,Q74,Q75,Q66,Q57,Q67,Q76,Q77, 0
; Constants needed by the Quant RLE phase. ; (128 bytes; 4056:4183)
; Recip2QP[QP] is floor(04000H / QP), stored twice so that a single DWORD
; fetch at Recip2QP[QP*4] yields the value in both 16-bit halves.  The entry
; for QP = 0 is a zero filler.

RECIP2QP_PAIR MACRO Recip
  WORD Recip, Recip
ENDM

Recip2QP LABEL DWORD
  RECIP2QP_PAIR 0H     ; QP = 000h
  RECIP2QP_PAIR 04000H ; QP = 001h
  RECIP2QP_PAIR 02000H ; QP = 002h
  RECIP2QP_PAIR 01555H ; QP = 003h
  RECIP2QP_PAIR 01000H ; QP = 004h
  RECIP2QP_PAIR 00CCCH ; QP = 005h
  RECIP2QP_PAIR 00AAAH ; QP = 006h
  RECIP2QP_PAIR 00924H ; QP = 007h
  RECIP2QP_PAIR 00800H ; QP = 008h
  RECIP2QP_PAIR 0071CH ; QP = 009h
  RECIP2QP_PAIR 00666H ; QP = 00Ah
  RECIP2QP_PAIR 005D1H ; QP = 00Bh
  RECIP2QP_PAIR 00555H ; QP = 00Ch
  RECIP2QP_PAIR 004ECH ; QP = 00Dh
  RECIP2QP_PAIR 00492H ; QP = 00Eh
  RECIP2QP_PAIR 00444H ; QP = 00Fh
  RECIP2QP_PAIR 00400H ; QP = 010h
  RECIP2QP_PAIR 003C3H ; QP = 011h
  RECIP2QP_PAIR 0038EH ; QP = 012h
  RECIP2QP_PAIR 0035EH ; QP = 013h
  RECIP2QP_PAIR 00333H ; QP = 014h
  RECIP2QP_PAIR 0030CH ; QP = 015h
  RECIP2QP_PAIR 002E8H ; QP = 016h
  RECIP2QP_PAIR 002C8H ; QP = 017h
  RECIP2QP_PAIR 002AAH ; QP = 018h
  RECIP2QP_PAIR 0028FH ; QP = 019h
  RECIP2QP_PAIR 00276H ; QP = 01Ah
  RECIP2QP_PAIR 0025EH ; QP = 01Bh
  RECIP2QP_PAIR 00249H ; QP = 01Ch
  RECIP2QP_PAIR 00234H ; QP = 01Dh
  RECIP2QP_PAIR 00222H ; QP = 01Eh
  RECIP2QP_PAIR 00210H ; QP = 01Fh
; Skip over space to get to where the following tables can go.  They will
; hit the cache at the same point as a portion of the StateEngine states
; that aren't used in the heuristic ME mode.
; (2056 bytes; 4184:6239)
;
; The size is load-bearing: it fixes the segment offsets quoted in the
; "(N bytes; start:end)" notes on the tables that follow.

  DB 2056 DUP (?) ; Static space place-holder.
; Table to select base address in next table below to use for particular block ; of macroblock. First column provides address of base element of HorzWtSel ; to use to map horizontal MV to list of weighting indices to use. ; Second ; column is similar, but for Vertical MV. Third and fourth columns not used. ; 6 rows; one for each block in a macroblock. ; (88 bytes; 6240:6327)
; Six rows, nominally four DWORDs each: HorzWtSel base, VertWtSel base, and
; two unused columns (0DEADBEEFH filler).  UpDownBlkPosition labels the second
; column, so both labels are indexed with the same 16-byte row stride.
; Rows 1-4 select the +0 / +32 halves (biased by -64); rows 5 and 6 both use
; the +128 entries.
; The fourth column of row 5 is reused to hold BlkEmptyFlag, and row 6 has
; only its two used columns, giving 88 bytes in total.

LeftRightBlkPosition LABEL DWORD
  DD HorzWtSel+0-64
UpDownBlkPosition LABEL DWORD
  DD VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH
  DD HorzWtSel+32-64, VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH
  DD HorzWtSel+0-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH
  DD HorzWtSel+32-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH
  DD HorzWtSel+128, VertWtSel+128, 0DEADBEEFH
BlkEmptyFlag LABEL BYTE ; sneak this in here (fills the unused 4th column)
  DB 16, 0, 32, 0
  DD HorzWtSel+128, VertWtSel+128
; The following table, indexed by MBEdgeType&7, returns a mask which is used to ; zero-out the motion vectors for predictors that are off the edge of the ; frame. The index is a 3 bit value, each bit being set if the macroblock ; is NOT on the corresponding edge. 1 == left; 2 == right; 4 == top; ; The value gotten out is (where A==left; B==above; C==above right): ; <mask(A) mask(A) mask(C) mask(C) mask(B) mask(B) mask(A) mask(A)> ; The mask is 0xFF if the corresponding remote block is NOT off the edge, and ; 0x00 if it is off the edge. ; (32 bytes: 6328: 6359)
; One DWORD per (MBEdgeType & 7).  Byte order, high to low, is
; <mask(A) mask(C) mask(B) mask(A)>; the consumer doubles each byte with
; punpcklbw to cover both components of each vector.
; Impossible edge combinations hold 0DEADBEEFH filler.

ValidRemoteVectors LABEL DWORD
  DWORD 0DEADBEEFH ; 0: Can't be on left and right edges at once.
  DWORD 0FF0000FFH ; 1: Top right corner.  Only A (left) is available.
  DWORD 000000000H ; 2: Top left corner.   No neighbour is available.
  DWORD 0FF0000FFH ; 3: Top edge.          Only A (left) is available.
  DWORD 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  DWORD 0FF00FFFFH ; 5: Right edge.        C (above right) is off the frame.
  DWORD 000FFFF00H ; 6: Left edge.         A (left) is off the frame.
  DWORD 0FFFFFFFFH ; 7: Central macroblock.
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
; to be subtracted with saturation from the predicted motion vector for
; extended motion vector search.  Since saturation occurs at 0, the values here
; are such that the motion vectors are biased to the appropriate point for the
; clamping effect.  The index is a 4 bit value, each bit being set if the
; macroblock is NOT on the corresponding edge.  1 == left;  2 == right;
; 4 == top;  8 == bottom.  The 8 values being calculated are as follows:
;   [ 0: 7] -- HMV lower limit for signature search
;   [ 8:15] -- HMV lower limit
;   [16:23] -- HMV upper limit for signature search
;   [24:31] -- HMV upper limit
;   [32:39] -- VMV lower limit for signature search
;   [40:47] -- VMV lower limit
;   [48:55] -- VMV upper limit for signature search
;   [56:63] -- VMV upper limit
; (88 bytes: 6360:6447)
;
; Entries 0..4 are impossible edge combinations and are not stored, so the
; table starts at entry 5 and must be addressed as
; EMV_ClampLowerEnd[MBEdgeType*8-40].

EMV_ClampLowerEnd LABEL DWORD
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  BYTE 87, 94, 97, 100, ; 5: Bottom right corner.
       87, 94, 97, 100
  BYTE 119, 126, 97, 100, ; 6: Bottom left corner.
       87, 94, 97, 100
  BYTE 87, 94, 97, 100, ; 7: Bottom edge.
       87, 94, 97, 100
  DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
  BYTE 87, 94, 97, 100, ; 9: Top right corner.
       119, 126, 97, 100
  BYTE 119, 126, 97, 100, ; 10: Top left corner.
       119, 126, 97, 100
  BYTE 87, 94, 97, 100, ; 11: Top edge.
       119, 126, 97, 100
  DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
  BYTE 87, 94, 97, 100, ; 13: Right edge.
       87, 94, 97, 100
  BYTE 119, 126, 97, 100, ; 14: Left edge.
       87, 94, 97, 100
  BYTE 87, 94, 97, 100, ; 15: Central macroblock.
       87, 94, 97, 100
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
; to be added with saturation to the result of the application of the
; preceding table, to clamp the upper limit on the motion vector search
; parameters.  Since saturation occurs at 255, the values here are such that
; the motion vectors are biased to the appropriate point for the clamping
; effect.
; (88 bytes: 6448:6535)
;
; Byte layout and indexing match EMV_ClampLowerEnd: entries 0..4 are not
; stored, so address this as EMV_ClampUpperEnd[MBEdgeType*8-40].

EMV_ClampUpperEnd LABEL DWORD
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  BYTE 184, 193, 216, 225, ; 5: Bottom right corner.
       184, 193, 216, 225
  BYTE 216, 225, 184, 193, ; 6: Bottom left corner.
       184, 193, 216, 225
  BYTE 184, 193, 184, 193, ; 7: Bottom edge.
       184, 193, 216, 225
  DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
  BYTE 184, 193, 216, 225, ; 9: Top right corner.
       216, 225, 184, 193
  BYTE 216, 225, 184, 193, ; 10: Top left corner.
       216, 225, 184, 193
  BYTE 184, 193, 184, 193, ; 11: Top edge.
       216, 225, 184, 193
  DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
  BYTE 184, 193, 216, 225, ; 13: Right edge.
       184, 193, 184, 193
  BYTE 216, 225, 184, 193, ; 14: Left edge.
       184, 193, 184, 193
  BYTE 184, 193, 184, 193, ; 15: Central macroblock.
       184, 193, 184, 193
; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
; to be subtracted without saturation (psubb) from the result of the
; application of the preceding table, to return the motion vector search
; parameters to the proper range for subsequent use.
; (88 bytes: 6536:6623)
;
; Byte layout and indexing match EMV_ClampLowerEnd: entries 0..4 are not
; stored, so address this as EMV_RestoreRange[MBEdgeType*8-40].

EMV_RestoreRange LABEL DWORD
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
;  DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  BYTE 120, 255, 88, 225, ; 5: Bottom right corner.
       120, 255, 88, 225
  BYTE 120, 255, 56, 193, ; 6: Bottom left corner.
       120, 255, 88, 225
  BYTE 120, 255, 56, 193, ; 7: Bottom edge.
       120, 255, 88, 225
  DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
  BYTE 120, 255, 88, 225, ; 9: Top right corner.
       120, 255, 56, 193
  BYTE 120, 255, 56, 193, ; 10: Top left corner.
       120, 255, 56, 193
  BYTE 120, 255, 56, 193, ; 11: Top edge.
       120, 255, 56, 193
  DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
  BYTE 120, 255, 88, 225, ; 13: Right edge.
       120, 255, 56, 193
  BYTE 120, 255, 56, 193, ; 14: Left edge.
       120, 255, 56, 193
  BYTE 120, 255, 56, 193, ; 15: Central macroblock.
       120, 255, 56, 193
; Tables indexed by indices fetched from Diff_IdxRefWts.  These tables return
; a multiplier to apply to past or future predictions to construct the
; B-frame candidate reference blocks.
; (128 bytes;6624:6751)
;
; Sixteen 8-byte entries; each byte is either 0FFH or 000H.  The index values
; stored in Diff_IdxRefWts are already scaled by 8, i.e. they are byte
; offsets into this table.

FutureWt_FF_or_00 LABEL DWORD

  DD 000000000H, 000000000H
  DD 000000000H, 0FF000000H
  DD 000000000H, 0FFFF0000H
  DD 000000000H, 0FFFFFF00H
  DD 000000000H, 0FFFFFFFFH
  DD 0FF000000H, 0FFFFFFFFH
  DD 0FFFF0000H, 0FFFFFFFFH
  DD 0FFFFFF00H, 0FFFFFFFFH
  DD 0FFFFFFFFH, 0FFFFFFFFH
  DD 0FFFFFFFFH, 000FFFFFFH
  DD 0FFFFFFFFH, 00000FFFFH
  DD 0FFFFFFFFH, 0000000FFH
  DD 0FFFFFFFFH, 000000000H
  DD 000FFFFFFH, 000000000H
  DD 00000FFFFH, 000000000H
  DD 0000000FFH, 000000000H
MMXMEDATA ENDS
;=============================================================================
.CODE EDTQ
ASSUME cs : FLAT ASSUME ds : FLAT ASSUME es : FLAT ASSUME fs : FLAT ASSUME gs : FLAT ASSUME ss : FLAT
EXTERN MMxDoForwardDCT:NEAR EXTERN MMxDoForwardDCTx:NEAR EXTERN MMxDoForwardDCTy:NEAR IFDEF H261 ELSE EXTERN MMxDoBFrameLumaBlocks:NEAR EXTERN MMxDoBFrameChromaBlocks:NEAR ENDIF
; MMxEDTQ -- entry point.  This part of the routine saves four registers and
; defines the offsets used to fetch the C arguments.
;
; The arguments are not read through the PROC parameter names.  They are
; fetched as [esi+xxx_arg], where esi holds esp as it is after the four
; pushes below.  Each offset is therefore RegStoSize (16 bytes of saved
; registers) + 4 (return address) + 4 * argument position.
;
; NOTE(review): the equate list has 24 slots (it includes
; SignatureBaseAddress_arg at RegStoSize + 20), while the PROC parameter list
; names only 23 and has no counterpart for the signature pointer.  The equate
; list is what the code uses; confirm it against the C prototype.

MMxEDTQ proc C AMBAS: DWORD,
ATarg: DWORD,
APrev: DWORD,
ABTarg: DWORD,
AWtFwd: DWORD,
AWtBwd: DWORD,
AFrmWd: DWORD,
ADoHalf: DWORD,
ADoBlk: DWORD,
ADoSF: DWORD,
ADoAP: DWORD,
ADoB: DWORD,
ADoLuma: DWORD,
ADoExtMV:DWORD,
AQP: DWORD,
ABQP: DWORD,
AB0VecT: DWORD,
ASpaFilT:DWORD,
ASpaFilD:DWORD,
ASWDTot: DWORD,
ABSWDTot:DWORD,
ACodStr: DWORD,
ABCodStr:DWORD

LocalFrameSize = 1536 ; Space needed for locals

RegStoSize = 16 ; Four registers are pushed on entry (esi, edi, ebp, ebx).

; Arguments:

MBlockActionStream_arg = RegStoSize + 4
TargetFrameBaseAddress_arg = RegStoSize + 8
PreviousFrameBaseAddress_arg = RegStoSize + 12
BTargetFrameBaseAddress_arg = RegStoSize + 16
SignatureBaseAddress_arg = RegStoSize + 20
WeightForwardMotion_arg = RegStoSize + 24
WeightBackwardMotion_arg = RegStoSize + 28
FrameWidth = RegStoSize + 32
DoHalfPelEstimation_arg = RegStoSize + 36
DoBlockLevelVectors_arg = RegStoSize + 40
DoSpatialFiltering_arg = RegStoSize + 44
DoAdvancedPrediction_arg = RegStoSize + 48
DoBFrame_arg = RegStoSize + 52
DoLumaBlocksInThisPass_arg = RegStoSize + 56
DoExtendedMotionVectors_arg = RegStoSize + 60
QuantizationLevel = RegStoSize + 64
BQuantizationLevel = RegStoSize + 68
BFrmZeroVectorThreshold_arg = RegStoSize + 72
SpatialFiltThreshold_arg = RegStoSize + 76
SpatialFiltDifferential_arg = RegStoSize + 80
PSWDTotal = RegStoSize + 84
PBSWDTotal = RegStoSize + 88
CodeStreamCursor_arg = RegStoSize + 92
BCodeStreamCursor_arg = RegStoSize + 96
EndOfArgList = RegStoSize + 100

; StackOffset is set to <4> around calls that are preceded by a dummy push
; and back to <0> after the matching pop.
; NOTE(review): presumably the esp-relative local definitions include
; StackOffset so they stay correct while the extra dword is on the stack --
; confirm against the local definitions (not visible here).
; CONST_384 starts out as the literal 384 and is re-equated to <ebp> once ebp
; has been loaded with PITCH.
StackOffset TEXTEQU <0>
CONST_384 TEXTEQU <384>

  push esi
  push edi
  push ebp
  push ebx
; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
;
; esi keeps the incoming esp (after the four pushes) so the arguments can
; still be read as [esi+xxx_arg] once esp has been moved.  esp is rounded
; down to a 4K boundary and then lowered by a further 0FE0H bytes.

  mov esi,esp
  and esp,0FFFFF000H
  sub esp,000000FE0H
IFDEF H261

; ---- H.261 build: copy arguments into locals and derive constants. ----

  mov ebp,PITCH
CONST_384 TEXTEQU <ebp>

  mov eax,[esi+SpatialFiltThreshold_arg]
  mov ebx,[esi+SpatialFiltDifferential_arg]
  mov SpatialFiltThreshold,eax
  mov SpatialFiltDifferential,ebx
  mov ecx,[esi+TargetFrameBaseAddress_arg]
  mov ebx,[esi+SignatureBaseAddress_arg]
  sub ecx,ebx ; ecx = target base - signature base
  mov eax,[esi+TargetFrameBaseAddress_arg]
  mov SigToTarget,ecx
  add ecx,PITCH*80+64
  neg ecx ; ecx = -(SigToTarget + PITCH*80+64)
  mov TargetToSig_Debiased,ecx
  mov ebx,[esi+PreviousFrameBaseAddress_arg]
  mov PreviousFrameBaseAddress,ebx
  mov TargetFrameBaseAddress,eax
  sub ebx,eax ; ebx = previous base - target base
  mov ecx,[esi+QuantizationLevel]
  mov TargToRef,ebx
  mov eax,[esi+CodeStreamCursor_arg]
  mov ebx,ecx
  mov CodeStreamCursor,eax
  shl ebx,16
  xor edx,edx ; edx = 0, used below to clear several locals
  or ebx,ecx ; ebx = QP in both 16-bit halves
  mov ecx,Recip2QP[ecx*4]
  mov QPDiv2,ebx
  mov Recip2QPToUse,ecx
  mov eax,[esi+DoSpatialFiltering_arg]
  mov DoExtendedMotionVectors,edx ; Always 0 in the H.261 build.
  test eax,eax
  je @f
  mov eax,3 ; Any non-zero request is stored as 3.
@@:
  mov DoSpatialFiltering,al
  mov SWDTotal,edx
  mov BestMBHalfPelMV,edx
  mov ebx,PreviousFrameBaseAddress
  mov BlockAbove[0],edx
  sub ebx,16
  mov edx,[esi+FrameWidth]
  mov SpatiallyFilteredMB,ebx ; = PreviousFrameBaseAddress - 16
  imul edx,-SIZEOF T_MacroBlockActionDescr/16
  add edx,2*SIZEOF T_Blk
  mov eax,14 ; 14 if restricted MVs and doing heuristic ME.
  mov BlockAbove[4],edx ; = -FrameWidth*(descr size/16) + 2*SIZEOF T_Blk
  mov DoHeuristicME,eax
ELSE

; ---- Non-H.261 build: copy arguments into locals and derive constants. ----

  mov eax,[esi+DoExtendedMotionVectors_arg]
  test eax,eax
  je @f
  mov eax,7 ; Any non-zero request is stored as 7.
@@:
  mov DoExtendedMotionVectors,eax
  mov eax,[esi+BFrmZeroVectorThreshold_arg]
  mov edi,[esi+WeightForwardMotion_arg]
  mov BFrmZeroVectorThreshold,eax
  mov ecx,60
  mov ebx,060606060H
  lea edx,WeightForwardMotion+128

; Copy and re-bias a 128-byte weight table, four bytes per half per pass.
; Each byte is masked to 6 bits and XORed with 60H.  Source bytes 64..127 go
; to local offsets 0..63; source bytes 0..63 go to local offsets 192..255.
; The loop body runs twice: first for the forward weights, then (via the
; "jne @b" below) for the backward weights.  ebp is scratch inside the loop
; and is reloaded with PITCH on every pass.
@@:
  mov eax,[edi+ecx]
  and eax,03F3F3F3FH ; ???
  mov ebp,[edi+ecx+64]
  and ebp,03F3F3F3FH ; ???
  xor eax,ebx
  xor ebp,ebx
  mov [edx+ecx+64],eax
  mov [edx+ecx-128],ebp
  sub ecx,4
  mov ebp,PITCH
  jge @b

; Switch to the backward table.  On the first arrival here edx still points at
; the forward table, so eax-edx is non-zero and the copy loop is re-entered.
; On the second arrival edx already equals the new value, the difference is
; zero and execution falls through.
  mov edi,[esi+WeightBackwardMotion_arg]
  mov eax,edx
  lea edx,WeightBackwardMotion+128
  mov ecx,60
  sub eax,edx
  jne @b
CONST_384 TEXTEQU <ebp>

  mov ebx,[esi+PreviousFrameBaseAddress_arg]
  mov eax,[esi+TargetFrameBaseAddress_arg]
  mov PreviousFrameBaseAddress,ebx
  mov TargetFrameBaseAddress,eax
  mov ecx,[esi+BTargetFrameBaseAddress_arg]
  sub ebx,eax ; ebx = previous base - target base
  mov TargToRef,ebx
  sub eax,ecx ; eax = target base - B target base
  mov BFrameBaseAddress,ecx
  mov BFrameToFuture,eax
  mov ecx,[esi+TargetFrameBaseAddress_arg]
  mov ebx,[esi+SignatureBaseAddress_arg]
  sub ecx,ebx ; ecx = target base - signature base
  mov edx,[esi+FrameWidth]
  mov SigToTarget,ecx
  add ecx,PITCH*80+64
  neg ecx
  imul edx,-SIZEOF T_MacroBlockActionDescr/16
  mov TargetToSig_Debiased,ecx
  mov ecx,[esi+DoBFrame_arg]
  add edx,2*SIZEOF T_Blk
  xor cl,1 ; Low byte of DoBFrame, bit 0 inverted.
  mov BlockAbove[4],edx
  mov IsPlainPFrame,cl
  mov ecx,[esi+QuantizationLevel]
  mov eax,[esi+CodeStreamCursor_arg]
  mov ebx,ecx
  mov CodeStreamCursor,eax
  mov eax,[esi+BCodeStreamCursor_arg]
  mov BCodeStreamCursor,eax
  shl ebx,16
  mov eax,[esi+DoHalfPelEstimation_arg]
  or ebx,ecx ; ebx = QP in both 16-bit halves
  mov ecx,Recip2QP[ecx*4]
  mov QPDiv2,ebx
  mov Recip2QPToUse,ecx
  mov ecx,[esi+BQuantizationLevel]
  xor edx,edx ; edx = 0, used below to clear several locals
  mov ebx,ecx
  shl ebx,16
  mov BestMBHalfPelMV,edx
  or ebx,ecx ; ebx = BQP in both 16-bit halves
  mov ecx,Recip2QP[ecx*4]
  mov BQPDiv2,ebx
  mov BRecip2QPToUse,ecx
  test eax,eax ; eax = DoHalfPelEstimation argument
  je @f
  mov eax,-4 ; Any non-zero request is stored as -4.
@@:
  mov DoHalfPelME,eax
  mov eax,[esi+DoBlockLevelVectors_arg]
  mov DoBlockLevelVectors,al
  mov eax,[esi+DoAdvancedPrediction_arg]
  mov DoAdvancedPrediction,al
  mov SWDTotal,edx
  test eax,eax ; lea below leaves these flags intact for the je.
  lea eax,[eax+14] ; 14 if restricted MVs and doing heuristic ME.
  je @f
  xor eax,eax ; 0 if unrestricted MVs and doing heuristic ME.
@@:
  mov DoHeuristicME,eax
  mov BSWDTotal,edx
  mov PendingOBMC,edx
  mov BlockAbove[0],edx
ENDIF

; ---- Common to both builds. ----

  mov eax,01E98E268H ; Default search limits: bytes 68H, E2H, 98H, 1EH.
  mov EMVLimitsForThisMB,eax
  ; ; [ 0: 7] -- HMV lower limit for sig search (biased 128)
  ; ; [ 8:15] -- HMV lower limit (signed)
  ; ; [16:23] -- HMV upper limit for sig search (biased 128)
  ; ; [24:31] -- HMV upper limit (signed)
  mov EMVLimitsForThisMB+4,eax ; Same as for HMV.
  mov edx,[esi+MBlockActionStream_arg]
  mov al,NextZigZagCoeff[Q77] ; 0FFH until the table has been built, then 0.
  test al,al
  je ZigZagCoeffInitialized
; Build NextZigZagCoeff from InitZigZagCoeff.
;   ebx -- cursor into InitZigZagCoeff
;   ecx -- index of the coefficient whose successor is being stored
;   eax -- successor just read (upper 24 bits stay zero)
; Each pass stores NextZigZagCoeff[ecx] = successor, then makes the successor
; the next index.  The loop ends after storing the terminating 0, which lands
; in NextZigZagCoeff[Q77].

  xor ecx,ecx
  lea ebx,InitZigZagCoeff
  xor eax,eax

@@:

  mov al,[ebx]
  inc ebx
  mov NextZigZagCoeff[ecx],al
  mov ecx,eax
  test eax,eax
  jne @b
ZigZagCoeffInitialized:

; Save the argument-frame pointer, then choose the pass: luma macroblock
; processing if DoLumaBlocksInThisPass is non-zero, otherwise chroma.
; edx = MBlockActionStream on entry to either path.

  mov StashESP,esi
  mov eax,[esi+DoLumaBlocksInThisPass_arg]
  test eax,eax
  jne FirstMacroBlock ; Jump if doing luma plane

  jmp FirstMacroBlock_ChromaProcessing
; Both chroma paths (intra and inter) rejoin here after the V block is done.
IntraCodedChromaProcessingDone:

IFDEF H261
ELSE
  mov al,IsPlainPFrame
  test al,al
  jne NextMacroBlock_ChromaProcessing ; Skip B-frame chroma for a plain P frame.

  mov eax,QPDiv2
  mov ebx,BQPDiv2

  call MMxDoBFrameChromaBlocks
ENDIF

NextMacroBlock_ChromaProcessing:

; CodedBlocks of the descriptor just finished is read before edx is advanced
; to the next descriptor; bit 6 set ends the pass.
; "sub edx,-SIZEOF" is an add written as a subtract.
; NOTE(review): presumably chosen because -128 fits a sign-extended 8-bit
; immediate while +128 does not -- confirm.

  mov bl,[edx].CodedBlocks
  sub edx,-SIZEOF T_MacroBlockActionDescr
  and bl,040H ; Check for end-of-stream
  jne TrulyDone
FirstMacroBlock_ChromaProcessing:

; edx -- current macroblock action descriptor.
; Intra macroblocks are transformed straight from the target frame (below);
; everything else goes to ChromaIsInterCoded.

  mov al,[edx].BlockType ; Chroma handling. Intra? Or Inter?
  mov ecx,TargetFrameBaseAddress
  cmp al,INTRA
  jne ChromaIsInterCoded

; ---- Intra: U block. ----
  mov esi,[edx].BlkU.BlkOffset
  mov StashBlockType,al
  add esi,ecx ; esi = address of U block in target frame
  push eax ; Adjust stack pointer (dummy dword; removed by the pop below)
StackOffset TEXTEQU <4>

  call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH

; NOTE(review): bl is taken from the call above; presumably it is 1 when the
; block turned out empty, so that the shift and subtract clear the block's
; CodedBlocks bit (bit 4 for U, bit 5 for V) -- confirm against
; MMxDoForwardDCT.
  shl bl,4
  mov al,[edx].CodedBlocks
  sub al,bl
  mov esi,[edx].BlkV.BlkOffset
  mov [edx].CodedBlocks,al
  mov ecx,TargetFrameBaseAddress
  add esi,ecx ; esi = address of V block in target frame

; ---- Intra: V block. ----
  call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH

  shl bl,5
  mov al,[edx].CodedBlocks
  sub al,bl
  pop ecx ; Adjust stack pointer
StackOffset TEXTEQU <0>
  mov [edx].CodedBlocks,al
  jmp IntraCodedChromaProcessingDone
ChromaIsInterCoded:

; Inter-coded chroma.  The U block and then the V block are each differenced
; against their reference (DoNonOBMCDifferencing plus the tail code here) and
; the differences are transformed with MMxDoForwardDCTx.
; On entry: al = BlockType, ecx = TargetFrameBaseAddress, edx = descriptor.

; ---- U block. ----
  mov edi,[edx].BlkU.BlkOffset ; Offset of the U block in the target frame.
  mov ebx,[edx].BlkU.MVs
  add edi,ecx ; edi = address of U block in target frame
  mov esi,[edx].BlkU.PastRef
  mov StashBlockType,al
IFDEF H261
  mov ecx,2+256*1 ; cl==2 tells SpatialLoopFilter code to do one
  ; ; block. ch==1 causes it to return to here.
  mov TargetMacroBlockBaseAddr,edi ; Store address of U block.
  cmp al,INTERSLF
  je DoSpatialFilterForChroma

ReturnFromSpatialFilterForU:

ENDIF

  call DoNonOBMCDifferencing

  ; (Finish differencing the last four lines.)
  ; NOTE(review): mm0-mm3 and mm6 are live from DoNonOBMCDifferencing
  ; (not visible here); line 7's difference is left in mm1 on exit.
  movq mm4,[edi+ebp*4] ; T4
  psrlq mm1,1
  movq mm5,[edi+PITCH*5]
  psubb mm4,mm0 ; D4 = T4 - P4
  movq mm0,[edi+PITCH*6]
  psubb mm5,mm1
  movq mm1,[edi+PITCH*7]
  pand mm2,mm6
  pand mm3,mm6
  psrlq mm2,1
  movq PelDiffsLine4,mm4 ; Store D4.
  psubb mm0,mm2
  movq PelDiffsLine5,mm5
  psrlq mm3,1
  movq PelDiffsLine6,mm0
  psubb mm1,mm3
  push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>

  call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16

; NOTE(review): bl comes back from the call above; presumably 1 when the block
; is empty, so the U bit (bit 4) is cleared in CodedBlocks -- confirm.
  shl bl,4
  mov al,[edx].CodedBlocks
  sub al,bl
  mov ecx,TargetFrameBaseAddress
  mov [edx].CodedBlocks,al
  pop edi ; Adjust stack pointer
StackOffset TEXTEQU <0>

; ---- V block. ----
  mov edi,[edx].BlkV.BlkOffset ; Offset of the V block in the target frame.
  mov ebx,[edx].BlkV.MVs
  add edi,ecx ; edi = address of V block in target frame
  mov esi,[edx].BlkV.PastRef
IFDEF H261
  mov ecx,2-256*1 ; cl==2 tells SpatialLoopFilter code to do one
  ; ; block. ch==-1 causes it to return to here.
  mov TargetMacroBlockBaseAddr,edi ; Store address of V block.
  mov al,[edx].BlockType
  cmp al,INTERSLF
  je DoSpatialFilterForChroma

ReturnFromSpatialFilterForV:

ENDIF

  call DoNonOBMCDifferencing

  ; (Finish differencing the last four lines.)
  movq mm4,[edi+ebp*4] ; T4
  psrlq mm1,1
  movq mm5,[edi+PITCH*5]
  psubb mm4,mm0 ; D4 = T4 - P4
  movq mm0,[edi+PITCH*6]
  psubb mm5,mm1
  movq mm1,[edi+PITCH*7]
  pand mm2,mm6
  pand mm3,mm6
  psrlq mm2,1
  movq PelDiffsLine4,mm4 ; Store D4.
  psubb mm0,mm2
  movq PelDiffsLine5,mm5
  psrlq mm3,1
  movq PelDiffsLine6,mm0
  psubb mm1,mm3
  push eax ; Adjust stack pointer
StackOffset TEXTEQU <4>

  call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16

  shl bl,5 ; As for U, but for the V bit (bit 5).
  mov al,[edx].CodedBlocks
  sub al,bl
  pop ecx ; Adjust stack pointer
StackOffset TEXTEQU <0>
  mov [edx].CodedBlocks,al
  jmp IntraCodedChromaProcessingDone
;============================================================================ ; Here we copy the target macroblock, and interpolate left, right, and both. ; We also accumulate the target pels for each block. Result is four partial ; sums in four packed words. After summing them all up, the final sum will ; be the sum of the 64 pels of each block, divided by 2.
NextMacroBlock:

; Luma pass: advance to the next descriptor.  CodedBlocks of the descriptor
; just finished is read before edx moves; bit 6 set ends the pass.

  mov bl,[edx].CodedBlocks
  sub edx,-SIZEOF T_MacroBlockActionDescr
  and bl,040H ; Check for end-of-stream
  jne Done

FirstMacroBlock:

; edi = address of this macroblock in the target frame (base + BlkY1 offset).
; esi = the same position in the reference frame, i.e. the 0-motion reference.

  mov edi,TargetFrameBaseAddress
  mov esi,[edx].BlkY1.BlkOffset ; Get address of next macroblock to do.
  add edi,esi
  mov esi,TargToRef
  add esi,edi
  mov TargetMacroBlockBaseAddr,edi
  mov Addr0MVRef,esi
;============================================================================
; We calculate the 0-motion SWD.  We use 32 match points per block, and
; write the result separately for each block.  If the SWD for the 0-motion
; vector is below a threshold, we don't bother searching for other possibly
; better motion vectors.
;
; ebp -- PITCH
; esi -- Address of ref block.
; edi -- Address of target block.
; edx -- MBlockActionStream
; ecx -- Not used.  Will be linearized MV in non-zero MV search.
; ebx -- CurrSWDState, i.e. FirstMEState, times 8
; eax -- Scratch
; mm7 -- Best SWD for macroblock.
; mm0-mm6 Scratch
;
; Per-macroblock ME initialisation.  Marks all six blocks coded, decides
; between heuristic and state-engine ME, and primes mm7 with the "previous
; best" SWD.

  mov cl,[edx].CodedBlocks ; Init CBP for macroblock.
  or cl,03FH ; Indicate all 6 blocks are coded.
  mov eax,DoHeuristicME ; 0 if unrestricted MVs and heur ME.
  ; ; 14 if restricted MVs and heur ME.
  ; ; 15 if suppressing heuristic ME.
  mov [edx].CodedBlocks,cl
  js IntraByDecree ; Sign flag is from the "or": taken if CodedBlocks bit 7 is set.

  xor ebx,ebx ; Avoid partial register stall.
  xor ecx,ecx
  mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
  pcmpeqd mm7,mm7 ; Init previous best SWD to huge.
  mov bl,[edx].FirstMEState ; Test for INTRA-BY-DECREE.
  sub eax,ecx ; Negative iff should do heuristic ME
  ; ; for this macroblock.
  test bl,bl
  je IntraByDecree ; FirstMEState == 0 means intra by decree.

  sar eax,31 ; eax = -1 if heuristic ME, else 0.
  psrlq mm7,2
  or ebx,eax ; -1 if doing heuristic ME.
  mov al,INTER1MV ; Speculate INTER, 1 motion vector.
  mov [edx].BlockType,al
  psrld mm7,14 ; mm7[32:63]: Previous best SWD = 0x0000FFFF.
  ; ; mm7[ 0:31]: Prev SWD that we diminish = 0x0003FFFF.
  ; ; Since we can't diminish it below 0x00020000, we
  ; ; won't take the short circuit exit from MblkEstQWA.
; At this point: ; ebp -- PITCH ; esi -- Address of upper left block of 0,0 ref area. ; edi -- Address of upper left block of target. ; edx -- MBlockActionStream ; ecx -- Scratch ; ebx -- CurrSWDState, i.e. FirstMEState. ; eax -- Scratch ; mm7 -- Previous best SWD initialized to huge (0xFFFF, 0x3FFFF). ; mm0-mm6 -- Scratch
;============================================================================ ; Compute SWD for macroblock.
ComputeMBSWD:

; Compute the SWD of the 16x16 candidate reference at esi against the target
; macroblock at edi, keeping a separate sum for each of the four 8x8 luma
; blocks.
;
; Registers at this point:
; ebp -- PITCH
; esi -- Address of upper left block of candidate ref area.
; edi -- Address of upper left block of target.
; edx -- MBlockActionStream
; ecx -- Scratch
; ebx -- CurrSWDState
; eax -- Scratch
; mm7 -- Previous best SWD.
; mm0-mm6 -- Scratch
;
; Comment tags: the hex digit is the macroblock line (0..F) and L/R is the
; left or right 8-pel half.  Odd lines use steps A-D (load ref, subtract
; target, shift left 8 to isolate the even pels, pmaddwd to square and sum).
; Even lines use steps a-c (load ref, subtract target, pmaddwd).
; UL/UR/LL/LR "+" marks an accumulation into that block's running sum.
;
; Lines 0,3,6,9,12,15 are done first.  If twice their total already exceeds
; the diminished previous best in mm7[0:31], the candidate is abandoned via
; MblkEst_EarlyOut before the other ten lines are touched.
;
; The instruction order is hand-interleaved; do not reorder.

  lea ecx,[ebp+ebp*4] ; Get PITCH*5
  lea eax,[ebp+ebp*2] ; Get PITCH*3
  movq mm0,[esi+PITCH*15] ; FL A: Ref MB, lower left block, line 15.
  psubw mm0,[edi+PITCH*15] ; FL B: Diff for lower left block, line 15.
  movq mm6,[esi+PITCH*15+8] ; FR A
  psllw mm0,8 ; FL C: Extract diffs for line 15 even pels.
  psubw mm6,[edi+PITCH*15+8] ; FR B
  pmaddwd mm0,mm0 ; FL D: Square of diffs for even pels.
  movq mm1,[esi+PITCH*9] ; 9L A
  psllw mm6,8 ; FR C
  psubw mm1,[edi+PITCH*9] ; 9L B
  pmaddwd mm6,mm6 ; FR D
  movq mm5,[esi+PITCH*9+8] ; 9R A
  psllw mm1,8 ; 9L C
  psubw mm5,[edi+PITCH*9+8] ; 9R B
  pmaddwd mm1,mm1 ; 9L D
  movq mm2,[esi+eax*4] ; CL a
  psllw mm5,8 ; 9R C
  psubw mm2,[edi+eax*4] ; CL b
  pmaddwd mm5,mm5 ; 9R D
  movq mm3,[esi+eax*4+8] ; CR a
  pmaddwd mm2,mm2 ; CL c: Square of diffs for odd pels.
  psubw mm3,[edi+eax*4+8] ; CR b
  paddusw mm0,mm1 ; LL + Accumulate SWD for lower left block.
  movq mm1,[esi+eax*1] ; 3L A
  pmaddwd mm3,mm3 ; CR c
  psubw mm1,[edi+eax*1] ; 3L B
  paddusw mm6,mm5 ; LR +
  movq mm5,[esi+eax*1+8] ; 3R A
  psllw mm1,8 ; 3L C
  psubw mm5,[edi+eax*1+8] ; 3R B
  paddusw mm0,mm2 ; LL +
  movq mm2,[esi] ; 0L a
  pmaddwd mm1,mm1 ; 3L D
  psubw mm2,[edi] ; 0L b
  paddusw mm6,mm3 ; LR +
  movq mm3,[esi+8] ; 0R a
  psllw mm5,8 ; 3R C
  psubw mm3,[edi+8] ; 0R b
  pmaddwd mm5,mm5 ; 3R D
  movq mm4,[esi+eax*2] ; 6L a
  pmaddwd mm2,mm2 ; 0L c
  psubw mm4,[edi+eax*2] ; 6L b
  pmaddwd mm3,mm3 ; 0R c
  movq PartSWDForLLBlk,mm0 ; Stash SWD for lines 9,12,15, LL blk.
  paddusw mm0,mm6 ; Sum SWD for lines 9,12,15 LL and LR.
  movq PartSWDForLRBlk,mm6 ; Stash SWD for lines 9,12,15, LR blk.
  pmaddwd mm4,mm4 ; 6L c
  movq mm6,[esi+eax*2+8] ; 6R a
  paddusw mm1,mm2 ; UL +
  psubw mm6,[edi+eax*2+8] ; 6R b
  paddusw mm5,mm3 ; UR +
  movq mm2,[esi+ebp*1] ; 1L A
  pmaddwd mm6,mm6 ; 6R c
  psubw mm2,[edi+ebp*1] ; 1L B
  paddusw mm1,mm4 ; UL +
  movq mm3,[esi+ecx*1] ; 5L A
  paddusw mm0,mm1 ; Sum partial SWD for LL, LR, and UL.
  psubw mm3,[edi+ecx*1] ; 5L B
  paddusw mm5,mm6 ; UR +
  movq mm6,[esi+ebp*4] ; 4L a
  paddusw mm0,mm5 ; Sum partial SWD for all blocks.

; Early-out test on the six lines done so far (work on lines 1, 2 and 4 is
; already interleaved with it).
  movq PartSWDForURBlk,mm5 ; Stash SWD for lines 0,3,6, UR blk.
  punpckldq mm5,mm0 ; Get low sum into high bits.
  psubw mm6,[edi+ebp*4] ; 4L b
  paddusw mm5,mm0 ; Total up SWD for every third line.
  movq mm0,[esi+ebp*2] ; 2L a
  psrlq mm5,47 ; Position, and double.
  psubw mm0,[edi+ebp*2] ; 2L b
  pcmpgtd mm5,mm7 ; Is 2 * SWD for 6 lines > prev SWD?
  pmaddwd mm0,mm0 ; 2L c
  psllw mm2,8 ; 1L C
  movdf eax,mm5
  pmaddwd mm2,mm2 ; 1L D
  test eax,eax
  jne MblkEst_EarlyOut

; Remaining lines.  Each block's total is folded to one value and subtracted
; (with unsigned saturation) from the running "previous best" in mm7.
  lea eax,[ecx+ebp*2] ; PITCH*7
  psllw mm3,8 ; 5L C
  paddusw mm1,mm2 ; UL +
  pmaddwd mm3,mm3 ; 5L D
  movq mm5,[esi+eax*1] ; 7L A
  psubw mm5,[edi+eax*1] ; 7L B
  pmaddwd mm6,mm6 ; 4L c
  movq mm2,[esi+PITCH*11+8] ; BR A
  psllw mm5,8 ; 7L C
  psubw mm2,[edi+PITCH*11+8] ; BR B
  paddusw mm1,mm3 ; UL +
  movq mm3,[esi+PITCH*13+8] ; DR A
  paddusw mm1,mm0 ; UL +
  psubw mm3,[edi+PITCH*13+8] ; DR B
  pmaddwd mm5,mm5 ; 7L D
  movq mm0,[esi+ebp*8+8] ; 8R a
  paddusw mm1,mm6 ; UL +
  psubw mm0,[edi+ebp*8+8] ; 8R b
  psllw mm2,8 ; BR C
  movq mm4,[esi+ecx*2+8] ; AR a
  paddusw mm1,mm5 ; UL +
  psubw mm4,[edi+ecx*2+8] ; AR b
  punpckldq mm6,mm1 ; Get low SWD accum to hi order of mm6.
  movq mm5,[esi+eax*2+8] ; ER a
  paddusw mm6,mm1 ; mm6[48:63] is SWD for upper left blk.
  psubw mm5,[edi+eax*2+8] ; ER b
  psrlq mm6,48 ; mm6 is SWD for upper left block.
  psubusw mm7,mm6 ; Diminish prev best SWD by cand UL blk.
  pmaddwd mm2,mm2 ; BR D
  pmaddwd mm0,mm0 ; 8R c
  psllw mm3,8 ; DR C
  movq mm1,[esi+ebp*1+8] ; 1R A
  pmaddwd mm3,mm3 ; DR D
  paddusw mm2,PartSWDForLRBlk ; LR +
  pmaddwd mm4,mm4 ; AR c
  psubw mm1,[edi+ebp*1+8] ; 1R B
  paddusw mm2,mm0 ; LR +
  movq mm0,[esi+ecx*1+8] ; 5R A
  pmaddwd mm5,mm5 ; ER c
  psubw mm0,[edi+ecx*1+8] ; 5R B
  paddusw mm2,mm3 ; LR +
  movq mm3,[esi+eax*1+8] ; 7R A
  paddusw mm2,mm4 ; LR +
  paddusw mm2,mm5 ; LR +
  psllw mm1,8 ; 1R C
  psubw mm3,[edi+eax*1+8] ; 7R B
  punpckldq mm5,mm2 ; Get low SWD accum to hi order of mm5.
  paddusw mm5,mm2 ; mm5[48:63] is SWD for lower right blk.
  pmaddwd mm1,mm1 ; 1R D
  movq mm2,[esi+ebp*2+8] ; 2R a
  psrlq mm5,48 ; mm5 is SWD for lower right block.
  psubusw mm7,mm5 ; Diminish prev best SWD by cand LR blk.
  punpckldq mm6,mm5 ; mm6[0:31] UL SWD; mm6[32:63] LR SWD.

  psubw mm2,[edi+ebp*2+8] ; 2R b
  psllw mm0,8 ; 5R C
  movq mm5,[esi+ebp*4+8] ; 4R a
  pmaddwd mm0,mm0 ; 5R D
  psubw mm5,[edi+ebp*4+8] ; 4R b
  psllw mm3,8 ; 7R C
  paddusw mm1,PartSWDForURBlk ; UR +
  pmaddwd mm3,mm3 ; 7R D
  paddusw mm1,mm0 ; UR +
  pmaddwd mm2,mm2 ; 2R c
  movq mm0,[esi+PITCH*11] ; BL A
  pmaddwd mm5,mm5 ; 4R c
  psubw mm0,[edi+PITCH*11] ; BL B
  paddusw mm1,mm3 ; UR +
  movq mm3,[esi+ecx*2] ; AL a
  paddusw mm1,mm2 ; UR +
  psubw mm3,[edi+ecx*2] ; AL b
  paddusw mm1,mm5 ; UR +
  pmaddwd mm3,mm3 ; AL c
  psllw mm0,8 ; BL C
  movq mm2,[esi+PITCH*13] ; DL A
  pmaddwd mm0,mm0 ; BL D
  psubw mm2,[edi+PITCH*13] ; DL B
  punpckldq mm5,mm1 ; Get low SWD accum to hi order of mm5.
  movq mm4,[esi+ebp*8] ; 8L a
  paddusw mm5,mm1 ; mm5[48:63] is SWD for upper right blk.
  psubw mm4,[edi+ebp*8] ; 8L b
  psllw mm2,8 ; DL C
  movq mm1,[esi+eax*2] ; EL a
  pmaddwd mm2,mm2 ; DL D
  psubw mm1,[edi+eax*2] ; EL b
  pmaddwd mm4,mm4 ; 8L c
  paddusw mm3,PartSWDForLLBlk ; LL +
  pmaddwd mm1,mm1 ; EL c
  paddusw mm3,mm0 ; LL +
  psrlq mm5,48 ; mm5 is SWD for upper right block.
  paddusw mm3,mm2 ; LL +
  psubusw mm7,mm5 ; Diminish prev best SWD by cand UR blk.
  paddusw mm3,mm4 ; LL +
  movq mm0,mm7
  paddusw mm3,mm1 ; LL +
  psrlq mm7,32 ; Get original Best SWD
  punpckldq mm1,mm3
  pxor mm2,mm2
  paddusw mm1,mm3
  psrlq mm1,48 ; mm1 is SWD for lower left block.
  punpckldq mm5,mm1 ; mm5[32:63] SWD for LL. mm5[0:31] SWD for UR.
  psubusw mm0,mm1 ; mm0 = best diminished by all four block SWDs.
  psubusw mm7,mm0 ; BestSWD dim (BestSWD dim CandSWD) --> new best.
  pcmpeqd mm2,mm0 ; [0:31] == 0 iff cand better, else -1.
; Registers at this point: ; ebp -- PITCH ; edi -- Target MacroBlock Base Address. ; esi -- Address of upper left block of candidate ref area. ; edx -- MBlockActionStream ; ebx -- CurrSWDState ; mm7 -- New best SWD for macroblock. ; mm6 -- [0:31] SWD for upper left; [32:63] SWD for lower right. ; mm5 -- [0:31] SWD for upper right; [32:63] SWD for lower left. ; mm2 -- [0:31] 0 if cand better, else -1.
; Dispatch on the state that requested this SWD:
;   ebx >  LASTINITIALMESTATE -- a non-zero MV from the state engine.
;   ebx >= 0 otherwise        -- zero MV, non-heuristic ME.
;   ebx <  0                  -- heuristic ME (falls through).

  cmp ebx,LASTINITIALMESTATE ; Did we just do zero motion vector?
  jg MEForNonZeroMVDone

  movdf eax,mm7 ; SWD for this candidate.
  punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
  test ebx,ebx
  jns ZeroMVDoneForNonHeuristicME
HeuristicME_EarlyOut:

; Heuristic ME dispatch.  bl says which candidate was just evaluated:
;   bl <= -3 -- handled at HeuristicME_CaseSigMVDone_or_CaseAboveMVDone
;   bl == -2 -- left-neighbour MV (it is set to -2 before that candidate)
;   bl == -1 -- zero MV (falls through to HeuristicME_Case0MVDone)

  movq mm0,EMVLimitsForThisMB ; Speculate no extended motion vectors.
  pcmpeqb mm1,mm1 ; <FFFF FFFF FFFF FFFF>
  xor ecx,ecx
  cmp bl,-3
  mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
  jle HeuristicME_CaseSigMVDone_or_CaseAboveMVDone

  sub eax,NONZEROMVDIFFERENTIAL
  inc bl ; Zero iff bl was -1; the mov below keeps the flags.
  mov ebx,DoExtendedMotionVectors ; 7 iff doing extended MVs, else 0.
  jne HeuristicME_CaseLeftMVDone

HeuristicME_Case0MVDone:

; eax = 0-MV SWD minus NONZEROMVDIFFERENTIAL.

  movq SWDULandLR,mm6
  pcmpeqb mm4,mm4 ; <FFFF FFFF FFFF FFFF>
  movq SWDURandLL,mm5
  psllw mm4,15 ; <8000 8000 8000 8000>
  cmp eax,ZEROVECTORTHRESHOLD-NONZEROMVDIFFERENTIAL
  ; ; Compare 0-MV against ZeroVectorThreshold.
  jl BelowZeroThresh ; Jump if 0-MV is good enough.

  mov SWDForNon0MVToBeat,eax
  and ebx,ecx ; Elim flag for bottom row. 0 iff no ExtMV.
  mov eax,BlockAbove[4]
  je NotExtendedMVs ; Jump if not doing extended MVs?  (Flags from the "and".)
; Extended motion vectors: form the median of the neighbouring macroblocks'
; full-pel MVs and turn it into this macroblock's search limits.
;   ebx -- MBEdgeType & 7          ecx -- MBEdgeType
;   eax -- BlockAbove[4]           mm4 -- <8000 8000 8000 8000>
;
; Below: A==left; B==above; C==above rt.
  movdt mm3,ValidRemoteVectors[ebx*4] ; <mask(A) (C) (B) (A)>
  movq mm2,mm4 ; <8000 8000 8000 8000>

IF SIZEOF T_MacroBlockActionDescr-128
**** error: Due to assembler weakness, can't use spaces here, so SIZEOF
**** T_MacroBlockActionDescr is replaced by constant. If assembly error
**** occurs, the constant has been changed, and the three instructions in
**** the next 10 lines have to change.
ENDIF
IF SIZEOF T_Blk-16
**** error: Due to assembler weakness, can't use spaces here, so SIZEOF T_Blk
**** is replaced by constant. If assembly error occurs, the constant has been
**** changed, and the three instructions in the next 10 lines have to change.
ENDIF
  movdt mm0,[edx-128].BestFullPelMBMVs ; <x x Av,h x >
  punpcklbw mm3,mm3 ; mask for both MV parts
  movdt mm1,[edx+eax-2*16+128].BestFullPelMBMVs ; <x x Cv,h x >
  psrlw mm2,8 ; <0080 0080 0080 0080>
  por mm4,mm2 ; <8080 ...> bias value.
  punpcklwd mm1,mm0 ; <Av,h Cv,h x x >
  punpcklwd mm0,[edx+eax-2*16].BestFullPelMBMVs ; <Bv,h Av,h x x >
  ;
  punpckhdq mm0,mm1 ; <Av,h Cv,h Bv,h Av,h>
  ;
  pand mm0,mm3 ; Set to 0 any off edge.
  and ebx,4 ; If zero, we're on the top edge.
  paddb mm0,mm4 ; <Av,h Cv,h Bv,h Av,h> biased
  je @f ; If on top edge, cause LEFT to be taken.

; Bytewise median of three: XOR of the three pairwise maxima.  The two
; largest maxima are equal and cancel, leaving the middle value.
  movq mm1,mm0 ; <Av,h Cv,h Bv,h Av,h>
  psrlq mm0,16 ; <x Av,h Cv,h Bv,h>
  psubusb mm0,mm1 ; <x floor(A-C) floor(C-B) floor(B-A)>
  ;
  paddb mm0,mm1 ; <x max(A,C) max(C,B) max(B,A)>
  ;
  movq mm1,mm0 ; <x max(A,C) max(C,B) max(B,A)>
  psrlq mm0,16 ; <x x max(A,C) max(C,B)>
  pxor mm1,mm0 ; Part of median calc.
  psrlq mm0,16 ; <x x x max(A,C)>
  pxor mm0,mm1 ; <x x x median(A,B,C)> biased by +128.
  ;

@@:

; Replicate the predictor (HMV to bytes 0..3, VMV to bytes 4..7), then clamp
; with three byte-wise steps: saturating subtract, saturating add, and a
; wrapping subtract that restores the range.  The tables start at entry 5,
; hence the -40 in the index.
  punpcklbw mm0,mm0 ; 2 copies of median predictor MVs.
  pcmpeqb mm1,mm1
  punpcklwd mm0,mm0 ; 4 copies. Will now calc the following:
  ; ; [ 0: 7] -- HMV lower limit for sig search
  ; ; [ 8:15] -- HMV lower limit
  ; ; [16:23] -- HMV upper limit for sig search
  ; ; [24:31] -- HMV upper limit
  ; ; [32:39] -- VMV lower limit for sig search
  ; ; [40:47] -- VMV lower limit
  ; ; [48:55] -- VMV upper limit for sig search
  ; ; [56:63] -- VMV upper limit
  ;
  psubusb mm0,EMV_ClampLowerEnd[ecx*8-40]
  psllw mm1,3 ; <FF F8 FF F8 FF F8 FF F8> i.e. Mask to
  ; ; set sig srch range to mult of 8.
  paddusb mm0,EMV_ClampUpperEnd[ecx*8-40]

  psubb mm0,EMV_RestoreRange[ecx*8-40]
NotExtendedMVs:
movq SWD0MVURandLL,mm5 pand mm0,mm1 ; Set sig search at multiples of four. movq SWD0MVULandLR,mm6 pcmpeqb mm2,mm2 ; Set cand as worse than 0MV, in case skip. movq EMVLimitsForThisMB,mm0 and cl,1 je HeuristicME_SkipLeftMV
mov BestOfFourStartingPoints,esi mov ebx,-2 ; Indicate trying MV of MB to left. movsx ecx,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBVMV movsx eax,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBHMV
; Clamp a heuristic candidate MV to the limits held in EMVLimitsForThisMB and
; convert it to a reference address.
;   In:   ecx -- candidate VMV, eax -- candidate HMV (both sign extended).
;         ebx -- negative code naming the candidate (-2 left, -3 above,
;                -4 best signature); not touched here.
;   Out:  esi -- Addr0MVRef + linearized MV, then jump to ComputeMBSWD.
;         If the clamped linearized MV is zero, jump to MblkEst_EarlyOut
;         instead with eax == -1 (the "candidate not better" value).
; The linearization is VMV*3*64 + HMV/2, which is only right for PITCH == 384
; (enforced by the IF below).
ClampHeurMECandidateToRange:
movsx esi,PB EMVLimitsForThisMB+5 ; VMV lower limit. cmp ecx,esi jl ClampVMV_1
movsx esi,PB EMVLimitsForThisMB+7 ; VMV upper limit. cmp ecx,esi jle @f
ClampVMV_1:
mov ecx,esi
@@:
movsx esi,PB EMVLimitsForThisMB+1 ; HMV lower limit. cmp eax,esi jl ClampHMV_1
movsx esi,PB EMVLimitsForThisMB+3 ; HMV upper limit. cmp eax,esi jle @f
ClampHMV_1:
mov eax,esi
@@:
sar eax,1 lea ecx,[ecx+ecx*2] IF PITCH-384 *** error: The magic here assumes a pitch of 384. ENDIF shl ecx,6 mov esi,Addr0MVRef add eax,ecx ; Clamped Linearized Motion Vector ; sub eax,1 jc MblkEst_EarlyOut ; Jump if Lin MV is zero.
lea esi,[esi+eax+1] ; Candidate reference address. jmp ComputeMBSWD
; Bookkeeping around the "left neighbour" heuristic candidate.
; HeuristicME_SkipLeftMV: the left candidate was not tried; seed
; BestOfFourStartingPoints with esi and reload cl from MBEdgeType.
; HeuristicME_CaseLeftMVDone: eax becomes 0 if the candidate was better and -1
; otherwise, so the indexed stores either overwrite SWDULandLR / SWDURandLL /
; BestOfFourStartingPoints or land one element below them and are discarded.
; The MV of the macroblock above (ebx = -3) is then clamped and tried, unless
; bit 2 (value 4, "top") of cl is clear, in which case it is skipped.
; NOTE(review): code that jumps directly to HeuristicME_CaseLeftMVDone must
; supply cl itself -- confirm at the jump site.
HeuristicME_SkipLeftMV:
mov BestOfFourStartingPoints,esi mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
HeuristicME_CaseLeftMVDone:
movdf eax,mm2 ; eax == 0 iff cand better, else -1. mov ebx,BlockAbove[4] and cl,4 movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss). punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63]. movq SWDURandLL[eax*8],mm5 pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip. mov BestOfFourStartingPoints[eax*4],esi je HeuristicME_SkipAboveMV
movsx ecx,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBVMV movsx eax,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBHMV mov ebx,-3 ; Indicate trying MV of MB above. jmp ClampHeurMECandidateToRange
; Shared return point for the "above" and "best signature" candidates.
; eax <- 0 if the candidate was better, else -1 (from mm2). The jne tests flags
; left by whoever jumped here and diverts to HeuristicME_CaseSigMVDone; the
; fall-through (always taken when arriving via "je HeuristicME_SkipAboveMV")
; commits the candidate conditionally, stashes the best SWD in
; BestMBFullPelSWD, and builds the target macroblock's signature contributions.
; The loop at @@ runs 4 times (cl = 4). Each pass consumes four target lines
; (edi advances by ebp*4), adds even-column sums into mm0 / mm1, and stores one
; dword of odd-column row-pair sums at [esi], advancing esi by 4.
; edx is parked in MBlockActionStream inside the loop; it is reloaded from
; there after the signature search.
HeuristicME_CaseSigMVDone_or_CaseAboveMVDone: HeuristicME_SkipAboveMV:
movdf eax,mm2 ; eax == 0 iff cand better, else -1. jne HeuristicME_CaseSigMVDone
HeuristicME_CaseAboveMVDone:
mov cl,4 lea ebx,C0001000100010001 movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss). pxor mm0,mm0 movq SWDURandLL[eax*8],mm5 pxor mm1,mm1 mov BestOfFourStartingPoints[eax*4],esi lea esi,TargetSigContribForRowPairs movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV. pcmpeqb mm7,mm7 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
; ebp -- Pitch ; edi -- Address of target macroblock. ; esi -- Address at which to store target macroblock's signature contributions. ; cl -- Loop counter. ; mm0 -- Accumulator for target MB's sig contrib for first four even columns. ; mm1 -- Accumulator for target MB's sig contrib for last four even columns.
movq mm2,[edi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00> pcmpeqb mm5,mm5 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF> paddb mm2,[edi+ebp*1] ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...> psrlw mm5,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
@@:
movq mm3,[edi+ebp*2] ; B:<P27 P26 P25 P24 P23 P22 P21 P20> movq mm4,mm2 ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...> paddb mm3,[edi+PITCH*3] ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...> psrlw mm2,8 ; W:<P07+P17 P05+P15 P03+P13 P01+P11> pmaddwd mm2,[ebx] ; D:<P07+P17+P05+P15 P03+P13+P01+P11> movq mm7,mm5 ; W:<0x00FF 0x00FF 0x00FF 0x00FF> pand mm5,mm3 ; W:<P26+P36 P24+P34 P22+P32 P20+P30> psrlw mm3,8 ; W:<P27+P37 P25+P35 P23+P33 P21+P31> pmaddwd mm3,[ebx] ; D:<P27+P37+P25+P35 P23+P33+P21+P31> paddw mm0,mm5 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)> movq mm5,[edi+ebp*2+8] ; B:<P2F P2E P2D P2C P2B P2A P29 P28> pand mm4,mm7 ; W:<P06+P16 P04+P14 P02+P12 P00+P10> paddb mm5,[edi+PITCH*3+8] ; B:<P2F+P3F P2E+P3E P2D+P3D P2C+P3C ...> paddw mm0,mm4 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)> movq mm4,[edi+8] ; B:<P0F P0E P0D P0C P0B P0A P09 P08> movq mm6,mm7 ; W:<0x00FF 0x00FF 0x00FF 0x00FF> paddb mm4,[edi+ebp*1+8] ; B:<P0F+P1F P0E+P1E P0D+P1D P0C+P1C ...> pand mm7,mm5 ; W:<P2E+P3E P2C+P3C P2A+P3A P28+P38> pand mm6,mm4 ; W:<P0E+P1E P0C+P1C P0A+P1A P08+P18> psrlw mm5,8 ; W:<P2F+P3F P2D+P3D P2B+P3B P29+P39> pmaddwd mm5,[ebx] ; D:<P2F+P3F+P2D+P3D P2B+P3B+P29+P39> psrlw mm4,8 ; W:<P0F+P1F P0D+P1D P0B+P1B P09+P19> pmaddwd mm4,[ebx] ; D:<P0F+P1F+P0D+P1D P0B+P1B+P09+P19> paddw mm1,mm7 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)> paddw mm1,mm6 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)> lea edi,[edi+ebp*4] ; Advance input cursor paddw mm3,mm5 ; D:<P2F+P3F+P2D+P3D+P27+P37+P25+P35 ; ; P2B+P3B+P29+P39+P23+P33+P21+P31> pcmpeqb mm5,mm5 ; Next W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF> paddw mm4,mm2 ; D:<P0F+P1F+P0D+P1D+P07+P17+P05+P15 ; ; P0B+P1B+P09+P19+P03+P13+P01+P11> punpckldq mm7,mm3 ; D:<P0B+P1B+P09+P19+P03+P13+P01+P11 junk> paddw mm7,mm3 ; [32:47]:<sum of odd pels of lines 0 and 1> punpckldq mm6,mm4 ; W:<P2B+P3B+P29+P39+P23+P33+P21+P31 junk> movq mm2,[edi] ; Next B:<P07 P06 P05 P04 P03 P02 P01 P00> paddw mm6,mm4 ; [32:47]:<sum of odd pels of lines 2 and 3> paddb mm2,[edi+ebp*1] ; Next B:<P07+P17 P06+P16 
P05+P15 ...> punpckhwd mm6,mm7 ; [0:31] W:<Line_0&1_odd Line_2&3_odd> mov MBlockActionStream,edx dec cl movdf [esi],mm6 ; Save W:<Line_0&1_odd Line_2&3_odd> psrlw mm5,8 ; Next W:<0x00FF 0x00FF 0x00FF 0x00FF> lea esi,[esi+4] ; Advance output cursor jne @b
; Signature search. Walks a grid of candidate positions bounded by the
; "sig search" limit bytes (0, 2, 4, 6) of EMVLimitsForThisMB. For each
; position it sums the squared differences between the candidate's signature
; contributions and the target's (mm0, mm1, [esi-16] and [esi-8]).
; mm3 carries the smallest total seen so far together with the address it was
; found at; it starts at 3FFFFFFFh so the first candidate always wins.
; al counts signatures remaining in the current line (reloaded from ah), ecx
; counts remaining lines, and the loop ends when the sbb drives ecx negative.
; Afterwards the winning address is turned back into a full pel MV (ecx = VMV
; through UnlinearizedVertMV, eax = HMV) and handed to
; ClampHeurMECandidateToRange with ebx = -4.
; ebp -- Pitch ; edi -- Address of candidate reference MB's signature contribs. ; esi -- Address at which target MB's signature contribs were stored, plus 16. ; edx -- Scratch. ; ecx -- Count down number of lines of signatures to try. ; ebx -- Increment to get from end of one line of signatures to start of next. ; al -- Count down number of signatures to try in a line. ; ah -- Reinits counter of signatures to try in a line. ; mm0 -- Target MB's sig contrib for first four even columns. ; mm1 -- Target MB's sig contrib for last four even columns. ; mm2 -- Target MB's sig contrib for first four pairs of rows, odd columns. ; mm3 -- Amount and address of best signature seen so far.
IF PITCH-384 *** error: The magic here assumes a pitch of 384. ENDIF xor eax,eax mov ecx,TargetToSig_Debiased mov al,EMVLimitsForThisMB+4 ; Lower vert lim for sig srch (half pels) xor ebx,ebx add edi,ecx mov bl,EMVLimitsForThisMB+0 ; Lower horz lim for sig srch (half pels) shr ebx,1 lea ecx,[eax+eax*2] shl ecx,6 add edi,ebx add edi,ecx xor ecx,ecx add ebx,ebx mov cl,EMVLimitsForThisMB+6 ; Upper vert lim for sig srch (half pels) sub ecx,eax mov al,EMVLimitsForThisMB+2 ; Upper horz lim for sig srch (half pels) shr ecx,3 ; Number of lines of sigs to do, minus 1. sub eax,ebx shr eax,3 ; Number of columns of sigs to do. lea ebx,[ebp-1+080000000H] sub ebx,eax ; 1/4th amt to add to move to next line. mov ah,al inc ah ; To reinit cntr for line. movq mm2,[esi-16] pcmpeqd mm3,mm3 ; Set winning signature artificially high. movdt mm4,[edi] psrld mm3,2 punpckldq mm4,[edi+4] ; ref sig contribs of left even cols.
TryNextSignature:
movdt mm5,[edi+8] psubw mm4,mm0 ; diffs for sums of left even columns. punpckldq mm5,[edi+12] ; ref sig contribs of right even cols. pmaddwd mm4,mm4 ; Squared differences. movdt mm6,[edi+ebp*2] ; Sums for first two pairs of rows. psubw mm5,mm1 ; diffs for sums of right even columns. punpckldq mm6,[edi+PITCH*6] ; Sums for second two pairs of rows. pmaddwd mm5,mm5 ; Squared differences. movdt mm7,[edi+PITCH*10] ; Sums for third two pairs of rows. psubw mm6,mm2 ; Words: diffs for sums of first 4 pairs rows. punpckldq mm7,[edi+PITCH*14] ; Sums for last two pairs of rows. pmaddwd mm6,mm6 ; Squared differences. psubw mm7,[esi-8] ; Words: diffs for sums of last 4 pairs rows. paddd mm4,mm5 ; Accumulate squared differences. sub al,1 ; Decrement count of signatures left in this line. pmaddwd mm7,mm7 ; Squared differences. sbb edx,edx ; -1 if done with line, else 0. paddd mm6,mm4 ; Accumulate squared differences. and edx,ebx ; 1/4 Amt to sub to goto next line, else 0. paddd mm7,mm6 ; Accumulate squared differences. movdt mm5,edi ; Address of this signature punpckldq mm6,mm7 ; <low_order_accumulator junk> paddd mm7,mm6 ; <full_signature_amt junk> psllq mm5,32 ; <Addr_of_this_signature 0> lea edi,[edi+edx*4+4] ; advance signature position to next cand. punpckhdq mm5,mm7 ; <cand_signature_amt cand_signature_addr> sar edx,31 ; -1 if done with line, else 0. pcmpgtd mm7,mm3 ; <0xFFFFFFFF if cand not better junk> movdt mm4,[edi] punpckhdq mm7,mm7 ; <0xFFFFFFFFFFFFFFFF if cand not better> punpckldq mm4,[edi+4] pand mm3,mm7 ; 1st_best if cand not better, else 0. and dl,ah ; Num cols in a line if done with line, else 0. pandn mm7,mm5 ; cand if better than 1st_best, else 0. add al,dl ; Reinit col count if finishing with line. por mm3,mm7 ; Better of cand and 1st_best. sbb ecx,0 ; Decrement line count if just finished line. jge TryNextSignature
movdf ecx,mm3 ; Fetch address of best signature. pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip. mov edi,TargetMacroBlockBaseAddr mov ebx,-4 ; Indicate trying MV of best signature. sub ecx,edi mov eax,SigToTarget movdt mm7,BestMBFullPelSWD ; Reload SWD for best full pel MB MV. lea esi,[ecx+eax] ; Linearized motion vector add eax,ecx ; Linearized motion vector sar esi,8 ; Full pel vert lin offset div 256. mov edx,MBlockActionStream ; Reload pointer to MBA descriptor. shl eax,25 punpckldq mm7,mm7 movsx ecx,UnlinearizedVertMV[esi] ; Get full pel vert MV component. sar eax,24 ; Full pel HMV. jmp ClampHeurMECandidateToRange
; Heuristic ME: every starting candidate has been tried. Commit the last one
; conditionally (eax == 0 iff it was better, else -1), then restart the
; state-engine search from the best of them, kept at least 4 half pels inside
; each of the limits in EMVLimitsForThisMB:
;   - HMV and VMV both inside the margin: search around esi+PITCH+1 in state
;     FIRST_HEURISTIC_EXHAUSTIVE, keeping the best SWD found so far in mm7.
;   - otherwise: clamp the offending component(s), rebuild esi from
;     Addr0MVRef, reset the best SWD to "huge" (mm7 <- mm0) and use state
;     FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR.
;
; Registers while clamping:
;   al -- HMV of best starting point, signed, half pel units.
;   bh -- HMV limit being tested (lower+4 or upper-4), signed.
;   bl / ah -- VMV limit being tested (lower+4 or upper-4), signed.
;   ecx -- linearized MV, then the signed VMV from UnlinearizedVertMV.
;
; The HMV is a signed byte (it is compared with jl/jle), so it has to be SIGN
; extended before "sar eax,1" converts it to a full pel column offset. Zero
; extending it turns a negative HMV into an offset 128 columns to the right of
; the intended reference, while the MV later recovered from the low 7 bits of
; that address still reads as the intended one.
HeuristicME_CaseSigMVDone:
HeuristicME_SkipSigMV:

  movdf   eax,mm2                     ; eax == 0 iff cand better, else -1.
  pcmpeqd mm0,mm0                     ; Init previous best SWD to huge.
  mov     ecx,Addr0MVRef              ; Start to calc linearized MV.
  mov     bh,EMVLimitsForThisMB+1     ; HMV lower limit.
  mov     BestOfFourStartingPoints[eax*4],esi
  add     bh,4
  movq    SWDULandLR[eax*8],mm6       ; Save blk SWDs if better (else toss).
  psrlq   mm0,2
  movq    SWDURandLL[eax*8],mm5
  psrld   mm0,14
  mov     eax,BestOfFourStartingPoints
  mov     bl,EMVLimitsForThisMB+5     ; VMV lower limit.
  mov     esi,eax
  sub     eax,ecx                     ; Linearized motion vector
  mov     ecx,eax                     ; Linearized motion vector
  add     al,al                       ; Full pel HMV.
  cmp     al,bh
  jl      ClampHMV_2

  mov     bh,EMVLimitsForThisMB+3     ; HMV upper limit
  sub     bh,4
  cmp     al,bh
  jle     NoClampHMV_2

ClampHMV_2:

  sar     ecx,8                       ; Full pel vert lin offset div 256.
  add     bl,4
  movsx   eax,bh                      ; Clamped HMV; signed (was movzx).
  movsx   ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
  cmp     cl,bl
  jl      @f

  mov     bl,EMVLimitsForThisMB+7     ; VMV upper limit.
  movq    mm7,mm0
  sub     bl,4
  cmp     cl,bl
  jle     NoClampVMV_2

@@:

  movsx   ecx,bl
  movq    mm7,mm0

NoClampVMV_2:

  sar     eax,1
  lea     ecx,[ecx+ecx*2]
  shl     ecx,6
  mov     ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
  mov     esi,Addr0MVRef
  add     eax,ecx                     ; Linearized motion vector.
  add     esi,eax
  jmp     ComputeMBSWD

NoClampHMV_2:

  sar     ecx,8                       ; Full pel vert lin offset div 256.
  add     bl,4
  mov     ah,bl
  movsx   ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
  cmp     cl,ah
  jl      @f

  mov     ah,EMVLimitsForThisMB+7     ; VMV upper limit.
  lea     esi,[esi+ebp+1]
  sub     ah,4
  mov     ebx,FIRST_HEURISTIC_EXHAUSTIVE ; New state number.
  cmp     cl,ah
  jle     ComputeMBSWD

@@:

  movsx   ecx,ah                      ; Clamped VMV.
  movsx   eax,al                      ; Unclamped HMV; signed (was movzx).
  sar     eax,1
  lea     ecx,[ecx+ecx*2]
  shl     ecx,6
  mov     ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
  mov     esi,Addr0MVRef
  add     eax,ecx                     ; Linearized motion vector.
  add     esi,eax
  movq    mm7,mm0
  jmp     ComputeMBSWD
; Non-heuristic ME: the 0-MV candidate has been scored (SWD in eax).
; Saves the block SWDs, exits via BelowZeroThresh if eax < ZEROVECTORTHRESHOLD,
; otherwise records eax-NONZEROMVDIFFERENTIAL as SWDForNon0MVToBeat, snapshots
; the 0-MV block SWDs and starts the state-engine search. The first MV
; adjustment comes from StateEngineFirstRule[ebx] and the next state from
; StateEngineFirstRule[ebx+10]; the first candidate address is
; esi + adjustment*2048 - PITCH*8.
ZeroMVDoneForNonHeuristicME:
movq SWDULandLR,mm6 movq SWDURandLL,mm5 cmp eax,ZEROVECTORTHRESHOLD ; Compare 0-MV against ZeroVectorThreshold. jl BelowZeroThresh ; Jump if 0-MV is good enough.
xor ecx,ecx sub eax,NONZEROMVDIFFERENTIAL mov cl,StateEngineFirstRule[ebx] ; MV adjustment. mov bl,StateEngineFirstRule[ebx+10] ; New state number. shl ecx,11 mov SWDForNon0MVToBeat,eax movq SWD0MVULandLR,mm6 movq SWD0MVURandLL,mm5 lea esi,[esi+ecx-PITCH*8] jmp ComputeMBSWD
; Return point after each full pel macroblock candidate has been scored.
; eax is 0 if the candidate beat the previous best and -1 otherwise; it selects
; the StateEngine entry for the current state ebx and makes the SWD stores
; conditional. A negative ebx marks a heuristic candidate and leaves through
; HeuristicME_EarlyOut. Otherwise esi is moved by the entry's MV adjustment and
; bl becomes the next state; states below 240 loop back to ComputeMBSWD.
; At a terminal state (bl >= 240) the full pel MV is recovered from
; esi - Addr0MVRef into cl (HMV) and ch (VMV). In the non-H261 build, half pel
; ME is skipped when DoHalfPelME is zero or when the MV is on or outside any of
; the four limits in EMVLimitsForThisMB.
MEForNonZeroMVDone:
movdf eax,mm2 ; eax == 0 iff cand better, else -1.
MblkEst_EarlyOut:
xor ecx,ecx test ebx,ebx movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss). pcmpeqb mm2,mm2 ; Set cand as worse than 0MV. mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment. js HeuristicME_EarlyOut
add esi,ecx ; Adjust ref addr for horz motion. mov bl,StateEngine[eax+ebx*4+3] ; 0:239 -> New state number; ; ; 240:255 -> flags which 1/2 pel to do. shr ecx,4 punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63]. movq SWDURandLL[eax*8],mm5 pxor mm6,mm6 ; Speculatively zero to prep for half pel ME. add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV. cmp bl,240 ; Terminal state? jb ComputeMBSWD
mov eax,esi mov ecx,Addr0MVRef ; Start to calc linearized MV. sub eax,ecx ; Linearized Motion Vector ; mov ecx,eax ; sar eax,8 ; Full pel vert lin offset div 256. and cl,07FH ; Full pel HMV add cl,cl ; mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component. IFDEF H261 ELSE mov eax,DoHalfPelME ; 0 if not, -4 if so. test eax,eax je SkipHalfPelMBME
cmp cl,EMVLimitsForThisMB+1 ; Skip half pel ME if at edge of range jle SkipHalfPelMBME
cmp cl,EMVLimitsForThisMB+3 jge SkipHalfPelMBME
cmp ch,EMVLimitsForThisMB+5 jle SkipHalfPelMBME
cmp ch,EMVLimitsForThisMB+7 jge SkipHalfPelMBME
; Half pel refinement around the best full pel macroblock MV.
; HalfPelMotionEstimation is called with mm7 cleared. Afterwards the four
; neighbour SWDs (mm3, inflated through InvalidateBadHalfPelMVs for positions
; that leave the frame) are split into left/right (eax/ebx) and, via mm6,
; up/down. The full pel MV is saved in BestFullPelMBHMV / BestFullPelMBVMV.
; This span handles the case where left is no worse than right:
;   - left not better than BestMBFullPelSWD -> MBME_CtrIsBestHMV;
;   - otherwise the better vertical neighbour is tested against
;     BestMBFullPelSWD and, if it also wins, the diagonal is measured with
;     HalfPelMotionEstimationBothWays (al = 4); the smallest of diagonal,
;     vertical and horizontal is selected.
; Every exit to MBME_HalfPelSearchDone carries ebx = winning SWD (for the
; diagonal cases, the value left in ebx by HalfPelMotionEstimationBothWays),
; eax = reference address, cl/ch = half pel HMV/VMV.
; Registers: ; ebp -- PITCH ; esi -- Address of best full pel reference macroblock ; edx -- MBlockActionStream ; ecx -- Nothing presently. ; edi -- Address of target macroblock. ; ebx -- 240 + Flags to indicate which half pel ME to do: ; 1 --> right; 2 --> left; 4 --> down; 8 --> up ; eax -- Count from -4 to -1 for blocks of macroblock. ; mm0:mm7 -- Scratch
movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV. pxor mm7,mm7 ; Prep accumulator for half pel ME.
call HalfPelMotionEstimation
movdt mm7,InvalidateBadHalfPelMVs[eax*4] ; Need to inflate SWDs for ; ; MVs that go off frame edge. mov eax,esi mov ebx,Addr0MVRef ; Start to calc linearized MV. sub eax,ebx ; Linearized Motion Vector punpcklbw mm7,mm7 ; Expand adjustment to words. mov ecx,eax ; Linearized Motion Vector paddusw mm7,mm3 ; Now have SWDs for half pel MBME. sar eax,8 ; Full pel vert lin offset div 256. and cl,07FH ; Full pel HMV add cl,cl movq mm6,mm7 mov [edx].BestFullPelMBHMV,cl ; Save HMV mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component. movdf eax,mm7 ; eax[ 0:15] -- SWD for rightward ref. ; ; eax[16:31] -- SWD for leftward ref. psrlq mm6,32 mov [edx].BestFullPelMBVMV,ch ; Save VMV mov ebx,eax shr eax,16 ; eax -- SWD for leftward ref. and ebx,00000FFFFH ; ebx -- SWD for rightward ref. cmp eax,ebx jg MBME_RightBetterThanLeft
MBME_LeftBetterThanRight:
cmp eax,BestMBFullPelSWD jge MBME_CtrIsBestHMV
MBME_LeftBestHMV:
movdf ebx,mm6 ; ebx[ 0:15] -- SWD for downward ref. ; ; ebx[16:31] -- SWD for upward ref. mov BestHalfPelHorzSWD,eax mov eax,ebx shr eax,16 ; eax -- SWD for upward ref. and ebx,00000FFFFH ; ebx -- SWD for downward ref. cmp eax,ebx jg MBME_LeftBestHMV_DownBetterThanUp
MBME_LeftBestHMV_UpBetterThanDown:
cmp eax,BestMBFullPelSWD jge MBME_LeftIsBest
MBME_LeftBestHMV_UpBestVMV:
sub esi,PITCH+1 ; Try ref 1/2 pel left and up mov BestHalfPelVertSWD,eax mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD lea esi,[esi+ebp*1+1] ; Back to center. cmp eax,ebx jle MBME_UpBetterThanUpLeft
MBME_UpLeftBetterThanUp:
cmp ebx,BestHalfPelHorzSWD jge MBME_LeftIsBest
MBME_UpLeftIsBest:
dec cl ; Back up the horz MV one to the left. lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up dec ch ; Back up the vert MV one up. jmp MBME_HalfPelSearchDone
MBME_UpBetterThanUpLeft:
cmp eax,BestHalfPelHorzSWD jg MBME_LeftIsBest
MBME_UpIsBest:
mov ebx,eax dec ch ; Back up the vert MV one up. lea eax,[esi-PITCH] ; Best is ref 1/2 pel up jmp MBME_HalfPelSearchDone
MBME_LeftBestHMV_DownBetterThanUp:
cmp ebx,BestMBFullPelSWD jge MBME_LeftIsBest
MBME_LeftBestHMV_DownBestVMV:
dec esi ; Try ref 1/2 pel left and down mov BestHalfPelVertSWD,ebx mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD inc esi ; Back to center. cmp eax,ebx jle MBME_DownBetterThanDownLeft
MBME_DownLeftBetterThanDown:
cmp ebx,BestHalfPelHorzSWD jge MBME_LeftIsBest
MBME_DownLeftIsBest:
dec cl ; Back up the horz MV one to the left. lea eax,[esi-1] ; Best is ref 1/2 pel left and down inc ch ; Advance the vert MV one down. jmp MBME_HalfPelSearchDone
MBME_DownBetterThanDownLeft:
cmp eax,BestHalfPelHorzSWD jle MBME_DownIsBest
MBME_LeftIsBest:
dec cl ; Back up the horz MV one to the left. lea eax,[esi-1] ; Best is ref 1/2 pel left. mov ebx,BestHalfPelHorzSWD jmp MBME_HalfPelSearchDone
; Macroblock half pel decision tree, right half: entered when the leftward
; SWD (eax) is strictly greater than the rightward SWD (ebx).
;   - right not better than BestMBFullPelSWD -> MBME_CtrIsBestHMV;
;   - otherwise the better vertical neighbour (from mm6) is tested against
;     BestMBFullPelSWD and, if it also wins, the diagonal is measured with
;     HalfPelMotionEstimationBothWays (al = 4); the smallest of diagonal,
;     vertical and horizontal is selected.
; The right, down and down-right results all report eax = esi as their
; reference address, so the down-right probe leaves esi alone and the
; up-right probe only moves it up one line (sub esi,ebp).
; Exits to MBME_HalfPelSearchDone carry ebx = SWD, eax = reference address,
; cl/ch = half pel HMV/VMV.
MBME_RightBetterThanLeft:
cmp ebx,BestMBFullPelSWD jge MBME_CtrIsBestHMV
MBME_RightBestHMV:
movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref. ; ; eax[16:31] -- SWD for upward ref. mov BestHalfPelHorzSWD,ebx mov ebx,eax shr eax,16 ; eax -- SWD for upward ref. and ebx,00000FFFFH ; ebx -- SWD for downward ref. cmp eax,ebx jg MBME_RightBestHMV_DownBetterThanUp
MBME_RightBestHMV_UpBetterThanDown:
cmp eax,BestMBFullPelSWD jge MBME_RightIsBest
MBME_RightBestHMV_UpBestVMV:
sub esi,ebp ; Try ref 1/2 pel right and up mov BestHalfPelVertSWD,eax mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD lea esi,[esi+ebp*1] ; Back to center. cmp eax,ebx jle MBME_UpBetterThanUpRight
MBME_UpRightBetterThanUp:
cmp ebx,BestHalfPelHorzSWD jge MBME_RightIsBest
MBME_UpRightIsBest:
inc cl ; Advance the horz MV one to right. lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up dec ch ; Back up the vert MV one up. jmp MBME_HalfPelSearchDone
MBME_UpBetterThanUpRight:
cmp eax,BestHalfPelHorzSWD jle MBME_UpIsBest
MBME_RightIsBest:
mov ebx,BestHalfPelHorzSWD inc cl ; Advance the horz MV one to right. mov eax,esi jmp MBME_HalfPelSearchDone
MBME_RightBestHMV_DownBetterThanUp:
cmp ebx,BestMBFullPelSWD jge MBME_RightIsBest
MBME_RightBestHMV_DownBestVMV:
mov BestHalfPelVertSWD,ebx mov al,4
call HalfPelMotionEstimationBothWays
mov eax,BestHalfPelVertSWD cmp eax,ebx jle MBME_DownBetterThanDownRight
MBME_DownRightBetterThanDown:
cmp ebx,BestHalfPelHorzSWD jge MBME_RightIsBest
MBME_DownRightIsBest:
inc cl ; Advance the horz MV one to right. mov eax,esi inc ch ; Advance vert MV one down. jmp MBME_HalfPelSearchDone
MBME_DownBetterThanDownRight:
cmp eax,BestHalfPelHorzSWD jg MBME_RightIsBest
MBME_DownIsBest:
mov ebx,eax inc ch ; Advance vert MV one down. mov eax,esi jmp MBME_HalfPelSearchDone
; Neither horizontal half pel neighbour beat the full pel SWD, so choose among
; up, down and centre. A tie between up and down goes to down (jge). The
; vertical winner replaces the centre only if it is strictly below
; BestMBFullPelSWD; otherwise control goes to MBME_CenterIsBest with
; ebx = BestMBFullPelSWD.
MBME_CtrIsBestHMV:
movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref. ; ; eax[16:31] -- SWD for upward ref. mov ebx,eax shr eax,16 ; eax -- SWD for upward ref. and ebx,00000FFFFH ; ebx -- SWD for downward ref. cmp eax,ebx jge MBME_CtrBestHMV_DownBetterThanUp
MBME_CtrBestHMV_UpBetterThanDown:
mov ebx,BestMBFullPelSWD cmp eax,ebx jge MBME_CenterIsBest
; Up is best.
mov ebx,eax dec ch ; Back up the vert MV one up. lea eax,[esi-PITCH] ; Best is ref 1/2 pel up jmp MBME_HalfPelSearchDone
MBME_CtrBestHMV_DownBetterThanUp:
mov eax,ebx mov ebx,BestMBFullPelSWD cmp eax,ebx jge MBME_CenterIsBest
; Down is best.
mov ebx,eax inc ch ; Advance the vert MV one down. mov eax,esi jmp MBME_HalfPelSearchDone
ENDIF
; Common tail of the macroblock-level search.
; SkipHalfPelMBME: no half pel search was done; store the full pel MV (cl/ch)
; and take the best full pel SWD from mm7 into ebx.
; MBME_CenterIsBest: the full pel position wins, so the reference is esi.
; MBME_HalfPelSearchDone records the winner: ebx -> BestMBHalfPelSWD,
; eax -> BestMBHalfPelRefAddr, cl/ch -> BestMBHalfPelMV (HMV, then VMV).
SkipHalfPelMBME:
mov [edx].BestFullPelMBHMV,cl ; Save HMV movdf ebx,mm7 ; SWD for best full pel MB MV. mov [edx].BestFullPelMBVMV,ch ; Save VMV
MBME_CenterIsBest:
mov eax,esi
MBME_HalfPelSearchDone:
mov BestMBHalfPelSWD,ebx mov BestMBHalfPelMV,cl ; Save HMV mov BestMBHalfPelRefAddr,eax mov BestMBHalfPelMV+1,ch ; Save VMV
; H263 only: decide whether to search for per-block motion vectors, and set
; that search up.
; Block ME is skipped (NoBlockMotionVectors) when DoBlockLevelVectors != 1,
; when bit 7 of CodedBlocks is set (forced intra), when the full pel MB MV is
; not at least 2 inside every limit in EMVLimitsForThisMB, when
; BestMBHalfPelSWD <= BLOCKMOTIONTHRESHOLD, or when the SWD budget computed
; below is <= 0.
; Setup: the per-block half pel neighbour SWDs are derived from
; HalfPelMBMESWDAccum by successive saturating subtraction and stored at
; BlkLvlSWD+16 of each luma block. esi is pointed one line below the block at
; the best full pel MB MV and saved as BestBlockRefAddrVP1. LimitForSWDForBlkMV
; becomes the smaller of (BestMBHalfPelSWD*3/4 - BLOCKMVDIFFERENTIAL) and
; (SWDForNon0MVToBeat*3 - 4*BLOCKMVDIFFERENTIAL)/4; the latter uses an unsigned
; shift.
IFDEF H261 ELSE ; H263 mov bl,EMVLimitsForThisMB+1 ; Lower limit comparison. mov al,DoBlockLevelVectors ; Are we doing block level MVs? dec al jne NoBlockMotionVectors
mov cl,[edx].CodedBlocks ; Fetch coded block pattern. add bl,2 and cl,080H jne NoBlockMotionVectors ; Skip Block ME if forced intra.
mov al,[edx].BestFullPelMBHMV ; Compare full pel HMV against limits. mov cl,EMVLimitsForThisMB+3 cmp al,bl jl NoBlockMotionVectors
mov bl,EMVLimitsForThisMB+5 sub cl,2 cmp al,cl ; Upper limit comparison. jg NoBlockMotionVectors
mov al,[edx].BestFullPelMBVMV ; Compare full pel VMV against limits. add bl,2 mov cl,EMVLimitsForThisMB+7 cmp al,bl mov ebx,PD [edx].BestFullPelMBVMV-3 jl NoBlockMotionVectors
sar ebx,18 sub cl,2 cmp al,cl ; Upper limit comparison. jg NoBlockMotionVectors
mov ecx,BestMBHalfPelSWD ; Jump if SWD for MB MV < thresh. IF PITCH-384 *** error: The magic here assumes a pitch of 384. ENDIF and ebx,0FFFFFF80H ; VMV*128 cmp ecx,BLOCKMOTIONTHRESHOLD jle NoBlockMotionVectors
;========================================================================== ; Starting from the best full pel macroblock motion vector calculated above, we ; search for the best block motion vectors. ; ; ebp -- PITCH ; esi -- Address of ref block. ; edi -- Address of target block. ; edx -- Induction variable over luma blocks in MBlockAction Descriptor. ; ecx -- Scratch ; ebx -- CurrSWDState ; eax -- Scratch ; mm7 -- Best SWD for current block ; mm6 -- unused. ; mm5 -- Best SWD for right block of pair worked on by inner loop. ; mm0-mm4 Scratch ;
movq mm0,HalfPelMBMESWDAccum+8 movq mm1,HalfPelMBMESWDAccum+16 psubusw mm7,mm0 movq mm2,HalfPelMBMESWDAccum+0 psubusw mm0,mm1 movq [edx].BlkY4.BlkLvlSWD+16,mm7 psubusw mm1,mm2 movq [edx].BlkY2.BlkLvlSWD+16,mm0 movq [edx].BlkY3.BlkLvlSWD+16,mm1 movq [edx].BlkY1.BlkLvlSWD+16,mm2
movsx eax,[edx].BestFullPelMBHMV sar eax,1 lea ebx,[ebx+ebx*2] mov esi,Addr0MVRef add ebx,ebp mov Addr0MVRefBlk,esi add esi,eax lea ecx,[ecx+ecx*2] ; Best MBMV SWD times 3. add esi,ebx ; Try V+1 first shr ecx,2 ; Best MBMV SWD * 3/4. mov eax,SWDForNon0MVToBeat mov BestBlockRefAddrVP1,esi ; Stash BestBlockRefAddr sub ecx,BLOCKMVDIFFERENTIAL ; Best MBMV SWD * 3/4 - Differential. lea eax,[eax+eax*2-BLOCKMVDIFFERENTIAL*4] ; Non0MBMVSWDToBeat*3-4*Diff. mov LimitForSWDForBlkMV,ecx shr eax,2 ; Non0MBMVSWDToBeat * 3/4. mov ebx,FIRSTBLOCKMESTATE cmp eax,ecx jg @f
mov LimitForSWDForBlkMV,eax mov ecx,eax
@@:
movdt mm5,SWDURandLL ; Get SWD for best MB level full pel MVs, blk 2. test ecx,ecx jle NoBlockMotionVectors movdt mm7,SWDULandLR ; Get SWD for best MB level full pel MVs, blk 1. movdf SWDForBlock2Or4,mm5
; Scores the 8x8 candidate at esi against the target block at edi.
; Lines 1, 3, 5 and 7 are shifted left by 8 before pmaddwd (commented as the
; even pels); lines 0, 2, 4 and 6 are squared without the shift. The total is
; reduced to mm1[0:15].
; mm7 (best so far) becomes min(best, candidate), and eax becomes 0 when the
; candidate is strictly better, -1 otherwise.
; The StateEngine entry selected by eax and ebx then supplies the next MV
; adjustment and state. States below 240 loop back to ComputeBlkSWD; terminal
; states fall into the half pel stage unless DoHalfPelME + 4 is nonzero, which
; goes to SkipHalfPelBlkME.
;============================================================================ ; Compute SWD for block.
DoBlkMEForNextBlk: ComputeBlkSWD:
movq mm0,[esi+ebp*1] psubw mm0,[edi+ebp*1] ; Get diff for line 1. movq mm1,[esi+PITCH*3] ; Ref MB, upper left block, Line 3. psllw mm0,8 ; Extract diffs for line 1 even pels. psubw mm1,[edi+PITCH*3] ; Diff for line 3. pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1. movq mm2,[esi+PITCH*5] psllw mm1,8 psubw mm2,[edi+PITCH*5] pmaddwd mm1,mm1 movq mm3,[esi+PITCH*7] psllw mm2,8 psubw mm3,[edi+PITCH*7] pmaddwd mm2,mm2 movq mm4,[esi] ; Ref MB, upper left blk, Line 0. psllw mm3,8 psubw mm4,[edi] ; Diff for line 0. paddusw mm0,mm1 ; Accumulate SWD (lines 0 and 2). movq mm1,[esi+ebp*2] pmaddwd mm3,mm3 psubw mm1,[edi+ebp*2] paddusw mm0,mm2 movq mm2,[esi+ebp*4] pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0. psubw mm2,[edi+ebp*4] paddusw mm0,mm3 movq mm3,[esi+PITCH*6] pmaddwd mm1,mm1 psubw mm3,[edi+PITCH*6] pmaddwd mm2,mm2 paddusw mm0,mm4 pmaddwd mm3,mm3 paddusw mm0,mm1 ; paddusw mm0,mm2 ; paddusw mm0,mm3 ; punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1. movq mm4,mm7 ; Get original Best SWD for block paddusw mm1,mm0 ; mm1[48:63] is SWD for block. pxor mm2,mm2 psrlq mm1,48 ; mm1 is SWD for block. ; psubusw mm4,mm1 xor ecx,ecx pcmpeqd mm2,mm4 ; mm2[0:31] == 0 iff cand better, else -1. psubusw mm7,mm4 ; BestSWD dim (BestSWD dim CandSWD) --> new best. ; ; movdf eax,mm2 ; eax == 0 iff cand better, else -1. ;
; Registers at this point: ; ebp -- PITCH ; esi -- Address of block of candidate ref area. ; edi -- Address of target block. ; edx -- Induction variable over luma blocks in MBlockAction Descriptor. ; ecx -- Scratch ; ebx -- CurrSWDState. ; eax -- 0 iff candidate SWD better, else -1. ; mm7 -- New best SWD for current block ; mm6 -- Unused.
movq [edx].BlkY1.BlkLvlSWD,mm7 ; Save best blk level SWD. pxor mm6,mm6 ; Spec zero to prep for half pel ME. mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment. mov bl,StateEngine[eax+ebx*4+3] ; New state number; 255 means done. add esi,ecx ; Adjust ref addr for horz motion. mov eax,DoHalfPelME ; 0 if not, -4 if so. shr ecx,4 cmp bl,240 ; Terminal state? jae @f
add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV. jmp ComputeBlkSWD
@@: add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV. add eax,4 mov ecx,esi jne SkipHalfPelBlkME
; Half pel refinement for one 8x8 luma block.
; If the full pel block search ended where it started (esi + PITCH equals
; BestBlockRefAddrVP1) the neighbour SWDs precomputed at BlkLvlSWD+16 are
; reused. Otherwise HalfPelMotionEstimation is called with eax = 0, and esi/edi
; are moved forward again by 8 lines + 8 columns afterwards.
; cl/ch receive the block's full pel HMV/VMV relative to Addr0MVRefBlk (cl is
; doubled at the top of each branch), and mm3 supplies the neighbour SWDs in
; the order H+1, H-1, V+1, V-1 from the low word up.
; This span handles the case where left is no worse than right:
;   - left not better than BestBlkFullPelSWD -> BlkME_CtrIsBestHMV;
;   - otherwise the better vertical neighbour is tested against
;     BestBlkFullPelSWD and, if it also wins, the diagonal is measured with
;     HalfPelMotionEstimationBothWays (al = 1), after which edi and esi are
;     moved back to the block; the smallest of diagonal, vertical and
;     horizontal is selected.
; Exits to BlkME_HalfPelSearchDone carry ebx = SWD, eax = reference address,
; cl/ch = half pel HMV/VMV.
; Registers: ; ebp -- PITCH ; esi -- Address of best full pel reference macroblock ; edx -- Induction variable over luma blocks in MBlockAction Descriptor. ; ecx -- Copy of esi. ; edi -- Address of target block. ; ebx -- Scratch ; eax -- Set to 0 to cause HalfPelMotionEstimation to quit after one block. ; mm0:mm7 -- Scratch
mov ebx,BestBlockRefAddrVP1 add ecx,ebp cmp ebx,ecx jne FullPelBlkMEMovedFromCenter
movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel MB MV. movq mm3,[edx].BlkY1.BlkLvlSWD+16 ; SWDs: H+1, H-1, V+1, V-1. jmp FullPelBlkMEDidNotMoveFromCenter
FullPelBlkMEMovedFromCenter:
movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel MB MV. pxor mm7,mm7 ; Prep accumulator for half pel ME.
call HalfPelMotionEstimation
lea esi,[esi+ebp*8+8] ; Fix reference pointer. lea edi,[edi+ebp*8+8] ; Fix target pointer.
FullPelBlkMEDidNotMoveFromCenter:
mov eax,esi mov ebx,Addr0MVRefBlk ; Start to calc linearized MV. sub ecx,ebx ; Linearized Motion Vector sub eax,ebx ; Linearized Motion Vector sar eax,8 ; Full pel vert lin offset div 256. and cl,07FH ; Full pel HMV movdf ebx,mm3 ; ebx[ 0:15] -- SWD for rightward ref. ; ; ebx[16:31] -- SWD for leftward ref. psrlq mm3,32 mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component. mov eax,ebx shr eax,16 ; eax -- SWD for leftward ref. and ebx,00000FFFFH ; ebx -- SWD for rightward ref. cmp eax,ebx jg BlkME_RightBetterThanLeft
BlkME_LeftBetterThanRight:
add cl,cl mov ebx,BestBlkFullPelSWD cmp eax,ebx jge BlkME_CtrIsBestHMV
BlkME_LeftBestHMV:
movdf ebx,mm3 ; ebx[ 0:15] -- SWD for downward ref. ; ; ebx[16:31] -- SWD for upward ref. mov BestHalfPelHorzSWD,eax mov eax,ebx shr eax,16 ; eax -- SWD for upward ref. and ebx,00000FFFFH ; ebx -- SWD for downward ref. cmp eax,ebx jg BlkME_LeftBestHMV_DownBetterThanUp
BlkME_LeftBestHMV_UpBetterThanDown:
cmp eax,BestBlkFullPelSWD jge BlkME_LeftIsBest
BlkME_LeftBestHMV_UpBestVMV:
sub esi,PITCH+1 ; Try ref 1/2 pel left and up mov BestHalfPelVertSWD,eax mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8] mov eax,BestHalfPelVertSWD lea esi,[esi+PITCH*9+9] ; Back to center. cmp eax,ebx jle BlkME_UpBetterThanUpLeft
BlkME_UpLeftBetterThanUp:
cmp ebx,BestHalfPelHorzSWD jge BlkME_LeftIsBest
BlkME_UpLeftIsBest:
dec cl ; Back up the horz MV one to the left. lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up dec ch ; Back up the vert MV one up. jmp BlkME_HalfPelSearchDone
BlkME_UpBetterThanUpLeft:
cmp eax,BestHalfPelHorzSWD jg BlkME_LeftIsBest
BlkME_UpIsBest:
dec ch ; Back up the vert MV one up. mov ebx,eax lea eax,[esi-PITCH] ; Best is ref 1/2 pel up jmp BlkME_HalfPelSearchDone
BlkME_LeftBestHMV_DownBetterThanUp:
cmp ebx,BestBlkFullPelSWD jge BlkME_LeftIsBest
BlkME_LeftBestHMV_DownBestVMV:
dec esi ; Try ref 1/2 pel left and down mov BestHalfPelVertSWD,ebx mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8] mov eax,BestHalfPelVertSWD lea esi,[esi+ebp*8+9] ; Back to center. cmp eax,ebx jle BlkME_DownBetterThanDownLeft
BlkME_DownLeftBetterThanDown:
cmp ebx,BestHalfPelHorzSWD jge BlkME_LeftIsBest
BlkME_DownLeftIsBest:
dec cl ; Back up the horz MV one to the left. lea eax,[esi-1] ; Best is ref 1/2 pel left and down inc ch ; Advance the vert MV one down. jmp BlkME_HalfPelSearchDone
BlkME_DownBetterThanDownLeft:
cmp eax,BestHalfPelHorzSWD jle BlkME_DownIsBest
BlkME_LeftIsBest:
dec cl ; Back up the horz MV one to the left. lea eax,[esi-1] ; Best is ref 1/2 pel left. mov ebx,BestHalfPelHorzSWD jmp BlkME_HalfPelSearchDone
; Block half pel decision tree, right half: entered when the leftward SWD
; (eax) is strictly greater than the rightward SWD (ebx). cl is doubled first
; to put the HMV in half pel units.
;   - BestBlkFullPelSWD <= right -> BlkME_CtrIsBestHMV;
;   - otherwise the better vertical neighbour (from mm3) is tested against
;     BestBlkFullPelSWD and, if it also wins, the diagonal is measured with
;     HalfPelMotionEstimationBothWays (al = 1); edi and esi are moved back to
;     the block after each call.
; The right, down and down-right results all report eax = esi as their
; reference address.
; Exits to BlkME_HalfPelSearchDone carry ebx = SWD, eax = reference address,
; cl/ch = half pel HMV/VMV.
BlkME_RightBetterThanLeft:
add cl,cl mov eax,BestBlkFullPelSWD cmp eax,ebx jle BlkME_CtrIsBestHMV
BlkME_RightBestHMV:
movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref. ; ; eax[16:31] -- SWD for upward ref. mov BestHalfPelHorzSWD,ebx mov ebx,eax shr eax,16 ; eax -- SWD for upward ref. and ebx,00000FFFFH ; ebx -- SWD for downward ref. cmp eax,ebx jg BlkME_RightBestHMV_DownBetterThanUp
BlkME_RightBestHMV_UpBetterThanDown:
cmp eax,BestBlkFullPelSWD jge BlkME_RightIsBest
BlkME_RightBestHMV_UpBestVMV:
sub esi,ebp ; Try ref 1/2 pel right and up mov BestHalfPelVertSWD,eax mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8] mov eax,BestHalfPelVertSWD lea esi,[esi+PITCH*9+8] ; Back to center. cmp eax,ebx jle BlkME_UpBetterThanUpRight
BlkME_UpRightBetterThanUp:
cmp ebx,BestHalfPelHorzSWD jge BlkME_RightIsBest
BlkME_UpRightIsBest:
inc cl ; Advance the horz MV one to right. lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up dec ch ; Back up the vert MV one up. jmp BlkME_HalfPelSearchDone
BlkME_UpBetterThanUpRight:
cmp eax,BestHalfPelHorzSWD jle BlkME_UpIsBest
BlkME_RightIsBest:
mov ebx,BestHalfPelHorzSWD inc cl ; Advance the horz MV one to right. mov eax,esi jmp BlkME_HalfPelSearchDone
BlkME_RightBestHMV_DownBetterThanUp:
cmp ebx,BestBlkFullPelSWD jge BlkME_RightIsBest
BlkME_RightBestHMV_DownBestVMV:
mov BestHalfPelVertSWD,ebx mov al,1
call HalfPelMotionEstimationBothWays
lea edi,[edi+ebp*8+8] mov eax,BestHalfPelVertSWD lea esi,[esi+ebp*8+8] ; Back to center. cmp eax,ebx jle BlkME_DownBetterThanDownRight
BlkME_DownRightBetterThanDown:
cmp ebx,BestHalfPelHorzSWD jge BlkME_RightIsBest
BlkME_DownRightIsBest:
inc cl ; Advance the horz MV one to right. mov eax,esi inc ch ; Advance vert MV one down. jmp BlkME_HalfPelSearchDone
BlkME_DownBetterThanDownRight:
cmp eax,BestHalfPelHorzSWD jg BlkME_RightIsBest
BlkME_DownIsBest:
inc ch ; Advance vert MV one down. mov ebx,eax mov eax,esi jmp BlkME_HalfPelSearchDone
; Neither horizontal half pel neighbour beat the block's full pel SWD, so
; choose among up, down and centre. A tie between up and down goes to down
; (jge). The vertical winner replaces the centre only if it is strictly below
; BestBlkFullPelSWD; otherwise control goes to BlkME_CenterIsBest with
; ebx = BestBlkFullPelSWD.
BlkME_CtrIsBestHMV:
movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref. ; ; eax[16:31] -- SWD for upward ref. mov ebx,eax shr eax,16 ; eax -- SWD for upward ref. and ebx,00000FFFFH ; ebx -- SWD for downward ref. cmp eax,ebx jge BlkME_CtrBestHMV_DownBetterThanUp
BlkME_CtrBestHMV_UpBetterThanDown:
mov ebx,BestBlkFullPelSWD cmp eax,ebx jge BlkME_CenterIsBest
; Up is best.
mov ebx,eax dec ch ; Back up the vert MV one up. lea eax,[esi-PITCH] ; Best is ref 1/2 pel up jmp BlkME_HalfPelSearchDone
BlkME_CtrBestHMV_DownBetterThanUp:
mov eax,ebx mov ebx,BestBlkFullPelSWD cmp eax,ebx jge BlkME_CenterIsBest
; Down is best.
mov ebx,eax inc ch ; Advance the vert MV one down. mov eax,esi jmp BlkME_HalfPelSearchDone
; Tail of the per-block search.
; SkipHalfPelBlkME recovers the full pel block MV (cl = HMV, ch = VMV) from the
; reference address relative to Addr0MVRefBlk and takes the SWD from mm7.
; BlkME_CenterIsBest: the full pel position wins, so the reference is esi.
; BlkME_HalfPelSearchDone stores SWD, reference address and MV in the current
; block, then charges the SWD against LimitForSWDForBlkMV; if the budget goes
; negative, block MVs are abandoned (BlkEst_EarlyOut).
; Otherwise edx moves to the next block and the pointers follow: 8 columns
; right for block 2 or 4 (test dl,SIZEOF T_Blk), or 8 lines down and back to
; the left column for block 3 (test dl,2*SIZEOF T_Blk). Falling through both
; tests means all four blocks are done.
SkipHalfPelBlkME:
mov eax,esi mov ebx,Addr0MVRefBlk ; Start to calc linearized MV. sub ecx,ebx ; Linearized Motion Vector sub eax,ebx ; Linearized Motion Vector sar eax,8 ; Full pel vert lin offset div 256. and cl,07FH ; Full pel HMV add cl,cl ; mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component. ; movdf ebx,mm7 ; SWD for best full pel block MV.
BlkME_CenterIsBest:
mov eax,esi
BlkME_HalfPelSearchDone:
mov [edx].BlkY1.BlkLvlSWD,ebx mov [edx].BlkY1.PastRef,eax mov [edx].BlkY1.PHMV,cl ; Save HMV mov eax,LimitForSWDForBlkMV ; Does block's SWD put us over limit? mov [edx].BlkY1.PVMV,ch ; Save VMV sub eax,ebx jl BlkEst_EarlyOut
mov LimitForSWDForBlkMV,eax ; Remember how much is left for other blks. mov esi,BestBlockRefAddrVP1 add edi,8 ; Move to blk 2 or 4, V+4. mov ecx,Addr0MVRefBlk ; Calc addr of 0MV ref for this blk. add esi,8 ; Move to blk 2 or 4, V+4. add ecx,8 mov Addr0MVRefBlk,ecx add edx,SIZEOF T_Blk ; Increment to next block. test dl,SIZEOF T_Blk movdt mm7,SWDForBlock2Or4 mov ebx,FIRSTBLOCKMESTATE jne DoBlkMEForNextBlk ; If so, go do blk 2 or 4.
lea esi,[esi+ebp*8-8] ; Move to blk 3 lea ecx,[ecx+ebp*8-16] mov BestBlockRefAddrVP1,esi lea edi,[edi+ebp*8-16] movdt mm5,SWDULandLR+4 ; Get SWD for best MB level MVs, blk 4. movdt mm7,SWDURandLL+4 ; Get SWD for best MB level MVs, blk 3. movdf SWDForBlock2Or4,mm5 test dl,2*SIZEOF T_Blk ; Just finishing blk 2? mov Addr0MVRefBlk,ecx jne DoBlkMEForNextBlk ; If so, go do blk 3.
;============================================================================== ; Block motion vectors are best.
mov esi,[edx-4*SIZEOF T_Blk].BlkY1.BlkLvlSWD mov edi,[edx-4*SIZEOF T_Blk].BlkY4.BlkLvlSWD mov SWDULandLR,esi mov SWDULandLR+4,edi mov esi,[edx-4*SIZEOF T_Blk].BlkY3.BlkLvlSWD mov edi,[edx-4*SIZEOF T_Blk].BlkY2.BlkLvlSWD mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs mov ebx,[edx-4*SIZEOF T_Blk].BlkY2.MVs mov ecx,eax xor eax,ebx xor ecx,[edx-4*SIZEOF T_Blk].BlkY3.MVs xor ebx,[edx-4*SIZEOF T_Blk].BlkY4.MVs mov SWDURandLL,edi or eax,ebx sub edx,4*SIZEOF T_Blk ; Restore MacroBlockActionStream ptr. or eax,ecx test eax,0FFFFH mov SWDURandLL+4,esi je MotionVectorSettled
mov al,INTER4MV ; Set type for MB to INTER-coded, 4 MVs. mov [edx].BlockType,al jmp MotionVectorSettled
BlkEst_EarlyOut:
and edx,-1-3*SIZEOF T_Blk mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV.
BlockMVNotBigEnoughGain: ; Try MB-level motion vector.
cmp ecx,SWDForNon0MVToBeat jge NonZeroMVNotBigEnoughGain
ENDIF ; H263
mov ebx,BestMBHalfPelMV mov esi,BestMBHalfPelRefAddr ; Reload BestMBHalfPelRefAddr
NonZeroMBLevelMVBest:
; Non-zero macroblock level motion vector is best.
mov [edx].BlkY1.MVs,ebx mov [edx].BlkY2.MVs,ebx mov [edx].BlkY3.MVs,ebx mov [edx].BlkY4.MVs,ebx mov [edx].BlkY1.PastRef,esi lea ecx,[esi+ebp*8] mov [edx].BlkY3.PastRef,ecx add esi,8 mov [edx].BlkY2.PastRef,esi add ecx,8 mov [edx].BlkY4.PastRef,ecx jmp MotionVectorSettled
NoBlockMotionVectors:
mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV. mov eax,SWDForNon0MVToBeat cmp eax,ecx mov ebx,BestMBHalfPelMV mov esi,BestMBHalfPelRefAddr jge NonZeroMBLevelMVBest
NonZeroMVNotBigEnoughGain:
mov esi,Addr0MVRef ; 0-MV ref block. movq mm6,SWD0MVULandLR movq mm5,SWD0MVURandLL movq SWDULandLR,mm6 movq SWDURandLL,mm5
BelowZeroThresh:
mov [edx].BlkY1.PastRef,esi ; Save address of ref block, all blks. lea eax,[esi+8] mov [edx].BlkY2.PastRef,eax lea eax,[esi+ebp*8] mov [edx].BlkY3.PastRef,eax add eax,8 mov [edx].BlkY4.PastRef,eax xor eax,eax mov [edx].BlkY1.MVs,eax ; Set horz and vert MVs to 0 in all blks. mov [edx].BlkY2.MVs,eax mov [edx].BlkY3.MVs,eax mov [edx].BestFullPelMBHMV,al mov [edx].BlkY4.MVs,eax mov [edx].BestFullPelMBVMV,al mov BestMBHalfPelMV,eax
MotionVectorSettled:
IFDEF H261
;===============================================================================
; For H261, we've settled on the best motion vector.  Now we need to determine
; if spatial filtering should be done.
;
; ebp -- PITCH
; esi -- Address of block of ref area.
; edi -- Address of spatially filtered block.
; edx -- MBlockActionStream
; ecx -- Loop counter.
; ebx -- Address of constant 0x7F in all 8 bytes.
; eax -- Scratch
; mm7 -- Mask to extract bytes 0 and 7.  (High bit of bytes 1:6 must be off).
; mm6 -- All bytes -1.
; mm5 -- Mask to extract bytes 1:6 and clear the high bit (bit 7) of each.
movdf esi,mm7 ; Restore non-SLF SWD for macroblock. cmp esi,SpatialFiltThreshold jle SkipSpatialFiltering
mov ecx,DoSpatialFiltering ; Are we doing spatial filtering? mov esi,[edx].BlkY1.PastRef test cl,cl je SkipSpatialFiltering
DoSpatialFilterForChroma: DoSpatialFilterForLuma:
movq mm5,C7F7F7F7F7F7F7F7F ; Mask to extract bytes 1:6. movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV. psllq mm5,16 psrlq mm5,8 pcmpeqb mm7,mm7 pxor mm7,mm5 ; Mask to extract bytes 0 and 7. mov edi,SpatiallyFilteredMB lea eax,[esi+ebp*4] lea ebx,C7F7F7F7F7F7F7F7F ; Address of this useful constant.
SpatialFilterLoop:
movq mm0,[esi] ; 0a: <P7 P6 P5 P4 P3 P2 P1 P0> pcmpeqb mm6,mm6 ; To add one to all bytes. movq mm4,mm0 ; 0b: <P7 P6 P5 P4 P3 P2 P1 P0> psllq mm0,16 ; 0c: <P5 P4 P3 P2 P1 P0 0 0> movq mm3,[esi+ebp*1]; 1a paddb mm0,mm4 ; 0d: <P7+P5 P6+P4 ... P3+P1 P2+P0 jnk jnk > movq mm1,mm3 ; 1b psrlq mm0,9 ; 0e: <0 (P7+P5)/2 ... (P2+P0)/2 jnk> (dirty)
SpatialFilterLoop_BlockToRight:
pand mm0,mm5 ; 0f: <0 (P7+P5)/2 ... (P2+P0)/2 0> (clean) psllq mm1,16 ; 1c paddb mm0,mm4 ; 0g: <jnk (P7+2P6+P5)/2 ... (P2+2P1+P0)/2 jnk> paddb mm1,mm3 ; 1d psubb mm0,mm6 ; 0h: <jnk (P7+2P6+P5+2)/2 ... (P2+2P1+P0+2)/2 jnk> psrlq mm1,9 ; 1e psrlq mm0,1 ; 0i: <jnk (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2 jnk> pand mm4,mm7 ; 0j: <P7 0 0 0 0 0 0 P0> pand mm0,mm5 ; 0k: < 0 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2 0> pand mm1,mm5 ; 1f por mm0,mm4 ; 0l: <P7 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/4 P0> paddb mm1,mm3 ; 1g movq mm2,[esi+ebp*2]; 2a psubb mm1,mm6 ; 1h movq [edi],mm0 ; 0m: Store line 0 of filtered block. This is R0. movq mm4,mm2 ; 2b psrlq mm1,1 ; 1i pand mm3,mm7 ; 1j pand mm1,mm5 ; 1k psllq mm2,16 ; 2c por mm1,mm3 ; 1l: This is R1 paddb mm2,mm4 ; 2d psubb mm1,mm6 ; 1A: R1+1 psrlq mm2,9 ; 2e pand mm2,mm5 ; 2f paddb mm0,mm1 ; 1B: R0+R1+1 paddb mm2,mm4 ; 2g psrlq mm0,1 ; 1C: (R0+R1+1)/2 (dirty) pand mm0,[ebx] ; 1D: (R0+R1+1)/2 (clean) psubb mm2,mm6 ; 2h psrlq mm2,1 ; 2i pand mm4,mm7 ; 2j movq mm3,[esi+PITCH*3] ; 3a pand mm2,mm5 ; 2k por mm2,mm4 ; 2l: This is R2. movq mm4,mm3 ; 3b paddb mm1,mm2 ; 1E & 2B: R1+R2+1 psllq mm3,16 ; 3c psrlq mm1,1 ; 1F & 2C: (R1+R2+1)/2 (dirty) paddb mm3,mm4 ; 3d pand mm1,[ebx] ; 1G & 2D: (R1+R2+1)/2 (clean) psrlq mm3,9 ; 3e paddb mm0,mm1 ; 1H: (R0+2R1+R2+2)/2 pand mm3,mm5 ; 3f psrlq mm0,1 ; 1I: (R0+2R1+R2+2)/4 (dirty) paddb mm3,mm4 ; 3g pand mm0,[ebx] ; 1J: (R0+2R1+R2+2)/4 (clean) psubb mm3,mm6 ; 3h psrlq mm3,1 ; 3i pand mm4,mm7 ; 3j movq [edi+ebp*1],mm0 ; 1K: Store line 1 of filtered block. 
pand mm3,mm5 ; 3k movq mm0,[eax] ; 4a por mm3,mm4 ; 3l psubb mm3,mm6 ; 3A: R3+1 movq mm4,mm0 ; 4b paddb mm2,mm3 ; 2E & 3B: R2+R3+1 psllq mm0,16 ; 4c psrlq mm2,1 ; 2F & 3C: (R2+R3+1)/2 (dirty) paddb mm0,mm4 ; 4d pand mm2,[ebx] ; 2G & 3D: (R2+R3+1)/2 (clean) psrlq mm0,9 ; 4e paddb mm1,mm2 ; 2H: (R1+2R2+R3+2)/2 pand mm0,mm5 ; 4f psrlq mm1,1 ; 2I: (R1+2R2+R3+2)/4 (dirty) paddb mm0,mm4 ; 4g pand mm1,[ebx] ; 2J: (R1+2R2+R3+2)/4 (clean) psubb mm0,mm6 ; 4h psrlq mm0,1 ; 4i pand mm4,mm7 ; 4j movq [edi+ebp*2],mm1 ; 2K: Store line 2 of filtered block. pand mm0,mm5 ; 4k movq mm1,[eax+ebp*1] ; 5a por mm0,mm4 ; 4l movq mm4,mm1 ; 5b psllq mm1,16 ; 5c paddb mm3,mm0 ; 3E & 4B: R3+R4+1 paddb mm1,mm4 ; 5d add esi,8 psrlq mm3,1 ; 3F & 4C: (R3+R4+1)/2 (dirty) pand mm3,[ebx] ; 3G & 4D: (R3+R4+1)/2 (clean) psrlq mm1,9 ; 5e paddb mm2,mm3 ; 3H: (R2+2R3+R4+2)/2 pand mm1,mm5 ; 5f psrlq mm2,1 ; 3I: (R2+2R3+R4+2)/4 (dirty) paddb mm1,mm4 ; 5g pand mm2,[ebx] ; 3J: (R2+2R3+R4+2)/4 (clean) psubb mm1,mm6 ; 5h psrlq mm1,1 ; 5i pand mm4,mm7 ; 5j movq [edi+PITCH*3],mm2 ; 3K: Store line 3 of filtered block. pand mm1,mm5 ; 5k movq mm2,[eax+ebp*2] ; 6a por mm1,mm4 ; 5l psubb mm1,mm6 ; 5A: R5+1 movq mm4,mm2 ; 6b paddb mm0,mm1 ; 4E & 5B: R4+R5+1 psllq mm2,16 ; 6c psrlq mm0,1 ; 4F & 5C: (R4+R5+1)/2 (dirty) paddb mm2,mm4 ; 6d pand mm0,[ebx] ; 4G & 5D: (R4+R5+1)/2 (clean) psrlq mm2,9 ; 6e paddb mm3,mm0 ; 4H: (R3+2R4+R5+2)/2 pand mm2,mm5 ; 6f psrlq mm3,1 ; 4I: (R3+2R4+R5+2)/4 (dirty) paddb mm2,mm4 ; 6g pand mm3,[ebx] ; 4J: (R3+2R4+R5+2)/4 (clean) psubb mm2,mm6 ; 6h psrlq mm2,1 ; 6i sub cl,2 ; Loop control movq [edi+ebp*4],mm3 ; 4K: Store line 4 of filtered block. 
pand mm4,mm7 ; 6j movq mm3,[eax+PITCH*3] ; 7a pand mm2,mm5 ; 6k por mm2,mm4 ; 6l movq mm4,mm3 ; 7b paddb mm1,mm2 ; 5E & 6B: R5+R6+1 psllq mm3,16 ; 7c psrlq mm1,1 ; 5F & 6C: (R5+R6+1)/2 (dirty) paddb mm3,mm4 ; 7d pand mm1,[ebx] ; 5G & 6D: (R5+R6+1)/2 (clean) psrlq mm3,9 ; 7e paddb mm0,mm1 ; 5H: (R4+2R5+R6+2)/2 pand mm3,mm5 ; 7f psrlq mm0,1 ; 5I: (R4+2R5+R6+2)/4 (dirty) paddb mm3,mm4 ; 7g pand mm0,[ebx] ; 5J: (R4+2R5+R6+2)/4 (clean) psubb mm3,mm6 ; 7h psrlq mm3,1 ; 7i pand mm4,mm7 ; 7j movq [edi+PITCH*5],mm0 ; 5K: Store line 5 of filtered block. pand mm3,mm5 ; 7k psubb mm2,mm6 ; 7A: R6+1 por mm3,mm4 ; 7l paddb mm2,mm3 ; 6E: R6+R7+1 lea eax,[esi+ebp*4] movq mm0,[esi] ; 0a: for next iteration psrlq mm2,1 ; 6F: (R6+R7+1)/2 (dirty) pand mm2,[ebx] ; 6G: (R6+R7+1)/2 (clean) movq mm4,mm0 ; 0b: for next iteration movq [edi+PITCH*7],mm3 ; 7m: Store line 7 of filtered block. paddb mm1,mm2 ; 6H: (R5+2R6+R7+2)/2 lea edi,[edi+8] ; Advance output cursor. psrlq mm1,1 ; 6I: (R5+2R6+R7+2)/4 (dirty) pand mm1,[ebx] ; 6J: (R5+2R6+R7+2)/4 (clean) psllq mm0,16 ; 0c: for next iteration movq mm3,[esi+ebp*1] ; 1a: for next iteration paddb mm0,mm4 ; 0d: for next iteration movq [edi+PITCH*6-8],mm1 ; 6K: Store line 6 of filtered block. movq mm1,mm3 ; 1b: for next iteration psrlq mm0,9 ; 0e: for next iteration jg SpatialFilterLoop_BlockToRight
lea esi,[esi+ebp*8-16] lea eax,[eax+ebp*8-16] lea edi,[edi+ebp*8-16] mov cl,4 jl SpatialFilterLoop
SpatialFilterDone:
mov edi,TargetMacroBlockBaseAddr mov esi,SpatiallyFilteredMB test ch,ch jg ReturnFromSpatialFilterForU
; Registers at this point: ; ebp -- PITCH ; esi -- Address of upper left block of spatially filtered candidate ref area. ; edi -- Address of upper left block of target. ; edx -- MBlockActionStream ; ecx -- Scratch ; ebx -- Scratch ; eax -- Loop control ; mm0-mm4 -- Scratch ; mm5,mm6 -- SWD for each block ; mm7 -- SWD for macroblock ;
movq mm0,[esi+ebp*1] pxor mm7,mm7 mov al,3 jl ReturnFromSpatialFilterForV
ComputeSWDforSLFBlock:
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
ComputeSWDforSLFBlock_BlkToRight:
movq mm1,[esi+PITCH*3] ; Ref MB, Line 3. psllw mm0,8 ; Extract diffs for line 1 even pels. psubw mm1,[edi+PITCH*3] ; Diff for line 3. pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1. movq mm2,[esi+PITCH*5] psllw mm1,8 psubw mm2,[edi+PITCH*5] pmaddwd mm1,mm1 movq mm3,[esi+PITCH*7] psllw mm2,8 psubw mm3,[edi+PITCH*7] pmaddwd mm2,mm2 movq mm4,[esi] ; Ref MB, upper left blk, Line 0. psllw mm3,8 psubw mm4,[edi] ; Diff for line 0. paddusw mm0,mm1 ; Accumulate SWD (lines 0 and 2). movq mm1,[esi+ebp*2] pmaddwd mm3,mm3 psubw mm1,[edi+ebp*2] paddusw mm0,mm2 movq mm2,[esi+ebp*4] pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0. psubw mm2,[edi+ebp*4] paddusw mm0,mm3 movq mm3,[esi+PITCH*6] pmaddwd mm1,mm1 psubw mm3,[edi+PITCH*6] pmaddwd mm2,mm2 paddusw mm4,mm0 pmaddwd mm3,mm3 paddusw mm4,mm1 add esi,8 paddusw mm4,mm2 add edi,8 movq mm0,[esi+ebp*1] paddusw mm4,mm3 psubw mm0,[edi+ebp*1] ; Get diff for line 1. punpckldq mm1,mm4 ; Get low order SWD accum to high order of mm1. paddusw mm1,mm4 ; mm1[48:63] is SWD for block. psllq mm6,32 ; Shift previous block's SWD left. psrlq mm1,48 ; mm1 is SWD for block. sub al,2 ; Loop control. paddusw mm7,mm1 por mm6,mm1 ; Save current block's SWD. movq mm4,mm5 jg ComputeSWDforSLFBlock_BlkToRight
movq mm0,[esi+PITCH*9-16] movq mm5,mm6 lea edi,[edi+ebp*8-16] lea esi,[esi+ebp*8-16] mov al,4 jl ComputeSWDforSLFBlock
mov ebx,BestMBFullPelSWD ; Restore non-SLF SWD for macroblock. mov eax,SpatialFiltDifferential sub ebx,eax sub edi,PITCH*16+16 movdf eax,mm7 ; SLF SWD for macroblock. cmp eax,ebx jge SpatialFilterNotAsGood
movdf SWDULandLR+4,mm5 psrlq mm5,32 movdf SWDURandLL+4,mm5 movdf SWDURandLL,mm6 psrlq mm6,32 movdf SWDULandLR,mm6 mov al,INTERSLF mov ebx,SpatiallyFilteredMB mov [edx].BlockType,al sub esi,PITCH*8-8 mov [edx].BlkY4.PastRef,esi mov [edx].BlkY1.PastRef,ebx sub esi,8 add ebx,8 mov [edx].BlkY3.PastRef,esi mov [edx].BlkY2.PastRef,ebx
SkipSpatialFiltering: SpatialFilterNotAsGood: ENDIF ; H261
;=============================================================================== ; We've settled on the motion vector that will be used if we do indeed code the ; macroblock with inter-coding. We need to determine if some or all of the ; blocks can be forced as empty (copy). If all the blocks can be forced ; empty, we force the whole macroblock to be empty.
mov esi,EMPTYTHRESHOLD ; Get threshold for forcing block empty? mov ebx,SWDULandLR ; Get SWD for block 1. mov al,[edx].CodedBlocks cmp ebx,esi ; Is SWD > threshold? jg @f
and al,0FEH ; If not, indicate block 1 is NOT coded. xor ebx,ebx
@@:
mov ecx,SWDURandLL ; Get SWD for block 2. cmp ecx,esi jg @f
and al,0FDH xor ecx,ecx
@@:
add ebx,ecx mov ecx,SWDURandLL+4 ; Get SWD for block 3. cmp ecx,esi jg @f
and al,0FBH xor ecx,ecx
@@:
add ebx,ecx mov ecx,SWDULandLR+4 ; Get SWD for block 4. cmp ecx,esi jg @f
and al,0F7H xor ecx,ecx
@@:
mov [edx].CodedBlocks,al ; Store coded block pattern. and al,00FH add ebx,ecx cmp al,00FH ; Are any blks marked empty? jne InterBest ; If some blks are empty, can't code as Intra
mov edi,TargetMacroBlockBaseAddr mov [edx].SWD,ebx cmp ebx,INTERCODINGTHRESHOLD ; Is InterSWD below inter-coding thresh? jae CalculateIntraSWD
InterBestX:
mov ebx,[edx].SWD
InterBest:
mov ecx,SWDTotal ; Add to total for this macroblock class. add ecx,ebx IFDEF H261 mov SWDTotal,ecx ELSE ;H263 mov bl,DoAdvancedPrediction mov SWDTotal,ecx test bl,bl jne OBMCDifferencing ENDIF
;============================================================================ ; Perform differencing for the non-empty luma blocks of an Inter-coded ; macroblock. This is the non-OBMC case; i.e. Advanced Prediction is ; not selected. ; ; ebp -- PITCH ; esi -- Address of reference block. ; edi -- Address of target block. ; edx -- MBlockActionStream. Used as cursor over luma blocks. ; ecx -- Not in use. ; ebx -- Scratch. Used to test half pel MV resolution. ; eax[0:3] -- Coded block pattern for luma blocks.
mov cl,INTER1MV mov ebx,TargetMacroBlockBaseAddr mov StashBlockType,cl test al,1 ; Don't diff block 1 if marked empty. mov edi,ebx je @f
mov ebx,[edx].BlkY1.MVs mov esi,[edx].BlkY1.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.) movq mm4,[edi+ebp*4] ; T4 psrlq mm1,1 movq mm5,[edi+PITCH*5] psubb mm4,mm0 ; D4 = T4 - P4 movq mm0,[edi+PITCH*6] psubb mm5,mm1 movq mm1,[edi+PITCH*7] pand mm2,mm6 pand mm3,mm6 psrlq mm2,1 movq PelDiffsLine4,mm4 ; Store D4. psubb mm0,mm2 movq PelDiffsLine5,mm5 psrlq mm3,1 movq PelDiffsLine6,mm0 psubb mm1,mm3 push eax ; Adjust stack pointer StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
mov al,[edx].CodedBlocks sub al,bl mov ebx,TargetMacroBlockBaseAddr mov [edx].CodedBlocks,al pop edi ; Adjust stack pointer StackOffset TEXTEQU <0>
@@:
lea edi,[ebx+8] ; Get address of next macroblock to do. test al,2 ; Don't diff block 2 if marked empty. je @f
mov ebx,[edx].BlkY2.MVs mov esi,[edx].BlkY2.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.) movq mm4,[edi+ebp*4] ; T4 psrlq mm1,1 movq mm5,[edi+PITCH*5] psubb mm4,mm0 ; D4 = T4 - P4 movq mm0,[edi+PITCH*6] psubb mm5,mm1 movq mm1,[edi+PITCH*7] pand mm2,mm6 pand mm3,mm6 psrlq mm2,1 movq PelDiffsLine4,mm4 ; Store D4. psubb mm0,mm2 movq PelDiffsLine5,mm5 psrlq mm3,1 movq PelDiffsLine6,mm0 psubb mm1,mm3 push eax ; Adjust stack pointer StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
shl bl,1 mov al,[edx].CodedBlocks sub al,bl mov ebx,TargetMacroBlockBaseAddr mov [edx].CodedBlocks,al pop edi ; Adjust stack pointer StackOffset TEXTEQU <0>
@@:
lea edi,[ebx+ebp*8] ; Get address of next macroblock to do. test al,4 ; Don't diff block 3 if marked empty. je @f
mov ebx,[edx].BlkY3.MVs mov esi,[edx].BlkY3.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.) movq mm4,[edi+ebp*4] ; T4 psrlq mm1,1 movq mm5,[edi+PITCH*5] psubb mm4,mm0 ; D4 = T4 - P4 movq mm0,[edi+PITCH*6] psubb mm5,mm1 movq mm1,[edi+PITCH*7] pand mm2,mm6 pand mm3,mm6 psrlq mm2,1 movq PelDiffsLine4,mm4 ; Store D4. psubb mm0,mm2 movq PelDiffsLine5,mm5 psrlq mm3,1 movq PelDiffsLine6,mm0 psubb mm1,mm3 push eax ; Adjust stack pointer StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
shl bl,2 mov al,[edx].CodedBlocks sub al,bl mov ebx,TargetMacroBlockBaseAddr mov [edx].CodedBlocks,al pop edi ; Adjust stack pointer StackOffset TEXTEQU <0>
@@:
lea edi,[ebx+ebp*8+8] ; Get address of next macroblock to do. test al,8 ; Don't diff block 4 if marked empty. je NonOBMCDifferencingDone
mov ebx,[edx].BlkY4.MVs mov esi,[edx].BlkY4.PastRef
call DoNonOBMCDifferencing
; (Finish differencing the last four lines.) movq mm4,[edi+ebp*4] ; T4 psrlq mm1,1 movq mm5,[edi+PITCH*5] psubb mm4,mm0 ; D4 = T4 - P4 movq mm0,[edi+PITCH*6] psubb mm5,mm1 movq mm1,[edi+PITCH*7] pand mm2,mm6 pand mm3,mm6 psrlq mm2,1 movq PelDiffsLine4,mm4 ; Store D4. psubb mm0,mm2 movq PelDiffsLine5,mm5 psrlq mm3,1 movq PelDiffsLine6,mm0 psubb mm1,mm3 push eax ; Adjust stack pointer StackOffset TEXTEQU <4>
call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
shl bl,3 mov al,[edx].CodedBlocks sub al,bl pop edi ; Adjust stack pointer mov [edx].CodedBlocks,al
StackOffset TEXTEQU <0> NonOBMCDifferencingDone:
IFDEF H261 ELSE mov al,IsPlainPFrame test al,al jne NextMacroBlock
movq mm6,C0101010101010101 pxor mm7,mm7 ; Initialize SWD accumulator
call MMxDoBFrameLumaBlocks
ENDIF jmp NextMacroBlock
;============================================================================
; HalfPelMotionEstimation -- internal function.
;
; Does half pel motion estimation for whole macroblocks, or individual blocks.
; For every 8x8 block processed, the squared-difference measure (called SWD /
; SSD in the comments below) of the target against the reference interpolated
; 1/2 pel rightward, leftward, downward and upward is accumulated over the
; block's 8 lines, packed into one quadword, and stored at
; HalfPelMBMESWDAccum[eax*8+32].
;
; Register usage:
;
; ebp -- PITCH
; esi -- Address of best full pel reference macroblock.  For MBME unchanged
;        at exit.  For BlkME, adjusted by -8-8*PITCH.
; edi -- Address of target macroblock.  For MBME unchanged at exit.  For BlkME,
;        adjusted by -8-8*PITCH.
; edx -- MBlockActionStream
; ecx -- Reserved.
; ebx -- For MBME:  240 + Flags to indicate which half pel ME to do:
;          1 --> right;  2 --> left;  4 --> down;  8 --> up
;        For BlkME:  Garbage
;        Only bl[0:3] survive; bl[6:7] are used as the line-pair loop counter.
; eax -- Count from -4 to -1 for blocks of macroblock.  0 for single block.
;        On exit eax holds the low four bits of bl.
; mm7 -- Initialized to zero.
; mm6 -- Initialized to zero.
; mm0:mm7 -- Scratch
; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
; mm3[16:31] -- SWD for ref 1/2 pel leftward
; mm3[32:47] -- SWD for ref 1/2 pel downward
; mm3[48:63] -- SWD for ref 1/2 pel upward
;
; Loop control:  the MMX instructions and LEA leave EFLAGS untouched, so every
; conditional jump below tests the most recent integer ADD/SUB.
;   add bl,080H / jnc -- taken every other pass (two passes of 2 lines each).
;   add bl,040H / jns -- taken once, giving four passes (8 lines) per block.
;   add eax,2   / jl  -- upper block then lower block of one column.
;   (same flags) / je -- after the left column (eax reached 0) do the right
;                        column; eax runs -4, -2, -3, -1 for a macroblock.

StackOffset TEXTEQU <4>
HalfPelMotionEstimation:

  and    bl,15                         ; Keep flag bits; clear loop counter bits.

HalfPelMBMEForUpperBlock:
HalfPelMEForFirst2LinesOfBlock:

  movq   mm0,[esi-PITCH]               ; <P^7 P^6 P^5 P^4 P^3 P^2 P^1 P^0>
  movq   mm1,[esi]                     ; <P07 P06 P05 P04 P03 P02 P01 P00>
  movq   mm4,[edi+ebp*1]               ; <T17 T16 T15 T14 T13 T12 T11 T10>
  paddb  mm0,mm1                       ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>

HalfPelMEForNext2LinesOfBlock:

  movq   mm2,[esi+ebp*1]               ; <P17 P16 P15 P14 P13 P12 P11 P10>
  psrlw  mm0,1                         ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
  movq   mm5,mm1                       ; <P07 P06 P05 P04 P03 P02 P01 P00>
  psllw  mm4,8                         ; <T16 0 T14 0 T12 0 T10 0>

HalfPelMBMEForLowerBlock:

  psubw  mm0,[edi]                     ; <(P^7+P07)/2-T07 junk (P^5+P05)/2-T05 junk ...>
  paddb  mm5,mm2                       ; <P07+P17 P06+P16 P05+P15 P04+P14 ...>
  pmullw mm1,C0101010101010101         ; <(P07+P06)*256+P06 ...>
  psllw  mm5,8                         ; <(P06+P16) 0 (P04+P14) 0 ...>
  pmaddwd mm0,mm0                      ; Square diff for line 0 odd pels, upward ref.
  psrlw  mm5,1                         ; <(P06+P16)/2 0 (P04+P14)/2 0 ...>
  movq   mm3,[edi]                     ; <T07 T06 T05 T04 T03 T02 T01 T00>
  psubw  mm4,mm5                       ; <T16-(P06+P16)/2 junk ...>
  pmaddwd mm4,mm4                      ; Square diff for line 1 even pels, upward ref.
  psrlw  mm1,1                         ; <(P07+P06)*128+P06/2 ...>
  psllw  mm3,8                         ; <T06 0 T04 0 T02 0 T00 0>
  lea    edi,[edi+ebp*2]               ; Advance Target cursor
  psubw  mm3,mm1                       ; <T06-(P07+P06)/2 junk T04-(P05+P04)/2 junk ...>
  lea    esi,[esi+ebp*2]               ; Advance Reference cursor
  psubw  mm1,[edi-PITCH*2]             ; <(P07+P06)/2-T07 junk (P05+P04)/2-T05 junk ...>
  pmaddwd mm3,mm3                      ; Square diff for line 0 even pels, rightward ref.
  pmaddwd mm1,mm1                      ; Square diff for line 0 odd pels, leftward ref.
  paddusw mm0,mm4                      ; SSD for line 0 and 1, upward ref.
  pand   mm0,CFFFF0000FFFF0000         ; Extract SSD for line 0 and 1, upward ref.
  movq   mm4,mm2                       ; <P17 P16 P15 P14 P13 P12 P11 P10>
  paddusw mm6,mm0                      ; Accumulate SSD for line 0 and 1, upward ref.
  psrlq  mm4,8                         ; < 0 P17 P16 P15 P14 P13 P12 P11>
  pand   mm1,CFFFF0000FFFF0000         ; Extract SSD for line 0, leftward ref.
  psrld  mm3,16                        ; Extract SSD for line 0, rightward ref.
  pmullw mm4,C0200010101010101         ; <P17*256*2 (P16+P15)*256+P15 ...>
  paddw  mm3,mm1                       ; SSD for line 0, leftward and rightward refs.
  movq   mm1,[esi]                     ; <P27 P26 P25 P24 P23 P22 P21 P20>
  movq   mm0,mm2                       ; <P17 P16 P15 P14 P13 P12 P11 P10>
  paddusw mm7,mm3                      ; Accumulate SSD for line 0, left and right refs.
  paddb  mm2,mm1                       ; <P17+P27 P16+P26 P15+P25 P14+P24 ...>
  movq   mm3,mm0                       ; <P17 P16 P15 P14 P13 P12 P11 P10>
  psrlw  mm4,1                         ; <P17 (P16+P15)*128+P15/2 ...>
  psubw  mm4,[edi-PITCH*1]             ; <P17-T17 junk (P16+P15)/2-T15 junk ...>
  psllq  mm3,8                         ; <P16 P15 P14 P13 P12 P11 P10 0>
  pmullw mm3,C0101010101010002         ; <(P16+P15)*256+P15 ... P10*256*2>
  psrlw  mm2,1                         ; <(P17+P27)/2 junk (P15+P25)/2 junk ...>
  movq   StashMM6,mm6                  ; Spill the up/down accumulator; mm6 is reused.
  pmaddwd mm4,mm4                      ; Square diff for line 1 odd pels, rightward ref.
  movq   mm6,[edi-PITCH*1]             ; <T17 T16 T15 T14 T13 T12 T11 T10>
  psrlw  mm3,1                         ; <(P16+P15)*128+P15/2 ... P10*256>
  psubw  mm2,[edi-PITCH*1]             ; <(P17+P27)/2-T17 junk (P15+P25)/2-T15 junk ...>
  psllw  mm6,8                         ; <T16 0 T14 0 T12 0 T10 0>
  psubw  mm3,mm6                       ; <(P16+P15)/2-T16 junk ... P10-T10>
  psrld  mm4,16                        ; Extract SSD for line 1, rightward ref.
  movq   mm6,[edi-PITCH*2]             ; <T07 T06 T05 T04 T03 T02 T01 T00>
  pmaddwd mm3,mm3                      ; Square diff for line 1 even pels, leftward ref.
  pmaddwd mm2,mm2                      ; Square diff for line 1 odd pels, downward ref.
  psllw  mm6,8                         ; <T06 0 T04 0 T02 0 T00 0>
  paddusw mm7,mm4                      ; Accumulate SSD for line 1, rightward ref.
  psubw  mm6,mm5                       ; <T06-(P06+P16)/2 junk ...>
  pand   mm3,CFFFF0000FFFF0000         ; Extract SSD for line 1, leftward ref.
  pmaddwd mm6,mm6                      ; Square diff for line 0 even pels, downward ref.
  add    bl,080H                       ; Loop control; CF is consumed by the jnc below.
  psrld  mm2,16                        ; Extract SSD for line 1, downward ref.
  paddusw mm2,StashMM6                 ; Accumulate SSD for line 1, downward ref.
  paddusw mm7,mm3                      ; Accumulate SSD for line 1, leftward ref.
  movq   mm4,[edi+ebp*1]               ; <T17 T16 T15 T14 T13 T12 T11 T10>
  psrld  mm6,16                        ; Extract SSD for line 0, downward ref.
  paddusw mm6,mm2                      ; Accumulate SSD for line 0, downward ref.
  paddb  mm0,mm1                       ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>
  punpckldq mm5,mm6                    ; Speculatively start to accum partial SWDs.
  jnc    HalfPelMEForNext2LinesOfBlock ; Iterate twice, for half a block.

  punpckldq mm3,mm7
  add    bl,040H                       ; Loop control; SF is consumed by the jns below.
  paddusw mm5,mm6
  jns    HalfPelMEForNext2LinesOfBlock ; Iterate twice, for a whole block.

  ; The next four non-accumulating instructions (psrlw mm0 / movq mm2 /
  ; movq mm5 / psllw mm4) repeat the work at HalfPelMEForNext2LinesOfBlock so
  ; that the lower block can be entered at HalfPelMBMEForLowerBlock.
  paddusw mm3,mm7
  psrlw  mm0,1                         ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
  movq   mm2,[esi+ebp*1]               ; <P17 P16 P15 P14 P13 P12 P11 P10>
  punpckhdq mm3,mm5                    ; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
  ;                                    ; mm3[16:31] -- SWD for ref 1/2 pel leftward
  ;                                    ; mm3[32:47] -- SWD for ref 1/2 pel downward
  ;                                    ; mm3[48:63] -- SWD for ref 1/2 pel upward
  movq   mm5,mm1                       ; <P07 P06 P05 P04 P03 P02 P01 P00>
  sub    bl,080H                       ; Reset loop counter bits for the next block.
  movq   HalfPelMBMESWDAccum[eax*8+32],mm3
  psllw  mm4,8                         ; <T16 0 T14 0 T12 0 T10 0>
  add    eax,2
  jl     HalfPelMBMEForLowerBlock      ; Iterate twice for 2 blocks.

  lea    edi,[edi-PITCH*16+8]          ; Back up 16 lines, step right one block.
  lea    esi,[esi-PITCH*16+8]
  lea    eax,[eax-3]                   ; LEA keeps the flags of "add eax,2" intact.
  je     HalfPelMBMEForUpperBlock      ; Iterate twice for macroblock.

  sub    edi,16
  xor    eax,eax
  sub    esi,16
  mov    al,bl                         ; Return the low four flag bits in eax.
  ret

StackOffset TEXTEQU <0>
;============================================================================
; HalfPelMotionEstimationBothWays -- internal function.
;
; Does half pel motion estimation in both directions (reference interpolated
; horizontally and vertically at once) for whole macroblocks, or individual
; blocks.
;
; Register usage:
;
; ebp -- PITCH
; esi -- Address of best full pel reference macroblock.  For MBME unchanged
;        at exit.  For BlkME, adjusted by -8-8*PITCH.
; edi -- Address of target macroblock.  For MBME unchanged at exit.  For BlkME,
;        adjusted by -8-8*PITCH.
; edx -- MBlockActionStream
; ecx -- Reserved.  Contains motion vectors.
; ebx -- Returns SWD for this reference block or macroblock.
; al  -- Count from 4 to 1 for blocks of macroblock.  1 for blk only.
;        al[6:7] double as the line-pair loop counter inside a block.
; mm0:mm6 -- Scratch
; mm7 -- Reserved.  Contains SWDs for four 1/2 pel refs at main compass points.
; mm4 -- Returns SWD for this reference block or macroblock.
;
; mm6 is zeroed once on entry and never reset between blocks, so the value
; extracted into mm4/ebx after each block is the running total so far.
;
; Loop control:  the MMX instructions, LEA and MOV leave EFLAGS untouched, so
; each conditional jump below tests the most recent integer ADD/SUB on al.

StackOffset TEXTEQU <4>
HalfPelMotionEstimationBothWays:

  movq   mm3,C0101010101010101
  pxor   mm6,mm6                       ; Zero out SSD accumulator.

HalfPelMBMEForUpperBlockBothWays:
HalfPelMEForFirst2LinesOfBlockBothWays:

  movq   mm0,[esi]                     ; <P07 P06 P05 P04 P03 P02 P01 P00>

HalfPelMEForNext2LinesOfBlockBothWays:
HalfPelMBMEForLowerBlockBothWays:

  movq   mm1,[esi+ebp*1]               ; <P17 P16 P15 P14 P13 P12 P11 P10>
  pmullw mm0,mm3                       ; <(P07+P06)*256+P06 ...>
  movq   mm2,[esi+ebp*2]               ; <P27 P26 P25 P24 P23 P22 P21 P20>
  pmullw mm3,mm1                       ; <(P17+P16)*256+P16 ...>
  movq   mm4,mm2                       ; <P27 P26 P25 P24 P23 P22 P21 P20>
  psrlq  mm2,8                         ; < 0 P27 P26 P25 P24 P23 P22 P21>
  pmullw mm2,C0200010101010101         ; <P27*256*2 (P26+P25)*256+P25 ...>
  psrlq  mm1,8                         ; < 0 P17 P16 P15 P14 P13 P12 P11>
  pmullw mm1,C0200010101010101         ; <P17*256*2 (P16+P15)*256+P15 ...>
  psrlw  mm3,2                         ; <(P17+P16)/4 junk ...> (w/ 2 frac bits)
  movq   mm5,[edi]                     ; <T07 T06 T05 T04 T03 T02 T01 T00>
  psrlw  mm0,2                         ; <(P07+P06)/4 junk ...> (w/ 2 frac bits)
  paddw  mm3,mm0                       ; <(P07+P06+P17+P16)/4 junk ...>
  psrlw  mm2,2                         ; <P27/2 junk (P26+P25)/4 junk ...>
  psubw  mm2,[edi+ebp*1]               ; <P27/2-T17 junk (P26+P25)/4-T15 junk ...>
  psrlw  mm1,2                         ; <P17/2 junk (P16+P15)/4 junk ...>
  paddw  mm2,mm1                       ; <(P17+P27)/2-T17 junk (P16+P15+P26+P25)/4-T15 junk ...>
  psllw  mm5,8                         ; <T06 0 T04 0 T02 0 T00 0>
  psubw  mm5,mm3                       ; <T06-(P07+P06+P17+P16)/4 junk ...>
  pmaddwd mm2,mm2                      ; Square diffs for odd pels of line 1.
  pmaddwd mm5,mm5                      ; Square diffs for even pels of line 0.
  movq   mm0,mm4                       ; <P27 P26 P25 P24 P23 P22 P21 P20>
  lea    edi,[edi+ebp*2]               ; Advance target cursor.
  lea    esi,[esi+ebp*2]               ; Advance reference cursor.
  paddusw mm6,mm2                      ; Accumulate SSD for odd pels of line 1.
  add    al,080H                       ; Loop control; CF is consumed by the jnc below.
  movq   mm3,C0101010101010101         ; Reload multiplier for the next pass.
  paddusw mm6,mm5                      ; Accumulate SSD for even pels of line 0.
  punpckldq mm4,mm6                    ; Speculatively start to accum partial SWDs.
  jnc    HalfPelMEForNext2LinesOfBlockBothWays  ; Twice, for half a block.

  add    al,040H                       ; Loop control; SF is consumed by the jns below.
  paddusw mm4,mm6                      ; After whole block, SSD is in mm4[48:63].
  psrlq  mm4,48
  jns    HalfPelMEForNext2LinesOfBlockBothWays  ; Twice, for a whole block.

  movdf  ebx,mm4                       ; ebx = SWD accumulated so far.
  sub    al,082H                       ; Strip counter bits and count down by 2.
  jg     HalfPelMBMEForLowerBlockBothWays  ; Iterate twice for 2 blocks.

  lea    edi,[edi-PITCH*16+8]          ; Back up 16 lines, step right one block.
  lea    esi,[esi-PITCH*16+8]
  mov    al,3                          ; MOV keeps the flags of "sub al,082H" intact.
  je     HalfPelMBMEForUpperBlockBothWays  ; Iterate twice for macroblock.

  sub    edi,16
  sub    esi,16
  ret

StackOffset TEXTEQU <0>
;============================================================================
; DoNonOBMCDifferencing -- internal function.  This function is also
; called to do frame differencing for chroma blocks.
;
; Stores the pel differences (target minus prediction) for lines 0..3 of an
; 8x8 block into PelDiffsLine0..PelDiffsLine3, and returns with the material
; for the predictions of lines 4..7 in mm0..mm3.  The caller finishes those
; four lines itself (see the "Finish differencing the last four lines" code at
; each call site).
;
; Register usage:
;
; ebp -- PITCH
; esi -- Address of reference block.
; edi -- Address of target block.
; edx -- Unavailable.  In use by caller.
; ecx -- Not in use.
; ebx -- Motion vectors for the block.  bl[0] indicates whether half-pel
;        horizontal interpolation is required;  bh[0] same for vertical.
;        This register is then used for scratch purposes.
; eax -- Unavailable.  In use by caller.
; mm0-mm5 -- Scratch
; mm6 -- 8 bytes of 0xFE
; mm7 -- 8 bytes of -1
;
; In the H261 build the half-pel tests are assembled out, so only the
; full-pel path below exists.

StackOffset TEXTEQU <4>

DoNonOBMCDifferencing:  ; Internal Function

  pcmpeqb mm7,mm7                      ; mm7 = 8 bytes of -1.
  pcmpeqb mm6,mm6
IFDEF H261
ELSE ;H263
  shr    bl,1                          ; CF = bl[0]:  half-pel horizontal MV?
  jc     NonOBMCDiff_Horz
ENDIF

  movq   mm1,[esi+ebp*1]               ; BC . . . R0Dn
  paddb  mm6,mm6                       ; mm6 = 8 bytes of 0xFE.
IFDEF H261
ELSE ;H263
  shr    bh,1                          ; CF = bh[0]:  half-pel vertical MV?
  jc     NonOBMCDiff_Vert
ENDIF

  ; Full-pel prediction:  the reference block is the prediction.
  psubb  mm1,[edi+ebp*1]               ; P1 - T1
  pxor   mm4,mm4
  movq   mm0,[edi]                     ; T0
  psubb  mm4,mm1                       ; D1 = T1 - P1
  psubb  mm0,[esi]                     ; D0 = T0 - P0
  movq   mm2,[edi+ebp*2]               ; T2
  movq   mm3,[edi+PITCH*3]             ; T3
  psubb  mm2,[esi+ebp*2]               ; D2 = T2 - P2
  psubb  mm3,[esi+PITCH*3]             ; D3 = T3 - P3
  movq   PelDiffsLine0,mm0             ; Store D0.
  movq   PelDiffsLine1,mm4             ; Store D1.
  movq   PelDiffsLine2,mm2             ; Store D2.
  movq   PelDiffsLine3,mm3             ; Store D3.

  ; Lines 5..7 are returned doubled because the shared finishing code at the
  ; call sites shifts mm1..mm3 right by one (as it must for the interpolating
  ; paths, which return sums rather than averages).
  ; NOTE(review): doubling with paddb drops bit 7 of each pel, so this
  ; presumably relies on reference pels being 7-bit values -- confirm.
  movq   mm3,[esi+PITCH*7]             ; P7
  movq   mm2,[esi+PITCH*6]             ; P6
  paddb  mm3,mm3                       ; Double so that return will fix it.
  movq   mm1,[esi+PITCH*5]             ; P5
  paddb  mm2,mm2                       ; Double so that return will fix it.
  movq   mm0,[esi+ebp*4]               ; P4
  paddb  mm1,mm1                       ; Double so that return will fix it.
  ret
; H263 only:  half-pel paths of DoNonOBMCDifferencing.  (The conditional opened
; here is not closed within this block.)
IFDEF H261
ELSE ;H263

; Vertical half-pel interpolation.  Entered from DoNonOBMCDifferencing with
; mm1 = line 1 of the reference (R0Dn), mm6 = 8 bytes of 0xFE, mm7 = 8 bytes
; of -1.
NonOBMCDiff_Vert:                      ;  0123    Detail for 0

  movq   mm0,[esi]                     ; C. . R0Up
  psubb  mm1,mm7                       ; DD . R0Dn+1

  call   Get4LinesOfPred_InterpVert

  ; Difference lines 0..3 while finishing the steps that the function above
  ; leaves to its caller, and set up mm0/mm1 for the next four lines.
  movq   mm5,[edi]                     ; T0
  psrlq  mm1,1                         ; O .
  movq   mm7,[edi+ebp*1]               ; T1
  psubb  mm5,mm0                       ; D0 = T0 - P0
  movq   mm0,mm4                       ; Line 4 becomes R0Up for the next four lines.
  psubb  mm7,mm1                       ; D1
  movq   mm1,[edi+ebp*2]               ; T2
  pand   mm2,mm6                       ; .N.
  movq   mm4,[edi+PITCH*3]             ; T3
  pand   mm3,mm6                       ; . N
  psrlq  mm2,1                         ; .O.
  movq   PelDiffsLine0,mm5             ; Store D0.
  psubb  mm1,mm2                       ; D2
  movq   PelDiffsLine1,mm7             ; Store D1.
  psrlq  mm3,1                         ; . O
  movq   PelDiffsLine2,mm1             ; Store D2.
  psubb  mm4,mm3                       ; D3
  movq   mm1,[esi+ebp*1]               ; BC . . . R0Dn
  pcmpeqb mm7,mm7                      ; Restore mm7 = 8 bytes of -1.
  movq   PelDiffsLine3,mm4             ; Store D3.
  psubb  mm1,mm7                       ; DD . . . R0Dn+1
  ; Execution falls through into Get4MoreLinesOfPred_InterpVert, whose ret
  ; returns to the caller of DoNonOBMCDifferencing.
; jmp    Get4MoreLinesOfPred_InterpVert

;===========================================================================
; Internal function to get 4 lines of prediction, interpolating in the
; vertical direction.  The first 3 lines of the function are scheduled into
; the caller's space, and so are commented out here.  For 8 lines of prediction,
; a second call, to the second entry point, is called after consuming the
; outputs of the first function call.  Certain registers must remain intact
; to convey information from the first call to the second.
;
; ebp -- PITCH
; edi -- Points to target block.
; esi -- Points to Upper left corner of 8 column, 9 row block that will be
;        interpolated vertically to generate prediction.
; edx -- Reserved (MBlockActionStream)
; ecx -- Not in use.
; ebx -- Will be used.
; eax -- Reserved.
; mm6 -- 8 bytes of 0xFE.
; mm7 -- 8 bytes of -1.
; mm0-mm5 -- Scratch.
;
; On return esi has been advanced by four lines and mm4 holds line 4 of the
; reference.  The trailing steps that are commented out before the ret are
; likewise performed by the caller.

StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
Get4LinesOfPred_InterpVert:            ;  0123    Details for line 0
; movq   mm1,[esi+ebp*1]               ; BC . R0Dn
; movq   mm0,[esi]                     ; C. . R0Up
; psubb  mm1,mm7                       ; DD . R0Dn+1
Get4MoreLinesOfPred_InterpVert:
  movq   mm2,[esi+ebp*2]               ; BC.
  paddb  mm0,mm1                       ; E. . R0Up+R0Dn+1
  movq   mm3,[esi+PITCH*3]             ; .BC
  paddb  mm1,mm2                       ; E .
  movq   mm4,[esi+ebp*4]               ; . BC
  psubb  mm3,mm7                       ; .DD
  paddb  mm2,mm3                       ; .E.
  pand   mm0,mm6                       ; F. . Pre-clean
  paddb  mm3,mm4                       ; E
  pand   mm1,mm6                       ; F .
  lea    esi,[esi+ebp*4]               ; Advance to next four lines.
  psrlq  mm0,1                         ; G. . P0 = (R0Up + R0Dn + 1) / 2
; pand   mm2,mm6                       ; G.
; psrlq  mm1,1                         ; H .
; pand   mm3,mm6                       ; G
; psrlq  mm2,1                         ; H.
; psrlq  mm3,1                         ; H
  ret
StackOffset TEXTEQU <4>
;===========================================================================
; Horizontal half-pel interpolation.  Entered from DoNonOBMCDifferencing when
; bl[0] was set; mm6 is all ones on entry and is turned into 8 bytes of 0xFE
; here.  If bh[0] is also set, control passes to NonOBMCDiff_Both.
NonOBMCDiff_Horz:

  movq   mm5,[esi+1]                   ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  paddb  mm6,mm6                       ; . . 8 bytes of 0xFE
  shr    bh,1                          ; CF = bh[0]:  half-pel vertical MV too?
  jc     NonOBMCDiff_Both

  movq   mm7,[edi+PITCH*3]             ; T3

  call   Get4LinesOfPred_InterpHorz

  ; Difference lines 0..3 while finishing the steps that the function above
  ; leaves to its caller, and preload mm5 for the next four lines.
  movq   mm4,[edi]                     ; T0
  psrlq  mm1,1                         ; O .
  movq   mm5,[edi+ebp*1]               ; T1
  psubb  mm4,mm0                       ; D0 = T0 - P0
  movq   mm0,[edi+ebp*2]               ; T2
  psubb  mm5,mm1                       ; D1
  movq   mm1,[edi+PITCH*3]             ; T3
  pand   mm2,mm6                       ; .N.
  pand   mm3,mm6                       ; . N
  psrlq  mm2,1                         ; .O.
  movq   PelDiffsLine0,mm4             ; Store D0.
  psubb  mm0,mm2                       ; D2
  movq   PelDiffsLine1,mm5             ; Store D1.
  psrlq  mm3,1                         ; . O
  movq   PelDiffsLine2,mm0             ; Store D2.
  psubb  mm1,mm3                       ; D3
  movq   mm5,[esi+1]                   ; <R48 R47 R46 R45 R44 R43 R42 R41>
  movq   PelDiffsLine3,mm1             ; Store D3.
  ; Execution falls through into Get4MoreLinesOfPred_InterpHorz, whose ret
  ; returns to the caller of DoNonOBMCDifferencing.

;===========================================================================
; Internal function to get 4 lines of prediction, interpolating in the
; horizontal direction.  The first line of the function is scheduled into
; the caller's space, and so is commented out here.  For 8 lines of prediction,
; a second call, to the second entry point, is called after consuming the
; outputs of the first function call.  Certain registers must remain intact
; to convey information from the first call to the second.
;
; ebp -- PITCH
; edi -- Points to target block.
; esi -- Points to Upper left corner of 9 column, 8 row block that will be
;        interpolated horizontally to generate prediction.
; edx -- Reserved (MBlockActionStream)
; ecx -- Not in use.
; ebx -- Will be used.
; eax -- Reserved.
; mm6 -- 8 bytes of 0xFE.
; mm0-mm5 -- Will be used.
;
; For each line, the leftmost reference pel is read as a byte into bl and
; used to index the Pel_Rnd table, whose entry is added to the byte-wise sum
; of the line and its one-pel-shifted copy.  On return esi has been advanced
; by four lines.  The trailing steps that are commented out before the ret
; are performed by the caller.

StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
Get4LinesOfPred_InterpHorz:
Get4MoreLinesOfPred_InterpHorz:

; movq   mm5,[esi+1]                   ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  xor    ebx,ebx                       ; . .
  movq   mm0,mm5                       ; B. . <R08 R07 R06 R05 R04 R03 R02 R01>
  mov    bl,[esi]                      ; C. . R00
  psllq  mm5,8                         ; D. . <R07 R06 R05 R04 R03 R02 R01 0>
  movq   mm1,[esi+ebp*1+1]             ; A .
  paddb  mm0,mm5                       ; E. . <R08+R07 ... R02+R01 R01 >
  paddb  mm0,Pel_Rnd[ebx*8]            ; F. . <R08+R07+1 ... R02+R01+1 R01+R00+1>
  movq   mm4,mm1                       ; B .
  mov    bl,[esi+ebp*1]                ; C .
  psllq  mm4,8                         ; D .
  movq   mm2,[esi+ebp*2+1]             ; A.
  paddb  mm1,mm4                       ; E .
  paddb  mm1,Pel_Rnd[ebx*8]            ; F .
  movq   mm5,mm2                       ; B.
  mov    bl,[esi+ebp*2]                ; C.
  psllq  mm5,8                         ; D.
  movq   mm3,[esi+PITCH*3+1]           ; A
  paddb  mm2,mm5                       ; E.
  paddb  mm2,Pel_Rnd[ebx*8]            ; F.
  movq   mm4,mm3                       ; B
  mov    bl,[esi+PITCH*3]              ; C
  psllq  mm4,8                         ; D
  paddb  mm3,mm4                       ; E
  pand   mm0,mm6                       ; G. . pre-cleaned
  paddb  mm3,Pel_Rnd[ebx*8]            ; F
  psrlq  mm0,1                         ; H. . P0=<(R08+R07+1)/2 ... (R01+R00+1)/2>
  lea    esi,[esi+ebp*4]               ; Advance to next four lines.
  pand   mm1,mm6                       ; G .
; pand   mm2,mm6                       ; G.
; psrlq  mm1,1                         ; H .
; pand   mm3,mm6                       ; G
; psrlq  mm2,1                         ; H.
; psrlq  mm3,1                         ; H
  ret
StackOffset TEXTEQU <4>
; The steps commented out above are scheduled into the mem-ops the caller has
; to do at the point of return.  As though these ops were done, the registers
; look as follows:
; mm0 -- Prediction for line 0.
; mm1 -- Prediction for line 1.
; mm2 -- Prediction for line 2.
; mm3 -- Prediction for line 3.
; mm6 -- 8 bytes of 0xFE.  Must be this when computing pred for next 4 lines.
;=============================================================================
; NonOBMCDiff_Both
;
; Differencing for lines 0 thru 3 of a block whose prediction is interpolated
; in both directions.  Calls Get4LinesOfPred_InterpBoth, finishes the steps
; that function leaves to its caller (halving mm1, masking and halving mm2
; and mm3), subtracts the four prediction lines from the target lines at edi,
; and stores the differences to PelDiffsLine0 thru PelDiffsLine3.
;
; It then sets up the carry-over state for the next four lines (mm0 <- mm4,
; mm5 doubled, mm7 restored to all ones after being used as scratch) and
; JUMPS, rather than calls, to Get4MoreLinesOfPred_InterpBoth.  The ret at
; the end of that function therefore uses whatever return address is on the
; stack when this label was reached; the differencing of lines 4 thru 7 is
; not done here.
;
; NOTE(review): the code that transfers control to this label is not in view.
; The entry state is assumed to be what Get4LinesOfPred_InterpBoth documents:
; ebp = PITCH, edi = target block, esi = reference block, mm5 preloaded from
; [esi+1], mm6 = 8 bytes of 0xFE.
NonOBMCDiff_Both:

  call       Get4LinesOfPred_InterpBoth

  movq       mm7,[edi]              ; T0
  psrlq      mm1,1                  ; O .   Finish P1 (left undone by callee).
  psubb      mm7,mm0                ; D0 = T0 - P0
  pand       mm2,mm6                ; .N.
  movq       mm0,[edi+ebp*1]        ; T1
  psrlq      mm2,1                  ; .O.   Finish P2.
  movq       PelDiffsLine0,mm7      ; Store D0.
  psubb      mm0,mm1                ; D1 = T1 - P1
  movq       mm7,[edi+ebp*2]        ; T2
  pand       mm3,mm6                ; . N
  movq       PelDiffsLine1,mm0      ; Store D1.
  psrlq      mm3,1                  ; . O   Finish P3.
  movq       mm1,[edi+PITCH*3]      ; T3
  psubb      mm7,mm2                ; D2 = T2 - P2
  psubb      mm1,mm3                ; D3 = T3 - P3
  movq       mm0,mm4                ; Carry line 4's frac bits into mm0 for next 4 lines.
  movq       PelDiffsLine2,mm7      ; Store D2.
  paddb      mm5,mm5                ; . .   Prepare for use for next 4 lines.
  movq       PelDiffsLine3,mm1      ; Store D3.
  pcmpeqb    mm7,mm7                ; Restore 8 bytes of -1 (mm7 was scratch above).
  jmp        Get4MoreLinesOfPred_InterpBoth
;===========================================================================
; Get4LinesOfPred_InterpBoth / Get4MoreLinesOfPred_InterpBoth
;
; Internal function to get 4 lines of prediction, interpolating in both
; directions.  The first instruction of the function (the load of mm5 from
; [esi+1]) is scheduled into the caller's space, and so is commented out
; here; the last six steps are likewise left to the caller (see the end of
; the function).  For 8 lines of prediction, the function is entered a second
; time, through the second entry point, after the outputs of the first call
; have been consumed.  Certain registers must remain intact to convey
; information from the first call to the second.
;
; Method:  each reference row is first summed horizontally exactly as in the
; horizontal-only case (steps A-F).  The row sum is then split into its low
; ("frac") bit per byte, extracted with pandn against mm6 (steps G,H), and
; the remaining bits, masked and halved (steps I,J).  Two vertically adjacent
; rows are combined as  half(row n) + half(row n+1) + (frac n AND row n+1)
; (steps K,L,M), and the result is masked and halved again (steps N,O).
; Producing 4 output lines reads 5 reference rows (esi through esi+4*ebp).
;
; The second entry point skips the horizontal work for the first row.  It
; expects mm0 to hold that row's frac bits and mm5 to hold its masked sum
; (the psrlq at step J is executed on both paths); these are the values left
; in mm4 and mm5 by the previous pass, which is why the caller must copy mm4
; to mm0 and double mm5 before re-entering.
;
; ebp -- PITCH
; edi -- Points to target block.  (Not referenced by this function.)
; esi -- Points to Upper left corner of 9*9 block that will be interpolated
;        horizontally and vertically to generate prediction.  Advanced by
;        4*ebp on exit.
; edx -- Reserved (MBlockActionStream)
; ecx -- Not in use
; ebx -- Will be used.
; eax -- Reserved.
; mm6 -- 8 bytes of 0xFE.
; mm7 -- 8 bytes of -1.  (Not referenced by this function.)
; mm0-mm5 -- Scratch

StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
Get4LinesOfPred_InterpBoth:
                                    ; 01234  Details for line 0

; movq       mm5,[esi+1]            ; A. .   <R08 R07 R06 R05 R04 R03 R02 R01>
  movq       mm1,mm5                ; B. .   <R08 R07 R06 R05 R04 R03 R02 R01>
  xor        ebx,ebx                ; . .    Clear ebx so bl can index Pel_Rnd.
  mov        bl,[esi]               ; C. .   R00
  psllq      mm5,8                  ; D. .   <R07 R06 R05 R04 R03 R02 R01 0>
  paddb      mm5,mm1                ; E. .   <R08+R07 ... R02+R01 R01>
  paddb      mm5,Pel_Rnd[ebx*8]     ; F. .   <R08+R07+1 ... R02+R01+1 R01+R00+1>
  movq       mm0,mm6                ; G. .   Mask to extract each pel's frac bit.
  pandn      mm0,mm5                ; H. .   <(R08+R07+1)&1 ...>
  pand       mm5,mm6                ; I. .   Pre-clean

Get4MoreLinesOfPred_InterpBoth:
                                    ; . .
  movq       mm2,[esi+ebp*1+1]      ; A .
  psrlq      mm5,1                  ; J. .   <(R08+R07+1)/2 ... (R01+R00+1)/2)>
  xor        ebx,ebx                ; . .    (Needed again for the second entry point.)
  movq       mm1,mm2                ; B .
  mov        bl,[esi+ebp*1]         ; C .
  psllq      mm2,8                  ; D .
  movq       mm3,[esi+ebp*2+1]      ; .A.
  paddb      mm2,mm1                ; E .
  paddb      mm2,Pel_Rnd[ebx*8]     ; F .
  movq       mm1,mm3                ; .B.
  mov        bl,[esi+ebp*2]         ; .C.
  psllq      mm3,8                  ; .D.
  movq       mm4,[esi+PITCH*3+1]    ; . A
  paddb      mm3,mm1                ; .E.
  paddb      mm3,Pel_Rnd[ebx*8]     ; .F.
  movq       mm1,mm4                ; . B
  mov        bl,[esi+PITCH*3]       ; . C
  pand       mm0,mm2                ; K. .   <(R08+R07+1)&(R18+R17+1)&1 ...>
  paddb      mm0,mm5                ; L. .   <(R08+R07+1+((R18+R17+1)&1))/2 ...>
  psllq      mm4,8                  ; . D
  movq       mm5,[esi+ebp*4+1]      ; . .A   Fifth row: bottom neighbor of line 3.
  paddb      mm4,mm1                ; . E
  paddb      mm4,Pel_Rnd[ebx*8]     ; . F
  movq       mm1,mm5                ; . .B
  mov        bl,[esi+ebp*4]         ; . .C
  psllq      mm5,8                  ; . .D
  paddb      mm5,mm1                ; . .E
  movq       mm1,mm6                ; G .
  pandn      mm1,mm2                ; H .
  pand       mm2,mm6                ; I .
  paddb      mm5,Pel_Rnd[ebx*8]     ; . .F
  psrlq      mm2,1                  ; J .
  paddb      mm0,mm2                ; M. .   <(R08+R07+R18+R17+2)/2 ...>
  pand       mm1,mm3                ; K .
  paddb      mm1,mm2                ; L .
  movq       mm2,mm6                ; .G.
  pandn      mm2,mm3                ; .H.
  pand       mm3,mm6                ; .I.
  pand       mm0,mm6                ; N. .   Pre-clean
  psrlq      mm3,1                  ; .J.
  paddb      mm1,mm3                ; M .
  pand       mm2,mm4                ; .K.
  paddb      mm2,mm3                ; .L.
  movq       mm3,mm6                ; . G
  pandn      mm3,mm4                ; . H
  pand       mm4,mm6                ; . I
  pand       mm3,mm5                ; . K
  psrlq      mm4,1                  ; . J
  paddb      mm2,mm4                ; .M.
  paddb      mm3,mm4                ; . L
  movq       mm4,mm6                ; . .G
  psrlq      mm0,1                  ; O. .   P0 = <(R08+R07+R18+R17+2)/4 ...>
  pandn      mm4,mm5                ; . .H   mm4 = frac bits of fifth row (carry-over).
  pand       mm5,mm6                ; . .I
  pand       mm1,mm6                ; N .
  psrlq      mm5,1                  ; . .J
  paddb      mm3,mm5                ; . M
  lea        esi,[esi+ebp*4]        ; Advance to next four lines.
; The remaining steps are left for the caller to interleave with its memory
; operations after the return:
; pand       mm2,mm6                ; .N.
; psrlq      mm1,1                  ; O .
; pand       mm3,mm6                ; . N
; psrlq      mm2,1                  ; .O.
; paddb      mm5,mm5                ; . .    Prepare for use for next 4 lines.
; psrlq      mm3,1                  ; . O
  ret
StackOffset TEXTEQU <4>
; The steps commented out above are scheduled into the mem-ops the caller has
; to do at the point of return.  As though these ops were done, the registers
; look as follows:
; mm0 -- Prediction for line 0.
; mm1 -- Prediction for line 1.
; mm2 -- Prediction for line 2.
; mm3 -- Prediction for line 3.
; mm4 -- Must be moved to mm0 before computing prediction for next 4 lines.
; mm5 -- Must be doubled before computing prediction for next 4 lines.
; mm6 -- 8 bytes of 0xFE.  Must be this when computing pred for next 4 lines.
; mm7 -- Not touched by this function (8 bytes of -1 on entry, per the
;        function header).
;=============================================================================
ENDIF
StackOffset TEXTEQU <0>
IFDEF H261
ELSE ;H263
;============================================================================
; OBMCDifferencing
;
; Reached for a macroblock whose differencing must be deferred.  Marks this
; macroblock as pending (PendingOBMC = 1) and, if the previous macroblock was
; itself pending, finishes that one now via DoPendingOBMCDiff.  Unless this
; is a plain P frame, the B-frame luma work is then done with edx temporarily
; stepped back by one T_MacroBlockActionDescr.  Every path ends at
; NextMacroBlock.
;
; edx -- MBlockActionStream (restored before leaving).
; al, bl, cl -- Scratch.  bl = 1 and cl = INTER1MV on the early exit.
OBMCDifferencing:

  mov        bl,1
  mov        cl,INTER1MV
  mov        al,PendingOBMC         ; Was the previous MB left pending?
  mov        PendingOBMC,bl         ; This MB is pending from now on.
  test       al,al
  je         NextMacroBlock         ; Nothing deferred from the previous MB.

  mov        StashBlockType,cl      ; Block type handed to the differencing code.
  call       DoPendingOBMCDiff

  mov        al,IsPlainPFrame
  test       al,al
  jne        NextMacroBlock         ; Plain P frame:  no B-frame work to do.

  pxor       mm7,mm7                ; Initialize SWD accumulator
  movq       mm6,C0101010101010101
  add        edx,-SIZEOF T_MacroBlockActionDescr ; Step back to the previous MB.
  call       MMxDoBFrameLumaBlocks

  sub        edx,-SIZEOF T_MacroBlockActionDescr ; Back to the current MB.
  jmp        NextMacroBlock

ENDIF
;============================================================================
; Calculate the IntraSWD
;
; For each of the four luma blocks of the macroblock, in the order
; blk 1, blk 4, blk 2, blk 3, a block average is formed and a sum of squared
; deviations from that average is computed.  After each block the result is
; subtracted from ebx; as soon as ebx is no longer positive the routine gives
; up and jumps to InterBestX.  If all four blocks complete with ebx still
; positive, control falls through to IntraBest.
;
; IntraByDecree is a second entry point that first loads ebx with a large
; value so that the early exit is unlikely to be taken.
;
; The average is taken over 32 of the block's 64 pels:  the even columns of
; the even rows and the odd columns of the odd rows.  The sum is scaled by 8
; (two pmaddwd steps, by -1 and by 0xFFF8) so that it is the average times
; 256, and is broadcast to all four words of mm0.
;
; cl holds the number of the block being processed.  The "cmp cl,2" near the
; end of the main loop body feeds both the jg that follows it and the je in
; the next paragraph:  none of the MMX instructions, nor the lea and mov in
; between, modify the flags.
;
; movdf is a macro defined elsewhere; presumably it copies the low doubleword
; of an MMX register to an integer register.
;
; ebp -- PITCH
; esi -- Accumulation for IntraSWD
; edi -- Address of target macroblock.
; edx -- MBlockActionStream
; ecx -- Scratch
; ebx -- Amount IntraSWD has to be less than to be the winner.
; eax -- Reserved.  Holds coded blk pattern, (except undef when IntraByDecree).
; mm7 -- SWD total for macroblock.
; mm6 -- Average pel value for block 1.
; mm5 -- Average pel value for block 2.
; mm4 -- Average pel value for block 3.
; mm3 -- Average pel value for block 4.
; mm0-mm2 Scratch
;
; NOTE(review): the mm3-mm7 roles listed above do not match the code below,
; which reloads mm1-mm7 with rows of the current block on every pass and
; reloads al from [edx].CodedBlocks.  Treat mm0-mm7 as scratch here.
;

IntraByDecree:

  mov        ebx,000080000H         ; Set Inter SWD artificially high.

CalculateIntraSWD:

  sub        ebx,INTRACODINGDIFFERENTIAL
  mov        cl,1                   ; Start with blk 1.
  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>
  pcmpeqb    mm5,mm5                ; All ones; becomes the 0x00FF word mask below.

ComputeIntraSWDForNextBlock:

  movq       mm2,[edi+ebp*2]        ; <P27 P26 P25 P24 P23 P22 P21 P20>
  psrlw      mm5,8                  ; Four words of 0x00FF.
  movq       mm4,[edi+ebp*4]
  paddw      mm0,mm2                ; <junk P06+P26 junk P04+P24 ...>
  movq       mm6,[edi+PITCH*6]
  pand       mm0,mm5                ; <P06+P26 P04+P24 P02+P22 P00+P20>
  movq       mm1,[edi+ebp*1]        ; <P17 P16 P15 P14 P13 P12 P11 P10>
  paddw      mm4,mm6
  movq       mm3,[edi+PITCH*3]      ; <P37 P36 P35 P34 P33 P32 P31 P30>
  pand       mm4,mm5
  movq       mm5,[edi+PITCH*5]
  paddw      mm1,mm3                ; <P17+P37 junk P15+P35 junk ...>
  movq       mm7,[edi+PITCH*7]
  psrlw      mm1,8                  ; <P17+P37 P15+P35 P13+P33 P11+P31>
  paddw      mm0,mm1
  paddw      mm5,mm7
  paddw      mm0,mm4
  psrlw      mm5,8
  paddw      mm0,mm5                ; Four partial sums, 8 pels each.
  pcmpeqw    mm5,mm5                ; Get words of -1
  movq       mm4,[edi+ebp*4]        ; Reload row 4 (mm4 was consumed above).
  pmaddwd    mm0,mm5                ; <SumHi = Sum3+Sum2 | SumLo = Sum1+Sum0>
  pcmpeqw    mm1,mm1
  psllw      mm3,8                  ; <P36 0 P34 0 P32 0 P30 0>
  movq       mm5,[edi+PITCH*5]      ; Reload row 5.
  psllw      mm1,3                  ; 4 words of 0xFFF8
  packssdw   mm0,mm0                ; <SumHi | SumLo | SumHi | SumLo>
  mov        al,[edx].CodedBlocks   ; Fetch coded block pattern.
  pmaddwd    mm0,mm1                ; <Sum = SumHi+SumLo | Sum = SumHi+SumLo>
  psllw      mm5,8
  movq       mm1,[edi+ebp*1]        ; Reload row 1.
  psllw      mm7,8
   ;
  psllw      mm1,8
   ;
  packssdw   mm0,mm0                ; <Sum | Sum | Sum | Sum>
  psubw      mm1,mm0                ; <P16-Avg frac P14-Avg frac ...>
  psubw      mm2,mm0                ; <P27-Avg frac P25-Avg frac ...>
  pmaddwd    mm1,mm1                ; Square of diff
  psubw      mm3,mm0
  pmaddwd    mm2,mm2
  psubw      mm4,mm0
  pmaddwd    mm3,mm3
  psubw      mm5,mm0
  pmaddwd    mm4,mm4
  psubw      mm6,mm0
  psubw      mm7,mm0
  paddusw    mm1,mm2
  psubw      mm0,[edi]              ; Row 0 is handled as Avg - row (same square).
  pmaddwd    mm5,mm5
  pmaddwd    mm6,mm6
  paddusw    mm1,mm3
  pmaddwd    mm7,mm7
  paddusw    mm1,mm4
  pmaddwd    mm0,mm0
  paddusw    mm1,mm5
  paddusw    mm1,mm6
  cmp        cl,2                   ; Flags consumed by jg below AND by je further on.
  paddusw    mm1,mm7
   ;
  paddusw    mm0,mm1
   ;
  punpckldq  mm1,mm0                ; Low half of mm0 moved up, to fold the two halves.
   ;
  paddusw    mm0,mm1
  jg         LowerBlkIntraDone      ; cl is 3 or 4.

  psrlq      mm0,48                 ; Keep only the top word of the folded sum.
  lea        edi,[edi+ebp*8+8]      ; Speculate going from blk 1 to blk 4
  mov        cl,4
  je         Blk2IntraDone          ; cl was 2 (flags still from cmp cl,2).

Blk1IntraDone:

  movdf      esi,mm0                ; First block starts the IntraSWD accumulation.
  sub        ebx,esi
  jle        InterBestX             ; Intra can no longer win.

  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>
  pcmpeqb    mm5,mm5
  jmp        ComputeIntraSWDForNextBlock

LowerBlkIntraDone:

  psrlq      mm0,48                 ; Keep only the top word of the folded sum.
  sub        edi,PITCH*8            ; Speculate going from blk 4 to blk 2
  cmp        cl,3
  je         Blk3IntraDone

Blk4IntraDone:

  movdf      ecx,mm0
  add        esi,ecx                ; Accumulate IntraSWD
  sub        ebx,ecx
  jle        InterBestX             ; Intra can no longer win.

  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>
  pcmpeqb    mm5,mm5
  mov        cl,2                   ; Next is blk 2.
  jmp        ComputeIntraSWDForNextBlock

Blk2IntraDone:

  movdf      ecx,mm0
  add        esi,ecx                ; Accumulate IntraSWD
  sub        edi,16                 ; Get to blk 3.
  sub        ebx,ecx
  jle        InterBestX             ; Intra can no longer win.

  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>
  pcmpeqb    mm5,mm5
  mov        cl,3                   ; Next is blk 3.
  jmp        ComputeIntraSWDForNextBlock

Blk3IntraDone:

  movdf      ecx,mm0
  add        esi,ecx                ; Accumulate IntraSWD
  sub        ebx,ecx
  jle        InterBestX             ; Intra can no longer win.
                                    ; Otherwise fall through:  intra wins.
;============================================================================
; IntraBest
;
; The macroblock is to be coded INTRA.  Records the decision in the
; macroblock action descriptor at edx (SWD, BlockType, zeroed motion vectors,
; coded block pattern with bit 7 cleared), adds the IntraSWD to SWDTotal, and
; then runs MMxDoForwardDCT on the four luma blocks of the target macroblock
; (offsets 0, 8, PITCH*8 and PITCH*8+8 from TargetMacroBlockBaseAddr).
;
; After each DCT call, bl (shifted into that block's bit position) is
; subtracted from the coded block pattern.  Presumably MMxDoForwardDCT returns
; bl = 1 when the block turned out empty -- that routine is not in view.
;
; H263 only:  if the previous macroblock is still waiting for its OBMC
; differencing (PendingOBMC == 1) it is finished first, and for frames that
; are not plain P frames MMxDoBFrameLumaBlocks is called, both for that
; previous macroblock and, after the DCTs, for this one.
;
; On entry:
; esi -- IntraSWD for this macroblock.
; edx -- MBlockActionStream
; al  -- Coded block pattern.
; Exits to NextMacroBlock.

IntraBest:

  mov        ecx,SWDTotal
  and        al,07FH                ; Turn off FORCE-INTRA bit.
  mov        [edx].SWD,esi
  add        ecx,esi                ; Add to total.
  mov        SWDTotal,ecx
  mov        cl,INTRA
  mov        [edx].BlockType,cl     ; Indicate macroblock handling decision.
  xor        ecx,ecx
  mov        [edx].BlkY1.MVs,ecx    ; Zero the motion vectors of all four blocks.
  mov        [edx].BlkY2.MVs,ecx
  mov        [edx].BlkY3.MVs,ecx
  mov        [edx].BlkY4.MVs,ecx
  mov        [edx].CodedBlocks,al

IFDEF H261
ELSE ;H263
  mov        al,PendingOBMC         ; Do Prev MB if it needs to be OBMC'ed.
  mov        [edx].BestFullPelMBHMV,cl ; Kill MVs so extended EMV of other
  ;                                    ; blocks will work right.
  dec        al                     ; ZF set iff PendingOBMC was 1.
  mov        [edx].BestFullPelMBVMV,cl ; (mov leaves the flags from dec intact.)
  jne        @f

  mov        PendingOBMC,al         ; Go on to next MB, unless the prev MB
  ;                                 ; needs to be finished (OBMC).
  mov        cl,INTER1MV
  mov        StashBlockType,cl

  call       DoPendingOBMCDiff

  mov        al,IsPlainPFrame
  test       al,al
  jne        @f

  add        edx,-SIZEOF T_MacroBlockActionDescr ; Step back to the previous MB.
  movq       mm6,C0101010101010101
  pxor       mm7,mm7                ; Initialize SWD accumulator

  call       MMxDoBFrameLumaBlocks

  sub        edx,-SIZEOF T_MacroBlockActionDescr ; Back to the current MB.

@@:

ENDIF

  mov        cl,INTRA
  mov        esi,TargetMacroBlockBaseAddr ; Blk 1.
  mov        StashBlockType,cl
  push       eax                    ; Adjust stack pointer
StackOffset TEXTEQU <4>
  call       MMxDoForwardDCT
  mov        al,[edx].CodedBlocks
  mov        esi,TargetMacroBlockBaseAddr
  sub        al,bl                  ; Adjust bit 0 of the pattern by blk 1's result.
  add        esi,8                  ; Blk 2.
  mov        [edx].CodedBlocks,al
  call       MMxDoForwardDCT
  shl        bl,1
  mov        al,[edx].CodedBlocks
  sub        al,bl                  ; Adjust bit 1 by blk 2's result.
  mov        esi,TargetMacroBlockBaseAddr
  mov        [edx].CodedBlocks,al
  add        esi,PITCH*8            ; Blk 3.
  call       MMxDoForwardDCT
  shl        bl,2
  mov        al,[edx].CodedBlocks
  sub        al,bl                  ; Adjust bit 2 by blk 3's result.
  mov        esi,TargetMacroBlockBaseAddr
  mov        [edx].CodedBlocks,al
  add        esi,PITCH*8+8          ; Blk 4.
  call       MMxDoForwardDCT
  shl        bl,3
  mov        al,[edx].CodedBlocks
  sub        al,bl                  ; Adjust bit 3 by blk 4's result.
  pop        edi                    ; Adjust stack pointer
StackOffset TEXTEQU <0>
  mov        [edx].CodedBlocks,al
IFDEF H261
ELSE
  mov        al,IsPlainPFrame
  test       al,al
  jne        NextMacroBlock

  movq       mm6,C0101010101010101
  pxor       mm7,mm7                ; Initialize SWD accumulator

  call       MMxDoBFrameLumaBlocks
ENDIF

  jmp        NextMacroBlock
IFDEF H261
ELSE; H263
StackOffset TEXTEQU <4>
DoPendingOBMCDiff:  ; Internal function

;============================================================================
; Perform differencing for the non-empty luma blocks of an Inter-coded
; macroblock.  This is the OBMC case;  i.e. Advanced Prediction is selected.
;
; The macroblock worked on is the one BEFORE the descriptor edx points at
; (see PrevMBAD below).  For each of its four luma blocks whose bit is set in
; CodedBlocks, the distances (in bytes, relative to the block's own action
; descriptor) to the blocks supplying the left, right, above and below remote
; motion vectors are set up and DoOBMCForBlock is called.  At the frame edges
; the distances come from the BlockToLeft, BlockToRight and BlockAbove
; tables, indexed by the edge bits of MBEdgeType (bit 0 left, bit 1 right,
; bit 2 top).  For blocks 3 and 4 the "below" distance is zero, i.e. the
; block's own motion vector is used.
;
; edx is advanced by SIZEOF T_Blk after each block, so that PrevMBAD.BlkY1
; always addresses the block being processed and macroblock-level fields are
; reached with a negative T_Blk multiple.  edx is restored before returning.
;
; After each call bl, shifted into the block's bit position, is subtracted
; from CodedBlocks.  Presumably bl is 1 when the block turned out empty; the
; code that sets it is not in view.
;
; Register interface to DoOBMCForBlock:
; esi -- Motion vectors of the block.
; ecx -- Distance to the block supplying the left remote MV.
; eax -- Distance to the block supplying the right remote MV.
; DistToBADforBlockAbove, DistToBADforBlockBelow -- likewise, in memory.
; mm6 -- 8 bytes of 0xFE.
; mm7 -- 8 bytes of -1.

PrevMBAD EQU [edx-SIZEOF T_MacroBlockActionDescr]

  pcmpeqb    mm6,mm6
  pcmpeqb    mm7,mm7                     ; 8 bytes of -1
  paddb      mm6,mm6                     ; 8 bytes of 0xFE
  mov        al,PrevMBAD.CodedBlocks     ; Bits 0- 3  set for non-empty Y blks.
  test       al,1                        ; Check if block 1 empty.
  je         OBMCDoneForBlock1

  xor        ebx,ebx
  mov        eax,SIZEOF T_Blk            ; Blk to right is blk 2 of this MB.
  mov        bl,PrevMBAD.MBEdgeType
  mov        ecx,1                       ; Mask to extract left edge indicator.
  and        ecx,ebx                     ; Extract left edge indicator.
  and        ebx,4                       ; Extract top edge indicator.
  mov        esi,PrevMBAD.BlkY1.MVs
  lea        edi,[eax*2]                 ; Blk below is blk 3 of this MB.
  mov        DistToBADforBlockBelow,edi  ; Stash BAD offset for lower remote MV.
  mov        edi,BlockAbove[ebx]         ; Blk above is blk 3 of mb above, or off
  ;                                      ; upper edge.
  mov        ecx,BlockToLeft[ecx*4]      ; Blk to left is blk 2 of mb to the
  ;                                      ; left, or off left edge.
  mov        DistToBADforBlockAbove,edi
  call       DoOBMCForBlock
  mov        al,PrevMBAD.CodedBlocks     ; Bits 0- 3  set for non-empty Y blks.
  sub        al,bl                       ; Adjust bit 0 by blk 1's result.
  mov        PrevMBAD.CodedBlocks,al

OBMCDoneForBlock1:

  add        edx,SIZEOF T_Blk            ; PrevMBAD.BlkY1 now addresses blk 2.
  test       al,2                        ; Check if block 2 empty.
  je         OBMCDoneForBlock2

  xor        ebx,ebx
  mov        eax,2                       ; Mask to extract right edge indicator.
  mov        bl,PrevMBAD[-SIZEOF T_Blk].MBEdgeType
  mov        edi,2*SIZEOF T_Blk          ; Blk below is blk 4 of this MB.
  and        eax,ebx                     ; Extract right edge indicator.
  and        ebx,4                       ; Extract top edge indicator.
  mov        DistToBADforBlockBelow,edi  ; Stash BAD offset for lower remote MV.
  lea        ecx,[edi-3*SIZEOF T_Blk]    ; Blk to left is blk 1 of this MB.
  mov        eax,BlockToRight[eax*2]     ; Blk to right is blk 1 of mb to the
  ;                                      ; right, or off right edge.
  mov        edi,BlockAbove[ebx]         ; Blk above is blk 4 of mb above, or off
  ;                                      ; upper edge.
  mov        esi,PrevMBAD.BlkY1.MVs
  mov        DistToBADforBlockAbove,edi
  call       DoOBMCForBlock
  shl        bl,1
  mov        al,PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks
  sub        al,bl                       ; Adjust bit 1 by blk 2's result.
  mov        PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks,al

OBMCDoneForBlock2:

  add        edx,SIZEOF T_Blk            ; PrevMBAD.BlkY1 now addresses blk 3.
  test       al,4                        ; Check if block 3 empty.
  je         OBMCDoneForBlock3

  xor        ecx,ecx
  xor        ebx,ebx                     ; Blk below is this block.
  mov        cl,PrevMBAD[-2*SIZEOF T_Blk].MBEdgeType
  mov        eax,SIZEOF T_Blk            ; Blk to right is blk 4 of this MB.
  and        ecx,1                       ; Extract left edge indicator.
  mov        DistToBADforBlockBelow,ebx  ; Stash BAD offset for lower remote MV.
  lea        edi,[eax-3*SIZEOF T_Blk]    ; Blk above is blk 1 of this MB.
  mov        esi,PrevMBAD.BlkY1.MVs
  mov        DistToBADforBlockAbove,edi
  mov        ecx,BlockToLeft[ecx*4]      ; Blk to left is blk 4 of mb to the
  ;                                      ; left, or off left edge.
  ;                                      ; NOTE(review): this comment used to
  ;                                      ; say blk 1; the table is the one used
  ;                                      ; for blk 1 (which reaches blk 2 of
  ;                                      ; the left mb), so from blk 3 the same
  ;                                      ; relative offset reaches blk 4.
  call       DoOBMCForBlock
  shl        bl,2
  mov        al,PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks
  sub        al,bl                       ; Adjust bit 2 by blk 3's result.
  mov        PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks,al

OBMCDoneForBlock3:

  add        edx,SIZEOF T_Blk            ; PrevMBAD.BlkY1 now addresses blk 4.
  test       al,8                        ; Check if block 4 empty.
  je         OBMCDoneForBlock4

  xor        eax,eax
  xor        ebx,ebx                     ; Blk below is this block.
  mov        al,PrevMBAD[-3*SIZEOF T_Blk].MBEdgeType
  mov        ecx,-SIZEOF T_Blk           ; Blk to left is blk 3 of this MB.
  and        eax,2                       ; Extract right edge indicator.
  mov        DistToBADforBlockBelow,ebx  ; Stash BAD offset for lower remote MV.
  lea        edi,[ecx*2]                 ; Blk above is blk 2 of this MB.
  mov        esi,PrevMBAD.BlkY1.MVs
  mov        DistToBADforBlockAbove,edi
  mov        eax,BlockToRight[eax*2]     ; Blk to right is blk 3 of mb to the
  ;                                      ; right, or off right edge.
  ;                                      ; NOTE(review): this comment used to
  ;                                      ; say blk 1; the table is the one used
  ;                                      ; for blk 2 (which reaches blk 1 of
  ;                                      ; the right mb), so from blk 4 the
  ;                                      ; same relative offset reaches blk 3.
  call       DoOBMCForBlock
  shl        bl,3
  mov        al,PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks
  sub        al,bl                       ; Adjust bit 3 by blk 4's result.
  mov        PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks,al

OBMCDoneForBlock4:

  sub        edx,3*SIZEOF T_Blk  ; Get back to MacroBlock Action Descriptor
  ret
StackOffset TEXTEQU <8> DoOBMCForBlock: ; Internal Function
; Present register contents.
;  ebp -- PITCH
;  esi -- Motion vectors for current block.
;  ecx -- Distance from BAD of blk we're doing to BAD for block that provides
;         remote MV from left.
;  eax -- Distance from BAD of blk we're doing to BAD for block that provides
;         remote MV from right.
;  edx -- MBlockActionStream, adjusted to reach BAD of blk we are doing OBMC to.
;  mm7 -- 8 bytes of -1.
;  mm6 -- 8 bytes of 0xFE.
;
; In the body of this code:
;
;  edx -- Unchanged.
;  edi -- Saved to memory.  Then used for address of destination for storing
;         remote prediction blocks.
;  ebp -- PITCH.
;  esi -- Pointer to 8*8, 8*9, 9*8, or 9*9 remote reference areas, which are
;         then interpolated and stored at edi.
;  ecx, eax -- Inputs are used, then these are scratch.
;  ebx -- Scratch
;  mm7 -- 8 bytes of -1
;  mm6 -- 8 bytes of 0xFE
;  mm0-mm5 -- Scratch
; Compute left remote prediction block.
lea edi,PrevMBAD[ecx] and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to left. lea ebx,CentralPred mov AddrOfLeftPred,ebx ; Speculate that left remote MV == center MV. mov AddrOfRightPred,ebx ; Speculate that right remote MV == center MV. mov bl,[edi].BlockType cmp bl,INTRA je LeftEqCtr ; Jump if INTRA. (Use central)
mov ebx,PrevMBAD[ecx].BlkY1.MVs and ebx,00000FFFFH ; Blk to left may have B MVs set. Clear them. cmp esi,ebx je LeftEqCtr
mov edi,PrevMBAD[ecx].BlkY1.BlkOffset mov esi,PrevMBAD[ecx].BlkY1.PastRef ; Get ref addr using left remote. sub esi,edi mov edi,PrevMBAD.BlkY1.BlkOffset add esi,edi lea edi,LeftPred
call GetPredForCenterLeftOrRight
pand mm2,mm6 psrlq mm1,1 movq [edi+32],mm0 psrlq mm2,1 movq [edi+40],mm1 pand mm3,mm6 movq [edi+48],mm2 psrlq mm3,1 lea ecx,PrevMBAD[eax] and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right. mov esi,PrevMBAD.BlkY1.MVs movq [edi+56],mm3 pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
; Compute right remote prediction block.
mov AddrOfLeftPred,edi mov bl,[ecx].BlockType cmp bl,INTRA je RightEqCtrButLeftNeCtr ; Jump if INTRA.(Use central)
mov ebx,PrevMBAD[eax].BlkY1.MVs cmp esi,ebx je RightEqCtrButLeftNeCtr
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote. mov edi,PrevMBAD[eax].BlkY1.BlkOffset
RightNeCtr:
sub esi,edi mov edi,PrevMBAD.BlkY1.BlkOffset add esi,edi lea edi,RightPred
call GetPredForCenterLeftOrRight
pand mm2,mm6 psrlq mm1,1 movq [edi+32],mm0 psrlq mm2,1 movq [edi+40],mm1 pand mm3,mm6 movq [edi+48],mm2 psrlq mm3,1 mov AddrOfRightPred,edi ; movq [edi+56],mm3 pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
RightEqCtrButLeftNeCtr:
; Compute central prediction block.
mov ebx,PrevMBAD.BlkY1.MVs mov esi,PrevMBAD.BlkY1.PastRef lea edi,CentralPred mov eax,DistToBADforBlockBelow
call GetPredForCenterLeftOrRight
pand mm2,mm6 psrlq mm1,1 movq [edi+32],mm0 psrlq mm2,1 movq [edi+40],mm1 pand mm3,mm6 movq [edi+48],mm2 psrlq mm3,1 lea ecx,PrevMBAD[eax] and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below. mov esi,PrevMBAD.BlkY1.MVs movq [edi+56],mm3 pcmpeqb mm7,mm7 mov bl,[ecx].BlockType mov ecx,PrevMBAD.BlkY1.BlkOffset cmp bl,INTRA je BelowEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
; Compute bottom remote prediction block.
mov ebx,PrevMBAD[eax].BlkY1.MVs mov edi,AddrOfLeftPred cmp esi,ebx jne BelowNeCtr
BelowEqCtrButSidesDiffer:
paddb mm1,mm1 ; Prep mm0-3, which have ctr, for reuse below. paddb mm2,mm2 paddb mm3,mm3 mov edi,AddrOfLeftPred jmp BelowEqCtr
BelowNeCtr:
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote. mov eax,PrevMBAD[eax].BlkY1.BlkOffset sub esi,eax lea eax,[ecx+ebp*4]
call GetPredForAboveOrBelow
BelowEqCtr:
; Compute difference for lines 4 thru 7. ; Lines 4 and 5: Cols 0,1,6, and 7 treated same. Cols 2-5 treated same.
mov esi,AddrOfRightPred mov ebx,TargetFrameBaseAddress movdt mm5,[edi+48] ; 6B: < 0 0 0 0 R63 R62 R61 R60> pand mm2,mm6 punpckldq mm5,[esi+48+4] ; 6C: <L67 L66 L65 L64 R63 R62 R61 R60> pand mm3,mm6 movq mm4,CFFFF00000000FFFF ; 6D: < FF FF 00 00 00 00 FF FF> psrlq mm2,1 ; 6A: <B67 B66 B65 B64 B63 B62 B61 B60> pand mm4,mm5 ; 6E: <L67 L66 00 00 00 00 R61 R60> paddb mm5,mm2 ; 6F: <B67+L67 ... B65+L65 ...>
pand mm2,C0000FFFFFFFF0000 ; 6G: < 00 00 B65 B64 B63 B62 00 00> psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50> paddb mm2,mm4 ; 6H: <L67 L66 B65 B64 B63 B62 R61 R60> add ecx,ebx ; Address of target block. movdt mm4,[edi+56] ; 7B: < 0 0 0 0 R73 R72 R71 R70> psubb mm5,mm2 ; 6I: <B67 B66 L65 L64 R63 R62 B61 B60> paddb mm5,CentralPred+48 ; 6J: <C67+B67 ... C65+L65 ...> psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70> punpckldq mm4,[esi+56+4] ; 7C: <L77 L76 L75 L74 R73 R72 R71 R70> pand mm5,mm6 ; 6K: <C67+B67 ... C65+L65 ...> pre-cleaned mov eax,DistToBADforBlockAbove psrlq mm5,1 ; 6L: <(C67+B67)/2 ... (C65+L65)/2 ...> paddb mm2,mm5 ; 6M: <(C67+B67+2L67)/2 ... ; ; (C65+2B65+L65)/2 ...> lea ebx,PelDiffs movq mm5,CFF000000000000FF ; 7D: < FF 00 00 00 00 00 00 FF> pand mm2,mm6 ; 6N: pre-cleaned pandn mm5,CentralPred+56 ; 7E: < 00 C76 C75 C74 C73 C72 C71 00> psrlq mm2,1 ; 6O: <(C67+B67+2L67)/4 ... ; ; (C65+2B65+L65)/4 ...> paddb mm2,CentralPred+48 ; 6P: <(5C67+B67+2L67)/4 ... ; ; (5C65+2B65+L65)/4 ...> paddb mm5,mm4 ; 7F: <L77 C76+L76 ...> pand mm4,CFF000000000000FF ; 7G: <L77 00 00 00 00 00 00 L70> psubb mm2,mm7 ; 6Q: <(5C67+B67+2L67+4)/4 ... ; ; (5C65+2B65+L65+4)/4 ...> paddb mm4,mm5 ; 7H: <2L77 C76+L76 ...> pand mm2,mm6 ; 6R: pre-cleaned movq mm5,[ecx+PITCH*6] ; 6T: T6 psrlq mm2,1 ; 6S: P6 = <(5C67+B67+2L67+4)/8 ... ; ; (5C65+2B65+L65+4)/8 ...> psubb mm5,mm2 ; 6U: D6 = T6 - P6 ; ; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0 movdt mm2,[edi+32] ; 4B: < 0 0 0 0 R43 R42 R41 R40> pand mm4,mm6 ; 7I: <2L77 C76+L76 ...> pre-cleaned movq [ebx+6*16],mm5 ; 6V: Store D6. 
psrlq mm4,1 ; 7J: <2L77/2 (C76+L76)/2 ...> punpckldq mm2,[esi+32+4] ; 4C: <L47 L46 L45 L44 R43 R42 R41 R40> paddb mm3,mm4 ; 7K: <(2B77+2L77)/2 (C76+2B76+L76)/2 ...> movq mm5,CFFFF00000000FFFF ; 4D: < FF FF 00 00 00 00 FF FF> pand mm3,mm6 ; 7L: pre-cleaned movq mm4,CentralPred+32 ; 4E: <C47 C46 C45 C44 C43 C42 C41 C40> psrlq mm3,1 ; 7M: <(2B77+2L77)/4 (C76+2B76+L76)/4 ...> paddb mm3,CentralPred+56 ; 7N: <(4C77+2B77+2L77)/4 ; ; (5C76+2B76+L76)/4 ...> pand mm5,mm4 ; 4F: <C47 C46 00 00 00 00 C41 C40> psubb mm3,mm7 ; 7O: <(4C77+2B77+2L77+4)/4 ; ; (5C76+2B76+L76+4)/4 ...> paddb mm4,mm2 ; 4G: <C47+L47 ... C45+L45 ...> pand mm2,C0000FFFFFFFF0000 ; 4H: < 00 00 L45 L44 R43 R42 00 00> pand mm3,mm6 ; 7P: <(4C77+2B77+2L77+4)/4 ; ; (5C76+2B76+L76+4)/4 ...> pre-cleaned paddb mm2,mm5 ; 4I: <C47 C46 L45 L44 R43 R42 C41 C40> psrlq mm3,1 ; 7Q: P7 = <(4C77+2B77+2L77+4)/8 ; ; (5C76+2B76+L76+4)/8 ...> movdt mm5,[edi+40] ; 5B: < 0 0 0 0 R53 R52 R51 R50> psubb mm4,mm2 ; 4J: <L47 L46 C45 C44 C43 C42 R41 R40> punpckldq mm5,[esi+40+4] ; 5C: <L57 L56 L55 L54 R53 R52 R51 R50> paddb mm0,mm2 ; 4K: <C47+B47 ... B45+L45 ...> movq mm2,[ecx+PITCH*7] ; 7R: T7 pand mm0,mm6 ; 4L: <C47+B47 ... B45+L45 ...> pre-cleaned psubb mm2,mm3 ; 7S: D7 = T7 - P7 psrlq mm0,1 ; 4M: <(C47+B47)/2 ... (B45+L45)/2 ...> movq mm3,CFFFF00000000FFFF ; 5D: < FF FF 00 00 00 00 FF FF> paddb mm0,mm4 ; 4N: <(C47+B47+2L47)/2 ... ; ; (2C45+B45+L45)/2 ...> movq mm4,CentralPred+40 ; 5E: <C57 C56 C55 C54 C53 C52 C51 C50> pand mm0,mm6 ; 4O: pre-cleaned pand mm3,mm4 ; 5F: <C57 C56 00 00 00 00 C51 C50> paddb mm4,mm5 ; 5G: <C57+L57 ... C55+L55 ...> pand mm5,C0000FFFFFFFF0000 ; 5H: < 00 00 L55 L54 R53 R52 00 00> psrlq mm0,1 ; 4P: <(C47+B47+2L47)/4 ... ; ; (2C45+B45+L45)/4 ...> paddb mm0,CentralPred+32 ; 4Q: <(5C47+B47+2L47)/4 ... ; ; (6C45+B45+L45)/4 ...> paddb mm5,mm3 ; 5I: <C57 C56 L55 L54 R53 R52 C51 C50> psubb mm4,mm5 ; 5J: <L57 L56 C55 C54 C53 C52 R51 R50> paddb mm1,mm5 ; 5K: <C57+B57 ... B55+L55 ...> pand mm1,mm6 ; 5L: <C57+B57 ... 
B55+L55 ...> pre-cleaned psubb mm0,mm7 ; 4R: <(5C47+B47+2L47+4)/4 ... ; ; (6C45+B45+L45+4)/4 ...> pand mm0,mm6 ; 4S: pre-cleaned psrlq mm1,1 ; 5M: <(C57+B57)/2 ... (B55+L55)/2 ...> paddb mm1,mm4 ; 5N: <(C57+B57+2L57)/2 ... ; ; (2C55+B55+L55)/2 ...> psrlq mm0,1 ; 4T: P4 = <(5C47+B47+2L47+4)/8 ... ; ; (6C45+B45+L45+4)/8 ...> movq mm3,[ecx+PITCH*5] ; 5U: T5 pand mm1,mm6 ; 5O: pre-cleaned movq mm4,[ecx+ebp*4] ; 4U: T4 psrlq mm1,1 ; 5P: <(C57+B57+2L57)/4 ... ; ; (2C55+B55+L55)/4 ...> paddb mm1,CentralPred+40 ; 5Q: <(5C57+B57+2L57)/4 ... ; ; (6C55+B55+L55)/4 ...> psubb mm4,mm0 ; 4V: D4 = T4 - P4 lea esi,PrevMBAD[eax] psubb mm1,mm7 ; 5R: <(5C57+B57+2L57+4)/4 ... ; ; (6C55+B55+L55+4)/4 ...> and esi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above. pand mm1,mm6 ; 5S: pre-cleaned movq [ebx+7*16],mm2 ; 7T psrlq mm1,1 ; 5T: P5 = <(5C57+B57+2L57+4)/8 ... ; ; (6C55+B55+L55+4)/8 ...> movq [ebx+4*16],mm4 ; 4W: Store D4. psubb mm3,mm1 ; 5V: D5 = T5 - P5 mov cl,[esi].BlockType ; Bottom bit set if above neighbor is INTRA. mov esi,PrevMBAD.BlkY1.MVs movq [ebx+5*16],mm3 ; 5W: Store D5. cmp cl,INTRA je AboveEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
; Compute top remote prediction block.
mov ebx,PrevMBAD[eax].BlkY1.MVs and ebx,00000FFFFH ; Blk above may have B MVs set. Clear them. mov ecx,PrevMBAD.BlkY1.BlkOffset cmp esi,ebx jne AboveNeCtr
AboveEqCtrButSidesDiffer:
movq mm3,CentralPred+24 ; Prep mm0-3, which have ctr, for reuse below. movq mm2,CentralPred+16 paddb mm3,mm3 movq mm1,CentralPred+8 paddb mm2,mm2 movq mm0,CentralPred paddb mm1,mm1 mov ecx,PrevMBAD.BlkY1.BlkOffset jmp AboveEqCtr
AboveNeCtr:
mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote. mov eax,PrevMBAD[eax].BlkY1.BlkOffset sub esi,eax mov eax,ecx
call GetPredForAboveOrBelow
AboveEqCtr:
; Compute difference for lines 0 thru 3.
mov esi,AddrOfRightPred mov ebx,TargetFrameBaseAddress movdt mm5,[edi+8] ; 1B: < 0 0 0 0 R13 R12 R11 R10> psrlq mm1,1 ; 1A: <A17 A16 A15 A14 A13 A12 A11 A10> punpckldq mm5,[esi+8+4] ; 1C: <L17 L16 L15 L14 R13 R12 R11 R10> pand mm3,mm6 movq mm4,CFFFF00000000FFFF ; 1D: < FF FF 00 00 00 00 FF FF> psrlq mm3,1 ; 3A: <A37 A36 A35 A34 A33 A32 A31 A30>: mm0 pand mm4,mm5 ; 1E: <L17 L16 00 00 00 00 R11 R10> paddb mm5,mm1 ; 1F: <A17+L17 ... A15+L15 ...> pand mm1,C0000FFFFFFFF0000 ; 1G: < 00 00 A15 A14 A13 A12 00 00> pand mm2,mm6 paddb mm5,CentralPred+8 ; 1H: <C17+A17+L17 ... C15+A15+L15 ...> paddb mm1,mm4 ; 1I: <L17 L16 A15 A14 A13 A12 R11 R10> ; 0A: <A07 A06 A05 A04 A03 A02 A01 A00>:mm0 movdt mm4,[edi] ; 0B: < 0 0 0 0 R03 R02 R01 R00> psubb mm5,mm1 ; 1J: <C17+A17 ... C15+L15 ...> punpckldq mm4,[esi+4] ; 0C: <L07 L06 L05 L04 R03 R02 R01 R00> pand mm5,mm6 ; 1K: <C17+A17 ... C15+L15 ...> pre-cleaned add ecx,ebx ; Address of target block. psrlq mm5,1 ; 1L: <(C17+A17)/2 ... (C15+L15)/2 ...> paddb mm1,mm5 ; 1M: <(C17+A17+2L17)/2 ... ; ; (C15+2A15+L15)/2 ...> psrlq mm2,1 ; 2A: <A27 A26 A25 A24 A23 A22 A21 A20> movq mm5,CFF000000000000FF ; 0D: < FF 00 00 00 00 00 00 FF> pand mm1,mm6 ; 1N: pre-cleaned pandn mm5,CentralPred ; 0E: < 00 C06 C05 C04 C03 C02 C01 00> psrlq mm1,1 ; 1O: <(C17+A17+2L17)/4 ... ; ; (C15+2A15+L15)/4 ...> paddb mm1,CentralPred+8 ; 1P: <(5C17+A17+2L17)/4 ... ; ; (5C15+2A15+L15)/4 ...> paddb mm5,mm4 ; 0F: <L07 C06+L06 ...> pand mm4,CFF000000000000FF ; 0G: <L07 00 00 00 00 00 00 L00> psubb mm1,mm7 ; 1Q: <(5C17+A17+2L17+4)/4 ... ; ; (5C15+2A15+L15+4)/4 ...> paddb mm4,mm5 ; 0H: <2L07 C06+L06 ...> pand mm1,mm6 ; 1R: pre-cleaned movq mm5,[ecx+ebp*1] ; 1T: T1 psrlq mm1,1 ; 1S: P1 = <(5C17+A17+2L17+4)/8 ... ; ; (5C15+2A15+L15+4)/8 ...> psubb mm5,mm1 ; 1U: D1 = T1 - P1 ; movdt mm1,[edi+24] ; 3B: < 0 0 0 0 R33 R32 R31 R30> pand mm4,mm6 ; 0I: <2L07 C06+L06 ...> pre-cleaned movq PelDiffsLine1,mm5 ; 1V: Store D1. 
psrlq mm4,1 ; 0J: <2L07/2 (C06+L06)/2 ...> punpckldq mm1,[esi+24+4] ; 3C: <L37 L36 L35 L34 R33 R32 R31 R30> paddb mm0,mm4 ; 0K: <(2A07+2L07)/2 (C06+2A06+L06)/2 ...> movq mm5,CFFFF00000000FFFF ; 3D: < FF FF 00 00 00 00 FF FF> pand mm0,mm6 ; 0L: pre-cleaned movq mm4,CentralPred+24 ; 3E: <C37 C36 C35 C34 C33 C32 C31 C30> psrlq mm0,1 ; 0M: <(2A07+2L07)/4 (C06+2A06+L06)/4 ...> paddb mm0,CentralPred ; 0N: <(4C07+2A07+2L07)/4 ; ; (5C06+2A06+L06)/4 ...> pand mm5,mm4 ; 3F: <C37 C36 00 00 00 00 C31 C30> psubb mm0,mm7 ; 0O: <(4C07+2A07+2L07+4)/4 ; ; (5C06+2A06+L06+4)/4 ...> paddb mm4,mm1 ; 3G: <C37+L37 ... C35+L35 ...> pand mm1,C0000FFFFFFFF0000 ; 3H: < 00 00 L35 L34 R33 R32 00 00> pand mm0,mm6 ; 0P: <(4C07+2A07+2L07+4)/4 ; ; (5C06+2A06+L06+4)/4 ...> pre-cleaned paddb mm1,mm5 ; 3I: <C37 C36 L35 L34 R33 R32 C31 C30> psrlq mm0,1 ; 0Q: P0 = <(4C07+2A07+2L07+4)/8 ; ; (5C06+2A06+L06+4)/8 ...> movdt mm5,[edi+16] ; 2B: < 0 0 0 0 R23 R22 R21 R20> psubb mm4,mm1 ; 3J: <L37 L36 C35 C34 C33 C32 R31 R30> punpckldq mm5,[esi+16+4] ; 2C: <L27 L26 L25 L24 R23 R22 R21 R20> paddb mm3,mm1 ; 3K: <C37+A37 ... A35+L35 ...> movq mm1,[ecx] ; 0R: T0 pand mm3,mm6 ; 3L: <C37+A37 ... A35+L35 ...> pre-cleaned psubb mm1,mm0 ; 0S: D0 = T0 - P0 psrlq mm3,1 ; 3M: <(C37+A37)/2 ... (A35+L35)/2 ...> movq mm0,CFFFF00000000FFFF ; 2D: < FF FF 00 00 00 00 FF FF> paddb mm3,mm4 ; 3N: <(C37+A37+2L37)/2 ... ; ; (2C35+A35+L35)/2 ...> movq mm4,CentralPred+16 ; 2E: <C27 C26 C25 C24 C23 C22 C21 C20> pand mm3,mm6 ; 3O: pre-cleaned pand mm0,mm4 ; 2F: <C27 C26 00 00 00 00 C21 C20> paddb mm4,mm5 ; 2G: <C27+L27 ... C25+L25 ...> pand mm5,C0000FFFFFFFF0000 ; 2H: < 00 00 L25 L24 R23 R22 00 00> psrlq mm3,1 ; 3P: <(C37+A37+2L37)/4 ... ; ; (2C35+A35+L35)/4 ...> paddb mm3,CentralPred+24 ; 3Q: <(5C37+A37+2L37)/4 ... ; ; (6C35+A35+L35)/4 ...> paddb mm5,mm0 ; 2I: <C27 C26 L25 L24 R23 R22 C21 C20> psubb mm4,mm5 ; 2J: <L27 L26 C25 C24 C23 C22 R21 R20> paddb mm2,mm5 ; 2K: <C27+A27 ... A25+L25 ...> pand mm2,mm6 ; 2L: <C27+A27 ... 
A25+L25 ...> pre-cleaned psubb mm3,mm7 ; 3R: <(5C37+A37+2L37+4)/4 ... ; ; (6C35+A35+L35+4)/4 ...> pand mm3,mm6 ; 3S: pre-cleaned psrlq mm2,1 ; 2M: <(C27+A27)/2 ... (A25+L25)/2 ...> paddb mm2,mm4 ; 2N: <(C27+A27+2L27)/2 ... ; ; (2C25+A25+L25)/2 ...> psrlq mm3,1 ; 3T: P3 = <(5C37+A37+2L37+4)/8 ... ; ; (6C35+A35+L35+4)/8 ...> movq mm0,[ecx+ebp*2] ; 2U: T2 pand mm2,mm6 ; 2O: pre-cleaned movq mm4,[ecx+PITCH*3] ; 3U: T3 psrlq mm2,1 ; 2P: <(C27+A27+2L27)/4 ... ; ; (2C25+A25+L25)/4 ...> paddb mm2,CentralPred+16 ; 2Q: <(5C27+A27+2L27)/4 ... ; ; (6C25+A25+L25)/4 ...> psubb mm4,mm3 ; 3V: D3 = T3 - P3 movq PelDiffsLine0,mm1 ; 0T psubb mm2,mm7 ; 2R: <(5C27+A27+2L27+4)/4 ... ; ; (6C25+A25+L25+4)/4 ...> movq PelDiffsLine3,mm4 ; 3W: Store D3. pand mm2,mm6 ; 2S: pre-cleaned psrlq mm2,1 ; 2T: P2 = <(5C27+A27+2L27+4)/8 ... ; ; (6C25+A25+L25+4)/8 ...> ; psubb mm0,mm2 ; 2V: D2 = T2 - P2 ; ; ; movq PelDiffsLine2,mm0 ; 2W: Store D2. ; jmp MMxDoForwardDCTy ; Block is in PelDiffs block; Pitch is 16
LeftEqCtr:
; Left remote motion vector was same as center. ; Compute right remote prediction block.
lea edi,PrevMBAD[eax] and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right. mov esi,PrevMBAD.BlkY1.MVs ; ; mov cl,[edi].BlockType mov ebx,PrevMBAD[eax].BlkY1.MVs cmp cl,INTRA je LeftEqCtrAndRightEqCtr ; Jump if INTRA. (Use central)
cmp esi,ebx mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote. mov edi,PrevMBAD[eax].BlkY1.BlkOffset jne RightNeCtr
; Left and right remote motion vectors were same as center. ; Compute central prediction block.
LeftEqCtrAndRightEqCtr:
mov ebx,PrevMBAD.BlkY1.MVs mov esi,PrevMBAD.BlkY1.PastRef lea edi,CentralPred mov eax,DistToBADforBlockBelow
call GetPredForCenterLeftOrRight
pand mm2,mm6 psrlq mm1,1 movq [edi+32],mm0 psrlq mm2,1 movq [edi+40],mm1 pand mm3,mm6 movq [edi+48],mm2 psrlq mm3,1 lea ecx,PrevMBAD[eax] and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below. mov esi,PrevMBAD.BlkY1.MVs movq [edi+56],mm3 pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1 mov bl,[ecx].BlockType mov ecx,PrevMBAD.BlkY1.BlkOffset cmp bl,INTRA mov edi,AddrOfLeftPred mov ebx,PrevMBAD[eax].BlkY1.MVs je BottomHalfAllSame ; Jump if INTRA. (Use central)
; Compute bottom remote prediction block.
cmp esi,ebx mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote. mov eax,PrevMBAD[eax].BlkY1.BlkOffset je BottomHalfAllSame
sub esi,eax lea eax,[ecx+ebp*4]
call GetPredForAboveOrBelow
; Compute difference for lines 4 thru 7. Only the remote motion vector below ; was different than the central motion vector.
; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0 movq mm5,CentralPred+48 ; 6b: <C67 C66 C65 C64 C63 C62 C61 C60> pand mm2,mm6 movq mm4,CentralPred+32 ; 4B: <C47 C46 C45 C44 C43 C42 C41 C40> psrlq mm2,1 ; 6a: <B67 B66 B65 B64 B63 B62 B61 B60> paddb mm2,mm5 ; 6c: <C67+B67 ... C65+B65 ...> paddb mm0,mm4 ; 4C: <C47+B47> pand mm0,mm6 ; 4D: <C47+B47> pre-cleaned psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50> pand mm2,mm6 ; 6d: <C67+B67 ... C65+B65 ...> pre-cleaned psrlq mm0,1 ; 4E: <(C47+B47)/2 ...> paddb mm0,mm4 ; 4F: <(3C47+B47)/2 ...> psrlq mm2,1 ; 6e: <(C67+B67)/2 ... (C65+B65)/2 ...> pmullw mm2,C0001000200020001 ; 6f: <(C67+B67)/2 ... (2C65+2B65)/2 ...> pand mm0,mm6 ; 4G: <(3C47+B47)/2 ...> pre-cleaned pand mm3,mm6 psrlq mm0,1 ; 4H: <(3C47+B47)/4 ...> paddb mm0,mm4 ; 4I: <(7C47+B47)/4 ...> psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70> movq mm4,C0000FFFFFFFF0000 ; 6g: < 00 00 FF FF FF FF 00 00> psubb mm0,mm7 ; 4J: <(7C47+B47+4)/4 ...> pandn mm4,mm5 ; 6h: <C67 C66 00 00 00 00 C61 C60> psubb mm5,mm7 ; 6i: <C67+1 ... C65+1 ...> paddb mm2,mm4 ; 6j: <(3C67+B67)/2 ... (2C65+2B65)/2 ...> pand mm0,mm6 ; 4K: <(7C47+B47+4)/4 ...> pre-cleaned movq mm4,CentralPred+40 ; 5B pand mm2,mm6 ; 6k: pre-cleaned paddb mm1,mm4 ; 5C psrlq mm0,1 ; 4L: <(7C47+B47+4)/8 ...> pand mm1,mm6 ; 5D psrlq mm2,1 ; 6l: <(3C67+B67)/4 ... (2C65+2B65)/4 ...> paddb mm2,mm5 ; 6m: <(7C67+B67+4)/4 ... (6C65+2B65+4)/4...> psrlq mm1,1 ; 5E movq mm5,CentralPred+56 ; 7B: <C77 C76 C75 C74 C73 C72 C71 C70> paddb mm1,mm4 ; 5F paddb mm3,mm5 ; 7C: <C77+B47> pand mm1,mm6 ; 5G pand mm3,mm6 ; 7D: <C77+B47> pre-cleaned psrlq mm1,1 ; 5H paddb mm1,mm4 ; 5I psrlq mm3,1 ; 7E: <(C77+B47)/2 ...> psubb mm1,mm7 ; 5J paddb mm3,mm5 ; 7F: <(3C77+B47)/2 ...> pand mm1,mm6 ; 5K psubb mm3,mm7 ; 7G: <(3C77+B47+2)/2 ...> pand mm2,mm6 ; 6n: pre-cleaned psrlq mm1,1 ; 5L pand mm3,mm6 ; 7H: <(3C77+B47+2)/2 ...> pre-cleaned psrlq mm2,1 ; 6o: <(7C67+B67+4)/8 ... (6C65+2B65+4)/8...> psrlq mm3,1 ; 7I: <(3C77+B47+2)/4 ...>
; BottomHalfAllSame
;
; Form the pel differences for lines 4 thru 7 of the target block and store
; them in PelDiffsLine4..PelDiffsLine7, then decide how lines 0 thru 3 are to
; be predicted.
;
; As used by the code below:
;   ecx     -- On entry, offset of the target block from
;              TargetFrameBaseAddress.  Reloaded with
;              PrevMBAD.BlkY1.BlkOffset before leaving.
;   ebp     -- Line pitch (ebp*4 addresses line 4, alongside PITCH*5..7).
;   mm0-mm3 -- On entry, the predictions for lines 4, 5, 6 and 7.
;
; Leaves by one of:
;   - jump to SidesEqCtrAndAboveEqCtr when the descriptor of the block above
;     is INTRA;
;   - jump to SidesEqCtrButAboveNeCtr when the MVs of the block above differ
;     from PrevMBAD.BlkY1.MVs (esi = PastRef and eax = BlkOffset of the block
;     above, ebx = its masked MVs);
;   - fall through into SidesEqCtrAndAboveEqCtr otherwise.
;
; Tags in the right-hand comments give the line number (digit) and the step
; within that line's computation (letter).  T = target, P = prediction,
; D = difference.

BottomHalfAllSame:

  mov    ebx,TargetFrameBaseAddress
  mov    eax,DistToBADforBlockAbove
  mov    esi,PrevMBAD.BlkY1.MVs              ; MVs to compare with blk above.
  movq   mm5,[ecx+ebx+PITCH*5]               ; 5M: T5
  add    ecx,ebx                             ; Address of target block.
  lea    ebx,PrevMBAD[eax]
  and    ebx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.
                                             ; (Rounds the address down to a
                                             ; multiple of the descr size.)
  psubb  mm5,mm1                             ; 5N: D5 = T5 - P5
  movq   mm4,[ecx+ebp*4]                     ; 4M: T4
  movq   mm1,[ecx+PITCH*7]                   ; 7J: T7
  psubb  mm4,mm0                             ; 4N: D4 = T4 - P4
  movq   mm0,[ecx+PITCH*6]                   ; 6p: T6
  psubb  mm1,mm3                             ; 7K: D7 = T7 - P7
  movq   PelDiffsLine4,mm4                   ; 4O: Store D4.
  psubb  mm0,mm2                             ; 6q: D6 = T6 - P6
  movq   PelDiffsLine5,mm5                   ; 5O: Store D5.
  movq   PelDiffsLine6,mm0                   ; 6r: Store D6.
  movq   PelDiffsLine7,mm1                   ; 7L: Store D7.
  mov    cl,[ebx].BlockType
  cmp    cl,INTRA
  mov    ecx,PrevMBAD.BlkY1.BlkOffset        ; mov leaves the flags from the
  mov    ebx,PrevMBAD[eax].BlkY1.MVs         ; cmp intact for the je below.
  je     SidesEqCtrAndAboveEqCtr             ; Jump if INTRA.  (Use central)

; Compute top remote prediction block.

  and    ebx,00000FFFFH            ; Blk above may have B MVs set.  Clear them.
  cmp    esi,ebx                   ; NOTE(review): esi is compared unmasked;
                                   ; confirm its upper half is always zero.
  mov    esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
  mov    eax,PrevMBAD[eax].BlkY1.BlkOffset
  jne    SidesEqCtrButAboveNeCtr   ; Flags still those of the cmp above.
; SidesEqCtrAndAboveEqCtr
;
; Lines 0 thru 3 need nothing but the central prediction.  Load the four
; lines from CentralPred (8 bytes apiece) into mm0..mm3 in the form that
; TopHalfAllSame expects:  mm0, mm1 and mm2 are doubled here because
; TopHalfAllSame shifts them right by one before use, while mm3 is used
; there as is.
; NOTE(review): paddb wraps within each byte, so doubling then halving only
; round-trips if the pels fit in 7 bits -- confirm.
; The loads and adds alternate on purpose; keep the order.

SidesEqCtrAndAboveEqCtr:

  movq   mm0,CentralPred           ; Line 0.
  movq   mm1,CentralPred+8         ; Line 1.
  paddb  mm0,mm0                   ; 2 * line 0.
  movq   mm2,CentralPred+16        ; Line 2.
  paddb  mm1,mm1                   ; 2 * line 1.
  movq   mm3,CentralPred+24        ; Line 3.  Left unscaled.
  paddb  mm2,mm2                   ; 2 * line 2.
  jmp    TopHalfAllSame
; SidesEqCtrButAboveNeCtr
;
; Fetch the prediction that the motion vector of the block above would give
; for the current block, and blend it with CentralPred for lines 0 thru 3.
; Falls through into TopHalfAllSame, which performs the last mask/shift for
; lines 0, 1 and 2 and forms the differences.
;
; On entry (set up at BottomHalfAllSame):
;   esi -- PastRef of the block above.
;   eax -- BlkOffset of the block above.
;   ecx -- BlkOffset of the current block.
;   ebx -- MVs of the block above, masked to 16 bits.
;   mm6 -- 8 bytes of 0xFE; pand with it clears the low bit of each byte so
;          that the following psrlq by 1 cannot carry a bit into the
;          neighbouring byte.
;   mm7 -- 8 bytes of -1; psubb with it adds 1 to each byte (rounding).
;
; Weighting applied (A = prediction fetched with the MV of the block above,
; C = CentralPred).  This assumes GetPredForAboveOrBelow hands back line 0
; unscaled in mm0 and lines 1..3 scaled by 2 in mm1..mm3, as its
; no-interpolation path does -- confirm for the interpolating paths:
;   line 0:  (3C + A + 2) / 4
;   line 1:  (7C + A + 4) / 8   for the two outer pels on each side
;            (6C + 2A + 4) / 8  for the four middle pels
;   line 2:  (7C + A + 4) / 8
;   line 3:  (7C + A + 4) / 8
; Going by their names, C0001000200020001 holds the word multipliers 1,2,2,1
; and C0000FFFFFFFF0000 is a byte mask selecting the four middle pels.
;
; The four per-line chains are interleaved and share mm4/mm5 as scratch;
; the instruction order must be kept.

SidesEqCtrButAboveNeCtr:

  sub    esi,eax                   ; PastRef less the above block's offset ...
  mov    eax,ecx                   ; ... the callee adds eax (current block's
                                   ; offset) back onto esi.

  call   GetPredForAboveOrBelow

; Compute difference for lines 0 thru 3.  Only the remote motion vector above
; was different than the central motion vector.

  movq   mm5,CentralPred+8         ; 1b: C1
  pand   mm3,mm6                   ;     Pre-clean line 3.
  movq   mm4,CentralPred+24        ; 3B: C3
  psrlq  mm3,1                     ; 3A
  paddb  mm3,mm4                   ; 3C
  psrlq  mm1,1                     ; 1A
  paddb  mm1,mm5                   ; 1c
  pand   mm3,mm6                   ; 3D
  pand   mm1,mm6                   ; 1d
  psrlq  mm3,1                     ; 3E
  paddb  mm3,mm4                   ; 3F
  psrlq  mm1,1                     ; 1e
  pmullw mm1,C0001000200020001     ; 1f: Double the middle pels.
  pand   mm3,mm6                   ; 3G
  pand   mm2,mm6                   ;     Pre-clean line 2.
  psrlq  mm3,1                     ; 3H
  paddb  mm3,mm4                   ; 3I
  psrlq  mm2,1                     ; 2a
  movq   mm4,C0000FFFFFFFF0000     ; 1g
  psubb  mm3,mm7                   ; 3J: Add 1 to each byte.
  pandn  mm4,mm5                   ; 1h: C1 with the middle pels zeroed.
  psubb  mm5,mm7                   ; 1i: C1 + 1
  paddb  mm1,mm4                   ; 1j
  pand   mm3,mm6                   ; 3K
  movq   mm4,CentralPred+16        ; 2B: C2
  pand   mm1,mm6                   ; 1k
  paddb  mm2,mm4                   ; 2C
  psrlq  mm3,1                     ; 3L: Line 3 prediction complete.
  pand   mm2,mm6                   ; 2D
  psrlq  mm1,1                     ; 1l
  paddb  mm1,mm5                   ; 1m
  psrlq  mm2,1                     ; 2E
  movq   mm5,CentralPred           ; 0B: C0
  paddb  mm2,mm4                   ; 2F
  paddb  mm0,mm5                   ; 0C
  pand   mm2,mm6                   ; 2G
  pand   mm0,mm6                   ; 0D
  psrlq  mm2,1                     ; 2H
  paddb  mm2,mm4                   ; 2I
  psrlq  mm0,1                     ; 0E
  psubb  mm2,mm7                   ; 2J: Add 1 to each byte.
  paddb  mm0,mm5                   ; 0F
  pand   mm2,mm6                   ; 2K
  psubb  mm0,mm7                   ; 0G: Add 1 to each byte.
; TopHalfAllSame
;
; Finish the predictions for lines 0 thru 3, subtract them from the target
; lines, store the differences in PelDiffsLine0..PelDiffsLine3, and go on to
; the forward DCT.
;
; On entry:
;   ecx -- Offset of the target block from TargetFrameBaseAddress.
;   ebp -- Line pitch.
;   mm0 -- Line 0 prediction, still needing a mask and a shift right by 1.
;   mm1 -- Line 1 prediction, still needing a mask and a shift right by 1.
;   mm2 -- Line 2 prediction, low bits already clear, needing the shift only.
;   mm3 -- Line 3 prediction, final.
;   mm6 -- 8 bytes of 0xFE.
;
; mm7 is reused to hold target line 0, so it no longer contains the -1
; constant once this block has run.  edi is left pointing at the target
; block.

TopHalfAllSame:

  mov    ebx,TargetFrameBaseAddress
  lea    edi,[ecx+ebx]             ;     Address of target block.
  pand   mm1,mm6                   ; 1n
  movq   mm7,[ecx+ebx]             ; 0J: T0
  pand   mm0,mm6                   ; 0H
  movq   mm5,[edi+PITCH*3]         ; 3M: T3
  psrlq  mm2,1                     ; 2L: P2
  movq   mm4,[edi+ebp*2]           ; 2M: T2
  psubb  mm5,mm3                   ; 3N: D3 = T3 - P3
  psubb  mm4,mm2                   ; 2N: D2 = T2 - P2
  psrlq  mm1,1                     ; 1o: P1
  movq   mm3,[edi+ebp*1]           ; 1p: T1
  psubb  mm3,mm1                   ; 1q: D1 = T1 - P1
  movq   PelDiffsLine3,mm5         ; 3O: Store D3.
  psrlq  mm0,1                     ; 0I: P0
  movq   PelDiffsLine2,mm4         ; 2O: Store D2.
  psubb  mm7,mm0                   ; 0K: D0 = T0 - P0
  movq   PelDiffsLine1,mm3         ; 1r: Store D1.
  movq   PelDiffsLine0,mm7         ; 0L: Store D0.
  jmp    MMxDoForwardDCTy          ; Block is in PelDiffs block;  Pitch is 16
;=============================================================================
; This internal function computes the OBMC contribution for the reference
; block that uses the left, central, or right remote motion vector.
;
; ebp -- PITCH
; edi -- Address of where to put the contribution.
; esi -- Address of reference block.
; edx -- Reserved.  MBlockActionStream
; ecx -- Unavailable.
; ebx -- Scratch.  Initially the horizontal and vertical motion vectors.
; eax -- Unavailable.
; mm7 -- 8 bytes of -1
; mm6 -- 8 bytes of 0xFE
; mm0-mm5 -- Scratch
;
; Dispatch:  bit 0 of ebx (shifted out into carry) selects horizontal
; interpolation;  bit 8 of ebx (bit 7 of bl after the shift) selects vertical
; interpolation.
;
; Lines 0 thru 3 of the contribution are written to [edi+0], [edi+8],
; [edi+16] and [edi+24].  In the no-interpolation case the function returns
; with line 4 in mm0 and twice lines 5, 6 and 7 in mm1, mm2 and mm3.  The
; interpolating cases call a Get4LinesOfPred_* routine for lines 0 thru 3 and
; finish by jumping to the matching Get4MoreLinesOfPred_* routine.  Neither
; is visible in this part of the file;  presumably the latter returns to our
; caller with lines 4 thru 7 in the same register layout -- confirm.
;
; StackOffset is given a non-numeric text value while the internal functions
; are assembled;  presumably so that any accidental use of a stack-relative
; local in here fails to assemble -- confirm.

StackOffset TEXTEQU <12_ButAccessToLocalVariablesShouldNotBeNeeded>

GetPredForCenterLeftOrRight:

  shr    ebx,1                     ; Carry = horizontal half-pel bit.
  jc     HorzInterpInCLRPred
  movq   mm1,[esi+ebp*1]           ; Line 1 of reference.
  and    bl,080H                   ; Vertical half-pel bit.
  je     NoInterpInCLRPred

VertInterpInCLRPred:

  movq   mm0,[esi]                 ; Line 0 of reference.
  psubb  mm1,mm7                   ; Line 1, plus 1 in each byte.

  call   Get4LinesOfPred_InterpVert

  pand   mm2,mm6                   ; Finish lines 1 thru 3:  clear low bits
  psrlq  mm1,1                     ; where needed, then halve.
  movq   [edi+0],mm0               ; Store line 0 of contribution.
  pand   mm3,mm6
  movq   [edi+8],mm1               ; Store line 1.
  psrlq  mm2,1
  movq   mm1,[esi+ebp*1]           ; Set up the next reference line, as was
  psrlq  mm3,1                     ; done before the call above.
  movq   [edi+16],mm2              ; Store line 2.
  movq   mm0,mm4                   ; mm4 as left by the call becomes mm0.
  movq   [edi+24],mm3              ; Store line 3.
  psubb  mm1,mm7                   ; Plus 1 in each byte.
  jmp    Get4MoreLinesOfPred_InterpVert

HorzInterpInCLRPred:

  movq   mm5,[esi+1]               ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  and    bl,080H                   ; Vertical half-pel bit.
  jne    BothInterpInCLRPred

  call   Get4LinesOfPred_InterpHorz

  pand   mm2,mm6                   ; Finish lines 1 thru 3 as above.
  psrlq  mm1,1
  movq   [edi+0],mm0               ; Store line 0 of contribution.
  pand   mm3,mm6
  movq   [edi+8],mm1               ; Store line 1.
  psrlq  mm2,1
  movq   mm5,[esi+1]               ; <R48 R47 R46 R45 R44 R43 R42 R41>
  psrlq  mm3,1
  movq   [edi+16],mm2              ; Store line 2.
  movq   [edi+24],mm3              ; Store line 3.
  jmp    Get4MoreLinesOfPred_InterpHorz

BothInterpInCLRPred:

  call   Get4LinesOfPred_InterpBoth

  pand   mm2,mm6                   ; Finish lines 1 thru 3 as above.
  psrlq  mm1,1
  movq   [edi+0],mm0               ; Store line 0 of contribution.
  pand   mm3,mm6
  movq   [edi+8],mm1               ; Store line 1.
  psrlq  mm2,1
  movq   mm1,[esi+ebp*1]           ; Set up the next reference line.
  psrlq  mm3,1
  movq   [edi+16],mm2              ; Store line 2.
  movq   mm0,mm4                   ; mm4 as left by the call becomes mm0.
  movq   [edi+24],mm3              ; Store line 3.
  psubb  mm1,mm7                   ; Plus 1 in each byte.
  paddb  mm5,mm5                   ; Double mm5 as left by the call.
  jmp    Get4MoreLinesOfPred_InterpBoth

NoInterpInCLRPred:

  movq   mm0,[esi]                 ; Lines 0, 2, 3 of reference (line 1 was
  movq   mm2,[esi+ebp*2]           ; already loaded into mm1 on entry).
  movq   mm3,[esi+PITCH*3]
  movq   [edi+0],mm0               ; Lines 0 thru 3 are stored unchanged.
  movq   [edi+8],mm1
  movq   [edi+16],mm2
  movq   [edi+24],mm3
  movq   mm3,[esi+PITCH*7]         ; Lines 4 thru 7 are returned in mm0..mm3,
  movq   mm2,[esi+PITCH*6]         ; with lines 5, 6 and 7 doubled and line 4
  paddb  mm3,mm3                   ; as is.
  movq   mm1,[esi+PITCH*5]
  paddb  mm2,mm2
  movq   mm0,[esi+ebp*4]
  paddb  mm1,mm1
  ret
;=============================================================================
; This internal function computes the OBMC contribution for the reference
; block that uses the remote motion vector from block above or below.
;
; ebp -- PITCH
; edi -- Not used.
; esi -- Address of reference block (after eax is added in, which is done
;        here).
; edx -- Reserved.  MBlockActionStream
; ecx -- Unavailable.  Must not be changed.
; ebx -- Scratch.  Initially the horizontal and vertical motion vectors.
; eax -- Offset within frame for block being worked on.
; mm7 -- 8 bytes of -1
; mm6 -- 8 bytes of 0xFE
; mm0-mm5 -- Scratch
;
; Dispatch:  bit 0 of ebx (shifted out into carry) selects horizontal
; interpolation;  bit 8 of ebx (bit 7 of bl after the shift) selects vertical
; interpolation.
;
; With no interpolation the function returns line 0 of the reference in mm0
; and twice lines 1, 2 and 3 in mm1, mm2 and mm3.  The interpolating cases
; are completed by the Get4LinesOfPred_* routines (not visible here), which
; are entered by a jump so that their return goes straight to our caller.

GetPredForAboveOrBelow:

  shr    ebx,1                     ; Carry = horizontal half-pel bit.
  lea    esi,[esi+eax]             ; Reference address for this block.  (lea
                                   ; leaves the carry intact for the jc.)
  jc     HorzInterpInABPred
  movq   mm1,[esi+ebp*1]           ; Line 1 of reference.
  movq   mm0,[esi]                 ; Line 0 of reference.
  psubb  mm1,mm7                   ; Line 1 plus 1, set up in advance for the
                                   ; vertical interpolation case.
  and    bl,080H                   ; Vertical half-pel bit.
  jne    Get4LinesOfPred_InterpVert

  movq   mm2,[esi+ebp*2]           ; Line 2 of reference.
  paddb  mm1,mm7                   ; Undo the plus 1 applied above.
  movq   mm3,[esi+PITCH*3]         ; Line 3 of reference.
  paddb  mm1,mm1                   ; Lines 1, 2 and 3 are returned doubled;
  paddb  mm2,mm2                   ; line 0 in mm0 is returned as is.
  paddb  mm3,mm3
  ret

HorzInterpInABPred:

  movq   mm5,[esi+1]               ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  and    bl,080H                   ; Vertical half-pel bit.
  jne    Get4LinesOfPred_InterpBoth

  jmp    Get4LinesOfPred_InterpHorz
StackOffset TEXTEQU <0> ;============================================================================= ENDIF
; Done / TrulyDone
;
; Common exit of MMxEDTQ.
;
; Done (H263 builds only do any work here):  when PendingOBMC is non-zero,
; record INTER1MV in StashBlockType and run DoPendingOBMCDiff.  After that,
; unless IsPlainPFrame is non-zero, step edx back by one
; T_MacroBlockActionDescr and run MMxDoBFrameLumaBlocks with
; mm6 = C0101010101010101 and mm7 = 0.
;
; TrulyDone:  leave MMX state, hand the SWD total(s) back through the
; pointer(s) found at [esp+PSWDTotal] (and [esp+PBSWDTotal] for H263) once
; esp has been restored from StashESP, restore the saved registers, return.

Done:

IFNDEF H261                        ; H263 only.

  mov    bl,PendingOBMC
  mov    cl,INTER1MV               ; Loaded on both paths, as before.
  test   bl,bl
  jz     TrulyDone                 ; No OBMC difference pending.

  mov    StashBlockType,cl

  call   DoPendingOBMCDiff

  mov    al,IsPlainPFrame
  add    edx,-SIZEOF T_MacroBlockActionDescr ; Kept as add of the negative.
  test   al,al
  jnz    TrulyDone                 ; Plain P frame:  no B-frame work.

  pxor   mm7,mm7                   ; Initialize SWD accumulator
  movq   mm6,C0101010101010101

  call   MMxDoBFrameLumaBlocks

ENDIF

TrulyDone:

  emms

; The totals are fetched before esp is switched back to StashESP;  the
; loads, the esp restore and the stores keep their original relative order.

  mov    eax,SWDTotal
IFNDEF H261
  mov    ebx,BSWDTotal
ENDIF
  mov    esp,StashESP
  mov    edi,[esp+PSWDTotal]
IFNDEF H261
  mov    esi,[esp+PBSWDTotal]
ENDIF
  mov    [edi],eax
IFNDEF H261
  mov    [esi],ebx
ENDIF

  pop    ebx
  pop    ebp
  pop    edi
  pop    esi
  rturn
MMxEDTQ endp
END
|