windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/i386/exmme.asm


								;////////////////////////////////////////////////////////////////////////////

								;//

								;//              INTEL CORPORATION PROPRIETARY INFORMATION

								;//

								;//      This software is supplied under the terms of a license

								;//      agreement or nondisclosure agreement with Intel Corporation

								;//      and may not be copied or disclosed except in accordance

								;//      with the terms of that agreement.

								;//

								;////////////////////////////////////////////////////////////////////////////

								;//

								;// $Header:   S:\h26x\src\enc\exmme.asv   1.37   13 Dec 1996 17:19:38   MBODART  $

								;//

								;// $Log:   S:\h26x\src\enc\exmme.asv  $

								;//

								;//    Rev 1.37   13 Dec 1996 17:19:38   MBODART

								;// Tuned the ME parameters for H.261.

								;//

								;//    Rev 1.36   06 Nov 1996 16:18:24   BNICKERS

								;// Improve performance.

								;//

								;//    Rev 1.35   30 Oct 1996 17:30:36   BNICKERS

								;// Fix UMV table for right edge macroblocks.

								;//

								;//    Rev 1.34   30 Oct 1996 14:49:20   KLILLEVO

								;// zero motion vectors for intra blocks in PB-frame mode.

								;// This is necessary in the Extended Motion Vector mode

								;//

								;//    Rev 1.33   18 Oct 1996 16:57:16   BNICKERS

								;// Fixes for EMV

								;//

								;//    Rev 1.32   15 Oct 1996 17:53:04   BNICKERS

								;//

								;// Fix major bug w.r.t. EMV ME.

								;//

								;//    Rev 1.31   14 Oct 1996 13:10:14   BNICKERS

								;//

								;// Correct several problems wrt H261 ME.

								;//

								;//    Rev 1.30   11 Oct 1996 16:53:12   KLILLEVO

								;//

								;// Fix threshold

								;//

								;//    Rev 1.29   11 Oct 1996 16:52:18   KLILLEVO

								;// Another EMV fix.

								;//

								;//    Rev 1.28   11 Oct 1996 15:43:16   KLILLEVO

								;// Really fix the handling of the top row of MBs for EMV ME.

								;//

								;//    Rev 1.27   11 Oct 1996 15:24:38   BNICKERS

								;// Special handling of top row of MBs for EMV ME.

								;//

								;//    Rev 1.26   11 Oct 1996 14:47:42   KLILLEVO

								;// Kill full pel MV for Intra blocks so that EMV of adjacent blocks will work.

								;//

								;//    Rev 1.25   10 Oct 1996 16:42:56   BNICKERS

								;// Initial debugging of Extended Motion Vectors.

								;//

								;//    Rev 1.24   04 Oct 1996 08:48:02   BNICKERS

								;// Add EMV.

								;//

								;//    Rev 1.23   24 Sep 1996 10:42:24   BNICKERS

								;// For H261, zero out motion vectors when classifying MB as intra.

								;//

								;//    Rev 1.22   12 Sep 1996 10:56:24   BNICKERS

								;// Add arguments for thresholds and differentials.

								;//

								;//    Rev 1.21   22 Jul 1996 15:23:24   BNICKERS

								;// Reduce code size.  Implement H261 spatial filter.

								;//

								;//    Rev 1.20   18 Jul 1996 16:54:26   KLILLEVO

								;// changed emptythreshold to 40 instead of 128 to remove some blockiness

								;// from the still frame mode on MMX

								;//

								;//    Rev 1.19   26 Jun 1996 12:49:02   KLILLEVO

								;// Fix minor booboo left in by Brian.

								;//

								;//    Rev 1.18   26 Jun 1996 12:21:50   BNICKERS

								;// Make heuristic ME work without unrestricted motion vectors.

								;//

								;//    Rev 1.17   25 Jun 1996 14:24:58   BNICKERS

								;// Implement heuristic motion estimation for MMX, AP mode.

								;//

								;//    Rev 1.16   15 May 1996 16:57:14   BNICKERS

								;// Fix SWD tabulation (again)! @#$%!%

								;//

								;//    Rev 1.15   15 May 1996 16:53:24   BNICKERS

								;//

								;// Fix SWD tabulation.

								;//

								;//    Rev 1.14   15 May 1996 11:33:28   BNICKERS

								;// Bug fix for calc of total SWD.

								;//

								;//    Rev 1.13   14 May 1996 12:18:58   BNICKERS

								;// Initial debugging of MMx B-Frame ME.

								;//

								;//    Rev 1.12   03 May 1996 14:03:50   BNICKERS

								;//

								;// Minor bug fixes and integration refinements.

								;//

								;//    Rev 1.11   02 May 1996 12:00:32   BNICKERS

								;// Initial integration of B Frame ME, MMX version.

								;//

								;//    Rev 1.10   16 Apr 1996 16:40:14   BNICKERS

								;// Fix some important but simple bugs.  Start adding table inits for B frm ME.

								;//

								;//    Rev 1.9   10 Apr 1996 13:13:44   BNICKERS

								;// Recoding of Motion Estimation, Advanced Prediction.

								;//

								;//    Rev 1.8   05 Apr 1996 12:28:10   BNICKERS

								;// Improvements to baseline half pel ME.

								;//

								;//    Rev 1.7   26 Mar 1996 12:00:22   BNICKERS

								;// Did some tuning for MMx encode.

								;//

								;//    Rev 1.6   20 Mar 1996 17:01:44   KLILLEVO

								;// fixed bug in new quant code

								;//

								;//    Rev 1.5   20 Mar 1996 15:26:40   KLILLEVO

								;// changed quantization to match IA quantization

								;//

								;//    Rev 1.3   15 Mar 1996 15:51:16   BECHOLS

								;// Completed monolithic - Brian

								;//

								;//    Rev 1.0   16 Feb 1996 17:12:12   BNICKERS

								;// Initial revision.

								;//

								;////////////////////////////////////////////////////////////////////////////

								;

								; MMxMotionEstimation -- This function performs motion estimation for the

								;                        macroblocks identified in the input list.  This is

								;                        the MMx version.  Conditional assembly selects either

								;                        the H263 or H261 version.

								;

								; Arguments:   See ex5me.asm.

								;

								; Other assumptions:  See ex5me.asm.  Most of the read-only tables needed in

								;                     ex5me.asm are not needed here.

								;


								; Assembler-wide options for this module (MASM OPTION directive).
								; PROLOGUE:None -- the assembler emits no automatic entry code for PROCs.
								OPTION PROLOGUE:None

								; EPILOGUE:<macro> -- RET inside a PROC is expanded through the named macro.
								; ReturnAndRelieveEpilogueMacro is not defined in this part of the file;
								; presumably it comes from one of the include files -- confirm.
								OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro

								; M510 -- MASM 5.10 compatibility mode.
								OPTION M510

								; CASEMAP:NONE -- identifiers are case sensitive.
								OPTION CASEMAP:NONE


								; Motion-estimation tuning constants.  The exact use of each value is in
								; code outside this section; the names suggest thresholds and differentials
								; applied to block-matching scores -- confirm at the use sites.

								; Values that are the same whether or not H261 is defined.
								BLOCKMOTIONTHRESHOLD         = 1152
								BLOCKMVDIFFERENTIAL          =  768
								EMPTYTHRESHOLD               =   40

								; Values that depend on the codec selected at assembly time.
								IFDEF H261
								  ZEROVECTORTHRESHOLD        =  600
								  NONZEROMVDIFFERENTIAL      =  256
								  INTERCODINGTHRESHOLD       =  300
								  INTRACODINGDIFFERENTIAL    =  200
								ELSE
								  ZEROVECTORTHRESHOLD        =  450
								  NONZEROMVDIFFERENTIAL      =  375
								  INTERCODINGTHRESHOLD       = 1152
								  INTRACODINGDIFFERENTIAL    = 1000
								ENDIF


								include iammx.inc

								include e3inst.inc

								include e3mbad.inc


								.xlist

								include memmodel.inc

								.list


								include exEDTQ.inc


								MMXMEDATA SEGMENT PAGE

								ALIGN 16


								;  Storage for Target and Reference frames can interleave into 8K of the 16K

								;  cache.  Pitch must be 384.

								;

								;     C# -- Stands for row number "#" of target macroblock in *C*urrent P frame.

								;     B# -- Stands for row number "#" of target macroblock in current *B* frame.

								;     R# -- Stands for row number "#" of 0MV *R*ef macroblock in past frame.

								;     v  -- Stands for a row below 0MV, reference macroblock.

								;           These same cache lines would hit reference lines >8 above the 0MV.

								;     ^  -- Stands for a row above 0MV, reference macroblock.

								;           These same cache lines would hit reference lines >8 below the 0MV.

								;     +-+-+

								;     |   | -- A cache line (32 bytes).  Position of letters,<, and > indicate

								;     +-+-+    which 16 bytes may be used in the cache line.

								;

								;     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

								;     |C0 |   |  v|   |Cb |   |  ^|   |B6 |   | R6|   |

								;     |C1 |   |  v|   |Cc |   |  ^|   |B7 |   | R7|   |

								;     |C2 |   |  v|   |Cd |   |  ^|   |B8 |   | R8|   |

								;     |C3 |   |  v|   |Ce |   |  ^|   |B9 |   | R9|   |

								;     |C4 |   |  v|   |Cf |   |  ^|   |Ba |   | Ra|   |

								;     |C5 |   |  v|   |B0 |   | R0|   |Bb |   | Rb|   |

								;     |C6 |   |  v|   |B1 |   | R1|   |Bc |   | Rc|   |

								;     |C7 |   |  v|   |B2 |   | R2|   |Bd |   | Rd|   |

								;     |C8 |   |  ^|   |B3 |   | R3|   |Be |   | Re|   |

								;     |C9 |   |  ^|   |B4 |   | R4|   |Bf |   | Rf|   |

								;     |Ca |   |  ^|   |B5 |   | R5|   +-+-+-+-+-+-+-+-+

								;     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

								;


								; The static storage space used for read-only tables, and the stack usage

								; are coordinated such that they mesh in the data cache, and use only one

								; 4K way of the 4-way, 16K cache.

								;

								; The first 32 bytes of the static storage space are unallocated, because

								; the top of stack ranges in this area.  As local procedure calls are made

								; within this function, return addresses get pushed into these 32 bytes.

								; (32 bytes;    0:  31)


								  DB 32 DUP (?)   ; Static space place-holder.  Stack frame hits these addrs.


								;

								; The next 608 bytes of the static storage space are unallocated, because

								; the local stack frame is made to hit cache at these addresses.  More of

								; the local stack frame is allocated after a gap of 64 bytes.

								; (608 bytes;   32: 639)

								; LocalStorage labels the first byte of this 608-byte uninitialized region.
								; Nothing is stored here by the assembler; the DB only reserves the address
								; range so that the tables that follow land at the offsets quoted in the
								; "(N bytes; first:last)" notes.


								LocalStorage LABEL DWORD


								  DB 608 DUP (?)   ; Static space place-holder.  Stack frame hits these addrs.


								; Motion Estimation State Engine adjustments to reference block address to get

								; to next candidate reference block.

								; (64 bytes; 640: 703)


								; Table of 16 DWORDs plus the one-byte adjustment codes (the EQUs) that go
								; with it.  The codes are what the state-engine rules store in their first
								; two bytes.
								;
								; Arithmetic relationship, which holds for every equate E defined below:
								;     (DWORD entry number (E SHR 4) of this table) + E
								;         = the byte displacement that the name of E describes.
								; Each DD therefore holds its base displacement minus (entry number * 010H),
								; and each equate is (entry number * 010H) plus a horizontal remainder 0..15.
								; Example: VM8HP8 = 028H, entry 2 = -8*PITCH-020H, sum = -8*PITCH+8.
								;
								; Naming: V = vertical (multiples of PITCH), H = horizontal (bytes),
								; M = minus, P = plus, trailing digit = distance, G = 16.  NOADJ = zero.
								;
								; NOTE(review): the code that consumes these codes is not in this section;
								; the lookup "table[E SHR 4] + E" is inferred from the values -- confirm.
								FullPelMotionVectorAdjustment LABEL DWORD


								        DD   -16*PITCH-8

								VMG     EQU  000H+0+8

								VMGHM8  EQU  000H-8+8


								        DD   -8*PITCH-8-010H

								VM8HM8  EQU  010H


								        DD   -8*PITCH-020H

								VM8     EQU  020H

								VM8HP8  EQU  020H+8


								        DD   -4*PITCH-8-030H

								VM4HM8  EQU  030H-8+8

								VM4HM4  EQU  030H-4+8

								VM4     EQU  030H+0+8

								VM4HP4  EQU  030H+4+8


								        DD   -4*PITCH+8-040H

								VM4HP8  EQU  040H+8-8

								VM4HPG  EQU  040H+16-8


								        DD   -2*PITCH-4-050H

								VM2HM4  EQU  050H-4+4

								VM2HM2  EQU  050H-2+4

								VM2HM1  EQU  050H-1+4

								VM2     EQU  050H+0+4

								VM2HP1  EQU  050H+1+4

								VM2HP2  EQU  050H+2+4

								VM2HP4  EQU  050H+4+4

								VM2HP8  EQU  050H+8+4


								        DD   -1*PITCH-2-060H

								VM1HM2  EQU  060H-2+2

								VM1HM1  EQU  060H-1+2

								VM1     EQU  060H+0+2

								VM1HP1  EQU  060H+1+2

								VM1HP2  EQU  060H+2+2

								VM1HP4  EQU  060H+4+2


								        DD   -16-070H

								HMG     EQU  070H-16+16

								HM8     EQU  070H-8+16

								HM4     EQU  070H-4+16

								HM3     EQU  070H-3+16

								HM2     EQU  070H-2+16

								HM1     EQU  070H-1+16


								        DD   -080H

								NOADJ   EQU  080H

								HP1     EQU  080H+1

								HP2     EQU  080H+2

								HP4     EQU  080H+4

								HP8     EQU  080H+8


								        DD   1*PITCH-2-090H

								VP1HM2  EQU  090H-2+2

								VP1HM1  EQU  090H-1+2

								VP1     EQU  090H+0+2

								VP1HP1  EQU  090H+1+2

								VP1HP2  EQU  090H+2+2

								VP1HP4  EQU  090H+4+2


								        DD   2*PITCH-4-0A0H

								VP2HM4  EQU  0A0H-4+4

								VP2HM2  EQU  0A0H-2+4

								VP2HM1  EQU  0A0H-1+4

								VP2     EQU  0A0H+0+4

								VP2HP1  EQU  0A0H+1+4

								VP2HP2  EQU  0A0H+2+4

								VP2HP4  EQU  0A0H+4+4

								VP2HP8  EQU  0A0H+8+4


								        DD   4*PITCH-8-0B0H

								VP4HM8  EQU  0B0H-8+8

								VP4HM4  EQU  0B0H-4+8

								VP4HM2  EQU  0B0H-2+8

								VP4     EQU  0B0H+0+8

								VP4HP2  EQU  0B0H+2+8

								VP4HP4  EQU  0B0H+4+8


								        DD   4*PITCH+8-0C0H

								VP4HP8  EQU  0C0H+8-8

								VP4HPG  EQU  0C0H+16-8


								        DD   8*PITCH-8-0D0H

								VP8HM8  EQU  0D0H-8+8

								VP8HM4  EQU  0D0H-4+8


								        DD   8*PITCH-0E0H

								VP8     EQU  0E0H+0

								VP8HP4  EQU  0E0H+4

								VP8HP8  EQU  0E0H+8


								        DD   16*PITCH-0F0H

								VPG     EQU  0F0H+0

								VPGHP8  EQU  0F0H+8


								; Additional space reserved for stack variables.  If more space is needed,

								; it should go here.

								; (160 bytes; 704: 863)


								  DB 160 DUP (?)   ; Static space place-holder.  Stack frame hits these addrs.
								; The 160 bytes above are uninitialized filler: they only reserve the
								; address range so that the constants that follow keep their fixed offsets.


								; QWORD Constants used by motion estimation, frame differencing, and FDCT.

								; (144 bytes;   864:1007)


								; Eighteen 8-byte constants, each written as two DWORDs with the low DWORD
								; first.  For the first eleven, the label spells the resulting 64-bit value
								; in hex, most-significant byte first (e.g. C0200010101010101 is stored as
								; 001010101H then 002000101H).
								C0101010101010101 DD 001010101H, 001010101H

								CFFFF0000FFFF0000 DD 0FFFF0000H, 0FFFF0000H

								C0200010101010101 DD 001010101H, 002000101H

								C0001000200020001 DD 000020001H, 000010002H

								CFFFF00000000FFFF DD 00000FFFFH, 0FFFF0000H

								C0000FFFFFFFF0000 DD 0FFFF0000H, 00000FFFFH

								CFF000000000000FF DD 0000000FFH, 0FF000000H

								C0101010101010002 DD 001010002H, 001010101H

								C0100010001000100 DD 001000100H, 001000100H

								C0001000100010001 DD 000010001H, 000010001H

								C7F7F7F7F7F7F7F7F DD 07F7F7F7FH, 07F7F7F7FH

								; C1..C7 each repeat one 16-bit value in all four word lanes.  The values
								; are close to cos(k*pi/16) * 32768 for k = 1..7 (for example
								; C4 = 05A82H = 23170, about 0.7071 * 32768).
								; NOTE(review): presumably these are the FDCT coefficients -- confirm at
								; the use site.
								C1                DD 07D8A7D8AH, 07D8A7D8AH

								C2                DD 076417641H, 076417641H

								C3                DD 06A6D6A6DH, 06A6D6A6DH

								C4                DD 05A825A82H, 05A825A82H

								C5                DD 0471D471DH, 0471D471DH

								C6                DD 030FC30FCH, 030FC30FCH

								C7                DD 018F818F8H, 018F818F8H


								; Distances to Block Action Descriptors for blocks that provide remote vectors

								; for OBMC.  Which element accessed depends on edge condition.  Top edge is

								; stack based variable, since different instances may have different distances

								; to BAD of block above.  Bottom edge is always a constant, regardless of

								; edge condition.  This is used in OBMC frame differencing.

								; (16 bytes; 1008:1023)


								; Two 2-entry DWORD tables.  Entry 0 of each is a zero distance; entry 1 is
								; the size of one macroblock action descriptor less the size of one block
								; record, negated for BlockToLeft and positive for BlockToRight.
								; NOTE(review): presumably entry 0 is selected when the macroblock sits on
								; the corresponding frame edge (no neighbor on that side) -- confirm
								; against the OBMC frame-differencing code that indexes these tables.
								BlockToLeft  DD 0, -SIZEOF T_MacroBlockActionDescr+SIZEOF T_Blk

								BlockToRight DD 0,  SIZEOF T_MacroBlockActionDescr-SIZEOF T_Blk


								; Table to map linearized motion vector to vertical part, used by motion

								; estimation.  (Shift linearized motion vector right by 8 bits, and then

								; use result as index into this array to get vertical MV.)

								; (96 bytes; 1024:1119)


								; Vertical-MV lookup table, 96 bytes, generated rather than listed.
								; Byte pattern: every group of three consecutive entries is v, v, v+2 with
								; v stepping by 4 (three 256-byte index steps span two rows of 384 bytes).
								; UnlinearizedVertMV labels the second of the two zero entries, so the
								; table is indexed with both negative and positive offsets from the label:
								; 49 bytes (-64 .. 0) precede it and 46 bytes (2 .. 62) follow it.
								IF PITCH-384
								*** error:  The magic of this table assumes a pitch of 384.
								ENDIF

								; Entries -64,-64,-62, -60,-60,-58, ... , -4,-4,-2   (16 groups, 48 bytes).
								CNT = -64
								REPEAT 16
								   DB CNT, CNT, CNT+2
								 CNT = CNT + 4
								ENDM

								; The two zero entries, then the lone 2 that starts the positive half.
								   DB   0
								UnlinearizedVertMV  DB 0
								   DB   2

								; Entries 4,4,6, 8,8,10, ... , 60,60,62   (15 groups, 45 bytes).
								CNT = 4
								REPEAT 15
								   DB CNT, CNT, CNT+2
								 CNT = CNT + 4
								ENDM

								; Table to provide index value in low byte, and rounding term of 1 in all bytes.

								; Used in frame differencing, when half pel horizontal interpolation is needed.

								; (1024 bytes; 1120:2143)


								; 128 eight-byte entries.  Entry i is the DWORD pair
								; (001010101H + i, 001010101H): a 1 in every byte, with i added into the
								; low byte of the first DWORD.
								Pel_Rnd LABEL DWORD
								CNT = 0
								WHILE CNT LT 128
								 DD 001010101H+CNT, 001010101H
								 CNT = CNT + 1
								ENDM


								; Motion Estimation State Engine Rules.

								; (896 bytes;2144:3039)


								; Preamble of the rule table: two 10-byte arrays indexed by starting-state
								; number 0..9, then 2 pad bytes -- 22 bytes in all.  The 4-byte rules start
								; immediately afterwards with rule 10.  StateEngine is defined as
								; StateEngineFirstRule-18, so StateEngine + 4*N is the address of rule N
								; (rule 10 lies at StateEngineFirstRule+22 = StateEngine+40).
								StateEngineFirstRule LABEL BYTE ; Rules that govern state engine of estimator.

								StateEngine EQU StateEngineFirstRule-20+2


								   ; Starting States:


								IF PITCH-384

								*** error:  The magic of this table assumes a pitch of 384.

								ENDIF

								; First array: one byte per starting state, value 3 or 0.
								; NOTE(review): its meaning is not evident from this table -- confirm
								; against the estimator code that reads it.
								 DB       ?      ;  0:  not used.

								 DB       3      ;  1: Upper left corner.

								 DB       3      ;  2: Upper edge.

								 DB       3      ;  3: Upper right corner.

								 DB       3      ;  4: Left edge.

								 DB       3      ;  5: Interior MB, not doing block search.

								 DB       0      ;  6: Right edge.

								 DB       0      ;  7: Lower left corner.

								 DB       0      ;  8: Lower edge.

								 DB       0      ;  9: Lower right corner.


								; Second array: number of the first rule for each macroblock position.
								; The values match the rule numbers of the corresponding sections that
								; follow (34 = upper left corner, 66 = upper edge, 16 = interior, ...).
								 DB       ?      ;  0:  not used.

								 DB      34      ;  1: Upper left corner.

								 DB      66      ;  2: Upper edge.

								 DB      42      ;  3: Upper right corner.

								 DB      98      ;  4: Left edge.

								 DB      16      ;  5: Interior MB, not doing block search.

								 DB     114      ;  6: Right edge.

								 DB      50      ;  7: Lower left corner.

								 DB      82      ;  8: Lower edge.

								 DB      58      ;  9: Lower right corner.


								 DB     ?,?      ; Skip 2 bytes.


								LASTINITIALMESTATE EQU 9


								   ; Interior Telescoping States:

								   ; Rule format (4 bytes), as implied by the next-state targets:
								   ;   byte 0: adjustment code to apply if the candidate just tried is WORSE
								   ;           than the best so far,
								   ;   byte 1: adjustment code to apply if it is BETTER,
								   ;   byte 2: next rule number for the WORSE case,
								   ;   byte 3: next rule number for the BETTER case.
								   ; Adjustment codes are the FullPelMotionVectorAdjustment equates.
								   ; "Accept A/B" names the winner for the worse/better case respectively.
								   ; 0FFH in bytes 2 and 3 appears on the rules commented "Done".


								      ;  Try +/- 8,4,2,1, vertically first, then horizontally.


								FIRSTBLOCKMESTATE EQU 10


								 DB     VM2,    VM2,   12,   11  ;  10: V+1 worse/better than central.  Try V-1.

								 DB  VP2HP1,    HP1,   13,   13  ;  11: Accept V+1/V-1 as best.         Try H+1.

								 DB  VP1HP1,    HP1,   13,   13  ;  12: Accept central/V-1 as best.     Try H+1.

								 DB     HM2,    HM2,   15,   14  ;  13: H+1 worse/better than central.  Try H-1.

								 DB     HP2,  NOADJ, 0FFH, 0FFH  ;  14: Accept H+1/H-1 as best.         Done.

								 DB     HP1,  NOADJ, 0FFH, 0FFH  ;  15: Accept central/H-1 as best.     Done.


								 DB     VMG,    VMG,   18,   17  ;  16: V+8 worse/better than central.  Try V-8.

								 DB  VPGHP8,    HP8,   19,   19  ;  17: Accept V+8/V-8 as best.         Try H+8.

								 DB  VP8HP8,    HP8,   19,   19  ;  18: Accept central/V-8 as best.     Try H+8.

								 DB     HMG,    HMG,   21,   20  ;  19: H+8 worse/better than central.  Try H-8.

								 DB  VP4HPG,    VP4,   22,   22  ;  20: Accept H+8/H-8 as best.         Try V+4.

								 DB  VP4HP8,    VP4,   22,   22  ;  21: Accept central/H-8 as best.     Try V+4.


								 DB     VM8,    VM8,   24,   23  ;  22: V+4 worse/better than central.  Try V-4.

								 DB  VP8HP4,    HP4,   25,   25  ;  23: Accept V+4/V-4 as best.         Try H+4.

								 DB  VP4HP4,    HP4,   25,   25  ;  24: Accept central/V-4 as best.     Try H+4.

								 DB     HM8,    HM8,   27,   26  ;  25: H+4 worse/better than central.  Try H-4.

								 DB  VP2HP8,    VP2,   28,   28  ;  26: Accept H+4/H-4 as best.         Try V+2.

								 DB  VP2HP4,    VP2,   28,   28  ;  27: Accept central/H-4 as best.     Try V+2.


								 DB     VM4,    VM4,   30,   29  ;  28: V+2 worse/better than central.  Try V-2.

								 DB  VP4HP2,    HP2,   31,   31  ;  29: Accept V+2/V-2 as best.         Try H+2.

								 DB  VP2HP2,    HP2,   31,   31  ;  30: Accept central/V-2 as best.     Try H+2.

								 DB     HM4,    HM4,   33,   32  ;  31: H+2 worse/better than central.  Try H-2.

								 DB  VP1HP4,    VP1,   10,   10  ;  32: Accept H+2/H-2 as best.         Try V+1.

								 DB  VP1HP2,    VP1,   10,   10  ;  33: Accept central/H-2 as best.     Try V+1.


								   ; Boundary States:

								   ; Corner rules probe only in the two directions named in their comments
								   ; (for example V+ and H+ for the upper left corner).  When a probe wins,
								   ; the next-state byte hands over to the matching edge rules (numbers 70..129)
								   ; because the best position has left the corner.
								   ; The terminating rules end in 0Fxh values rather than 0FFH.  Read as bit
								   ; masks these are 0FFH with one bit cleared per edge: bit 3 upper, bit 2
								   ; lower, bit 1 left, bit 0 right (0F5H = upper+left, 0F7H = upper only).
								   ; NOTE(review): how the estimator uses these final codes is not visible
								   ; in this section -- confirm.


								     ; Upper left corner:


								 DB  VM8HP8,    HP8,   35,  101  ;  34: Accept corner/V+8.              Try H+8.

								 DB  VP4HM8,    VP4,   36,   70  ;  35: Accept corner/H+8.              Try V+4.

								 DB  VM4HP4,    HP4,   37,  105  ;  36: Accept corner/V+4.              Try H+4.

								 DB  VP2HM4,    VP2,   38,   74  ;  37: Accept corner/H+4.              Try V+2.

								 DB  VM2HP2,    HP2,   39,  109  ;  38: Accept corner/V+2.              Try H+2.

								 DB  VP1HM2,    VP1,   40,   78  ;  39: Accept corner/H+2.              Try V+1.

								 DB  VM1HP1,    HP1,   41,  113  ;  40: Accept corner/V+1.              Try H+1.

								 DB     HM1,  NOADJ, 0F5H, 0F7H  ;  41: Accept corner/H+1.              Done.


								     ; Upper right corner:


								 DB  VM8HM8,    HM8,   43,  117  ;  42: Accept corner/V+8.              Try H-8.

								 DB  VP4HP8,    VP4,   44,   70  ;  43: Accept corner/H-8.              Try V+4.

								 DB  VM4HM4,    HM4,   45,  121  ;  44: Accept corner/V+4.              Try H-4.

								 DB  VP2HP4,    VP2,   46,   74  ;  45: Accept corner/H-4.              Try V+2.

								 DB  VM2HM2,    HM2,   47,  125  ;  46: Accept corner/V+2.              Try H-2.

								 DB  VP1HP2,    VP1,   48,   78  ;  47: Accept corner/H-2.              Try V+1.

								 DB  VM1HM1,    HM1,   49,  129  ;  48: Accept corner/V+1.              Try H-1.

								 DB     HP1,  NOADJ, 0F6H, 0F7H  ;  49: Accept corner/H-1.              Done


								     ; Lower left corner:


								 DB  VP8HP8,    HP8,   51,  101  ;  50: Accept corner/V-8.              Try H+8.

								 DB  VM4HM8,    VM4,   52,   86  ;  51: Accept corner/H+8.              Try V-4.

								 DB  VP4HP4,    HP4,   53,  105  ;  52: Accept corner/V-4.              Try H+4.

								 DB  VM2HM4,    VM2,   54,   90  ;  53: Accept corner/H+4.              Try V-2.

								 DB  VP2HP2,    HP2,   55,  109  ;  54: Accept corner/V-2.              Try H+2.

								 DB  VM1HM2,    VM1,   56,   94  ;  55: Accept corner/H+2.              Try V-1.

								 DB  VP1HP1,    HP1,   57,  113  ;  56: Accept corner/V-1.              Try H+1.

								 DB     HM1,  NOADJ, 0F9H, 0FBH  ;  57: Accept corner/H+1.              Done.


								     ; Lower right corner:


								 DB  VP8HM8,    HM8,   59,  117  ;  58: Accept corner/V-8.              Try H-8.

								 DB  VM4HP8,    VM4,   60,   86  ;  59: Accept corner/H-8.              Try V-4.

								 DB  VP4HM4,    HM4,   61,  121  ;  60: Accept corner/V-4.              Try H-4.

								 DB  VM2HP4,    VM2,   62,   90  ;  61: Accept corner/H-4.              Try V-2.

								 DB  VP2HM2,    HM2,   63,  125  ;  62: Accept corner/V-2.              Try H-2.

								 DB  VM1HP2,    VM1,   64,   94  ;  63: Accept corner/H-2.              Try V-1.

								 DB  VP1HM1,    HM1,   65,  129  ;  64: Accept corner/V-1.              Try H-1.

								 DB     HP1,  NOADJ, 0FAH, 0FBH  ;  65: Accept corner/H-1.              Done.


								     ; Upper edge:

								     ; Upper-edge rules probe vertically only in the V+ direction and in
								     ; both horizontal directions.  When a vertical probe wins, byte 3 sends
								     ; control to the interior rules (19, 25, 31, 13), since the best
								     ; position is no longer on the edge row.  The final rules end in 0F7H.


								 DB  VM8HP8,    HP8,   67,   19  ;  66: Accept central/V+8 as best.     Try H+8.

								 DB     HMG,    HMG,   69,   68  ;  67: H+8 worse/better than central.  Try H-8.

								 DB  VP4HPG,    VP4,   70,   70  ;  68: Accept H+8/H-8 as best.         Try V+4.

								 DB  VP4HP8,    VP4,   70,   70  ;  69: Accept central/H-8 as best.     Try V+4.

								 DB  VM4HP4,    HP4,   71,   25  ;  70: Accept central/V+4 as best.     Try H+4.

								 DB     HM8,    HM8,   73,   72  ;  71: H+4 worse/better than central.  Try H-4.

								 DB  VP2HP8,    VP2,   74,   74  ;  72: Accept H+4/H-4 as best.         Try V+2.

								 DB  VP2HP4,    VP2,   74,   74  ;  73: Accept central/H-4 as best.     Try V+2.

								 DB  VM2HP2,    HP2,   75,   31  ;  74: Accept central/V+2 as best.     Try H+2.

								 DB     HM4,    HM4,   77,   76  ;  75: H+2 worse/better than central.  Try H-2.

								 DB  VP1HP4,    VP1,   78,   78  ;  76: Accept H+2/H-2 as best.         Try V+1.

								 DB  VP1HP2,    VP1,   78,   78  ;  77: Accept central/H-2 as best.     Try V+1.

								 DB  VM1HP1,    HP1,   79,   13  ;  78: Accept central/V+1 as best.     Try H+1.

								 DB     HM2,    HM2,   81,   80  ;  79: H+1 worse/better than central.  Try H-1.

								 DB     HP2,  NOADJ, 0F7H, 0F7H  ;  80: Accept H+1/H-1 as best.         Done.

								 DB     HP1,  NOADJ, 0F7H, 0F7H  ;  81: Accept central/H-1 as best.     Done.


								     ; Lower edge:

								     ; Mirror image of the upper-edge rules: vertical probes go only in the
								     ; V- direction, and the final rules end in 0FBH.


								 DB  VP8HP8,    HP8,   83,   19  ;  82: Accept central/V-8 as best.     Try H+8.

								 DB     HMG,    HMG,   85,   84  ;  83: H+8 worse/better than central.  Try H-8.

								 DB  VM4HPG,    VM4,   86,   86  ;  84: Accept H+8/H-8 as best.         Try V-4.

								 DB  VM4HP8,    VM4,   86,   86  ;  85: Accept central/H-8 as best.     Try V-4.

								 DB  VP4HP4,    HP4,   87,   25  ;  86: Accept central/V-4 as best.     Try H+4.

								 DB     HM8,    HM8,   89,   88  ;  87: H+4 worse/better than central.  Try H-4.

								 DB  VM2HP8,    VM2,   90,   90  ;  88: Accept H+4/H-4 as best.         Try V-2.

								 DB  VM2HP4,    VM2,   90,   90  ;  89: Accept central/H-4 as best.     Try V-2.

								 DB  VP2HP2,    HP2,   91,   31  ;  90: Accept central/V-2 as best.     Try H+2.

								 DB     HM4,    HM4,   93,   92  ;  91: H+2 worse/better than central.  Try H-2.

								 DB  VM1HP4,    VM1,   94,   94  ;  92: Accept H+2/H-2 as best.         Try V-1.

								 DB  VM1HP2,    VM1,   94,   94  ;  93: Accept central/H-2 as best.     Try V-1.

								 DB  VP1HP1,    HP1,   95,   13  ;  94: Accept central/V-1 as best.     Try H+1.

								 DB     HM2,    HM2,   97,   96  ;  95: H+1 worse/better than central.  Try H-1.

								 DB     HP2,  NOADJ, 0FBH, 0FBH  ;  96: Accept H+1/H-1 as best.         Done.

								 DB     HP1,  NOADJ, 0FBH, 0FBH  ;  97: Accept central/H-1 as best.     Done.


								     ; Left edge:

								     ; Left-edge rules probe horizontally only in the H+ direction and in
								     ; both vertical directions.  When a horizontal probe wins, byte 3 sends
								     ; control to the interior rules (22, 28, 10).  The final rule ends in
								     ; 0FDH.


								 DB     VMG,    VMG,  100,   99  ;  98: V+8 worse/better than central.  Try V-8.

								 DB  VPGHP8,    HP8,  101,  101  ;  99: Accept V+8/V-8 as best.         Try H+8.

								 DB  VP8HP8,    HP8,  101,  101  ; 100: Accept central/V-8 as best.     Try H+8.

								 DB  VP4HM8,    VP4,  102,   22  ; 101: Accept central/H+8 as best.     Try V+4.

								 DB     VM8,    VM8,  104,  103  ; 102: V+4 worse/better than central.  Try V-4.

								 DB  VP8HP4,    HP4,  105,  105  ; 103: Accept V+4/V-4 as best.         Try H+4.

								 DB  VP4HP4,    HP4,  105,  105  ; 104: Accept central/V-4 as best.     Try H+4.

								 DB  VP2HM4,    VP2,  106,   28  ; 105: Accept central/H+4 as best.     Try V+2.

								 DB     VM4,    VM4,  108,  107  ; 106: V+2 worse/better than central.  Try V-2.

								 DB  VP4HP2,    HP2,  109,  109  ; 107: Accept V+2/V-2 as best.         Try H+2.

								 DB  VP2HP2,    HP2,  109,  109  ; 108: Accept central/V-2 as best.     Try H+2.

								 DB  VP1HM2,    VP1,  110,   10  ; 109: Accept central/H+2 as best.     Try V+1.

								 DB     VM2,    VM2,  112,  111  ; 110: V+1 worse/better than central.  Try V-1.

								 DB  VP2HP1,    HP1,  113,  113  ; 111: Accept V+1/V-1 as best.         Try H+1.

								 DB  VP1HP1,    HP1,  113,  113  ; 112: Accept central/V-1 as best.     Try H+1.

								 DB     HM1,  NOADJ, 0FDH, 0FDH  ; 113: Accept central/H+1 as best.     Done.


								     ; Right edge:

								     ; Mirror image of the left-edge rules: horizontal probes go only in the
								     ; H- direction (rules 115/116, 119/120, 123/124, 127/128 all "Try H-n"),
								     ; so rules 117, 121, 125 and 129 choose between central and the H-n
								     ; candidate.  The final rule ends in 0FEH.


								 DB     VPG,    VPG,  116,  115  ; 114: V-8 worse/better than central.  Try V+8.

								 DB  VMGHM8,    HM8,  117,  117  ; 115: Accept V-8/V+8 as best.         Try H-8.

								 DB  VM8HM8,    HM8,  117,  117  ; 116: Accept central/V+8 as best.     Try H-8.

								 DB  VP4HP8,    VP4,  118,   22  ; 117: Accept central/H-8 as best.     Try V+4.

								 DB     VM8,    VM8,  120,  119  ; 118: V+4 worse/better than central.  Try V-4.

								 DB  VP8HM4,    HM4,  121,  121  ; 119: Accept V+4/V-4 as best.         Try H-4.

								 DB  VP4HM4,    HM4,  121,  121  ; 120: Accept central/V-4 as best.     Try H-4.

								 DB  VP2HP4,    VP2,  122,   28  ; 121: Accept central/H-4 as best.     Try V+2.

								 DB     VM4,    VM4,  124,  123  ; 122: V+2 worse/better than central.  Try V-2.

								 DB  VP4HM2,    HM2,  125,  125  ; 123: Accept V+2/V-2 as best.         Try H-2.

								 DB  VP2HM2,    HM2,  125,  125  ; 124: Accept central/V-2 as best.     Try H-2.

								 DB  VP1HP2,    VP1,  126,   10  ; 125: Accept central/H-2 as best.     Try V+1.

								 DB     VM2,    VM2,  128,  127  ; 126: V+1 worse/better than central.  Try V-1.

								 DB  VP2HM1,    HM1,  129,  129  ; 127: Accept V+1/V-1 as best.         Try H-1.

								 DB  VP1HM1,    HM1,  129,  129  ; 128: Accept central/V-1 as best.     Try H-1.

								 DB     HP1,  NOADJ, 0FEH, 0FEH  ; 129: Accept central/H-1 as best.     Done.


								     ; Exhaustive search, radius 1 here, reaching out to radius 2 further below.

								     ;     .   .   .   .   .

								     ;     .   2   5   3   .   C = center.

								     ;     .   7   C   8   .

								     ;     .   4   6   1   .   # = order to try additional candidates.

								     ;     .   .   .   .   .


								FIRST_HEURISTIC_EXHAUSTIVE = 130


								 DB  VM2HM2, VM2HM2,  131, 138 ; 130: #1 worse/better than  C. Try #2.

								 DB     HP2,    HP2,  132, 145 ; 131: #2 worse/better than  C. Try #3.

								 DB  VP2HM2, VP2HM2,  133, 151 ; 132: #3 worse/better than  C. Try #4.

								 DB  VM2HP1, VM2HP1,  134, 156 ; 133: #4 worse/better than  C. Try #5.

								 DB     VP2,    VP2,  135, 160 ; 134: #5 worse/better than  C. Try #6.

								 DB  VM1HM1, VM1HM1,  136, 163 ; 135: #6 worse/better than  C. Try #7.

								 DB     HP2,    HP2,  137, 165 ; 136: #7 worse/better than  C. Try #8.

								 DB     HM1,    HP1, 0FFH, 166 ; 137: If C best, quit.  If 8 best, keep going.

								 DB     HP2,    HP2,  139, 145 ; 138: #2 worse/better than #1. Try #3.

								 DB  VP2HM2, VP2HM2,  140, 151 ; 139: #3 worse/better than #1. Try #4.

								 DB  VM2HP1, VM2HP1,  141, 156 ; 140: #4 worse/better than #1. Try #5.

								 DB     VP2,    VP2,  142, 160 ; 141: #5 worse/better than #1. Try #6.

								 DB  VM1HM1, VM1HM1,  143, 163 ; 142: #6 worse/better than #1. Try #7.

								 DB     HP2,    HP2,  144, 165 ; 143: #7 worse/better than #1. Try #8.

								 DB     HP1,    HP1,  199, 166 ; 144: #8 worse/better than #1. Take best, go on.

								 DB  VP2HM2, VP2HM2,  146, 151 ; 145: #3 worse/better than #2. Try #4.

								 DB  VM2HP1, VM2HP1,  147, 156 ; 146: #4 worse/better than #2. Try #5.

								 DB     VP2,    VP2,  148, 160 ; 147: #5 worse/better than #2. Try #6.

								 DB  VM1HM1, VM1HM1,  149, 163 ; 148: #6 worse/better than #2. Try #7.

								 DB     HP2,    HP2,  150, 165 ; 149: #7 worse/better than #2. Try #8.

								 DB     HM3,    HP1,  208, 166 ; 150: #8 worse/better than #2. Take best, go on.

								 DB  VM2HP1, VM2HP1,  152, 156 ; 151: #4 worse/better than #3. Try #5.

								 DB     VP2,    VP2,  153, 160 ; 152: #5 worse/better than #3. Try #6.

								 DB  VM1HM1, VM1HM1,  154, 163 ; 153: #6 worse/better than #3. Try #7.

								 DB     HP2,    HP2,  155, 165 ; 154: #7 worse/better than #3. Try #8.

								 DB     HP1,    HP1,  217, 166 ; 155: #8 worse/better than #3. Take best, go on.

								 DB     VP2,    VP2,  157, 160 ; 156: #5 worse/better than #4. Try #6.

								 DB  VM1HM1, VM1HM1,  158, 163 ; 157: #6 worse/better than #4. Try #7.

								 DB     HP2,    HP2,  159, 165 ; 158: #7 worse/better than #4. Try #8.

								 DB     HM3,    HP1,  190, 166 ; 159: #8 worse/better than #4. Take best, go on.

								 DB  VM1HM1, VM1HM1,  161, 163 ; 160: #6 worse/better than #5. Try #7.

								 DB     HP2,    HP2,  162, 165 ; 161: #7 worse/better than #5. Try #8.

								 DB  VM2HM1,    HP1,  184, 166 ; 162: #8 worse/better than #5. Take best, go on.

								 DB     HP2,    HP2,  164, 165 ; 163: #7 worse/better than #6. Try #8.

								 ; State 164: if #6 (directly below center) remains best, the VP2HM1 adjustment

								 ; moves from #8 to the first candidate of the below-center pattern, so the

								 ; next state must be 178 (start of that group), in the same way that state

								 ; 162 continues at 184 and state 165 continues at 172.  The previous value,

								 ; 176, is a terminal state inside the left-of-center group.

								 DB  VP2HM1,    HP1,  178, 166 ; 164: #8 worse/better than #6. Take best, go on.

								 DB     HM3,    HP1,  172, 166 ; 165: #8 worse/better than #7. Take best, go on.


								     ;     .   .   .   .   .   C = center.

								     ;     .   ~   ~   ~   2   ~ = tried, but not as good.

								     ;     .   ~   C   X   1   X = best so far.

								     ;     .   ~   ~   ~   3   # = order to try additional candidates.

								     ;     .   .   .   .   .


								 ; Best so far (X) is one pel right of center; try the three candidates in the

								 ; column to its right.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     VM1,    VM1,  167, 169 ; 166: #1 worse/better than  X.  Try #2.

								 DB     VP2,    VP2,  168, 171 ; 167: #2 worse/better than  X.  Try #3.

								 DB  VM1HM1,  NOADJ, 0FFH,0FFH ; 168: #3 worse/better than  X.  Take best, quit.

								 DB     VP2,    VP2,  170, 171 ; 169: #2 worse/better than #1.  Try #3.

								 DB     VM1,  NOADJ, 0FFH,0FFH ; 170: #3 worse/better than #1.  Take best, quit.

								 DB     VM2,  NOADJ, 0FFH,0FFH ; 171: #3 worse/better than #2.  Take best, quit.


								     ;     .   .   .   .   .   C = center.

								     ;     2   ~   ~   ~   .   ~ = tried, but not as good.

								     ;     1   X   C   ~   .   X = best so far.

								     ;     3   ~   ~   ~   .   # = order to try additional candidates.

								     ;     .   .   .   .   .


								 ; Best so far (X) is one pel left of center; try the three candidates in the

								 ; column to its left.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     VM1,    VM1,  173, 175 ; 172: #1 worse/better than  X.  Try #2.

								 DB     VP2,    VP2,  174, 177 ; 173: #2 worse/better than  X.  Try #3.

								 DB  VM1HP1,  NOADJ, 0FFH,0FFH ; 174: #3 worse/better than  X.  Take best, quit.

								 DB     VP2,    VP2,  176, 177 ; 175: #2 worse/better than #1.  Try #3.

								 DB     VM1,  NOADJ, 0FFH,0FFH ; 176: #3 worse/better than #1.  Take best, quit.

								 DB     VM2,  NOADJ, 0FFH,0FFH ; 177: #3 worse/better than #2.  Take best, quit.


								     ;     .   .   .   .   .   C = center.

								     ;     .   ~   ~   ~   .   ~ = tried, but not as good.

								     ;     .   ~   C   ~   .   X = best so far.

								     ;     .   ~   X   ~   .   # = order to try additional candidates.

								     ;     .   2   1   3   .


								 ; Best so far (X) is directly below center; try the three candidates in the

								 ; row beneath it.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     HM1,    HM1,  179, 181 ; 178: #1 worse/better than  X.  Try #2.

								 DB     HP2,    HP2,  180, 183 ; 179: #2 worse/better than  X.  Try #3.

								 DB  VM1HM1,  NOADJ, 0FFH,0FFH ; 180: #3 worse/better than  X.  Take best, quit.

								 DB     HP2,    HP2,  182, 183 ; 181: #2 worse/better than #1.  Try #3.

								 DB     HM1,  NOADJ, 0FFH,0FFH ; 182: #3 worse/better than #1.  Take best, quit.

								 DB     HM2,  NOADJ, 0FFH,0FFH ; 183: #3 worse/better than #2.  Take best, quit.


								     ;     .   2   1   3   .   C = center.

								     ;     .   ~   X   ~   .   ~ = tried, but not as good.

								     ;     .   ~   C   ~   .   X = best so far.

								     ;     .   ~   ~   ~   .   # = order to try additional candidates.

								     ;     .   .   .   .   .


								 ; Best so far (X) is directly above center; try the three candidates in the

								 ; row above it.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     HM1,    HM1,  185, 187 ; 184: #1 worse/better than  X.  Try #2.

								 DB     HP2,    HP2,  186, 189 ; 185: #2 worse/better than  X.  Try #3.

								 DB  VP1HM1,  NOADJ, 0FFH,0FFH ; 186: #3 worse/better than  X.  Take best, quit.

								 DB     HP2,    HP2,  188, 189 ; 187: #2 worse/better than #1.  Try #3.

								 DB     HM1,  NOADJ, 0FFH,0FFH ; 188: #3 worse/better than #1.  Take best, quit.

								 DB     HM2,  NOADJ, 0FFH,0FFH ; 189: #3 worse/better than #2.  Take best, quit.


								     ;     .   .   .   .   .   C = center.

								     ;     .   ~   ~   ~   .   ~ = tried, but not as good.

								     ;     1   ~   C   ~   .   X = best so far.

								     ;     2   X   ~   ~   .   # = order to try additional candidates.

								     ;     4   3   5   .   .


								 ; Best so far (X) is diagonally below-left of center; try the five candidates

								 ; that wrap around that corner.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     VP1,    VP1,  191, 195 ; 190: #1 worse/better than  X.  Try #2.

								 DB  VP1HP1, VP1HP1,  178, 192 ; 191: #2 worse/better than  X.  Try #3.

								 DB     HM1,    HM1,  193, 181 ; 192: #3 worse/better than #2.  Try #4.

								 DB     HP2,    HP2,  194, 183 ; 193: #4 worse/better than #2.  Try #5.

								 DB  VM1HM2,  NOADJ, 0FFH,0FFH ; 194: #5 worse/better than #2.  Take best, quit.

								 DB  VP1HP1, VP1HP1,  196, 192 ; 195: #2 worse/better than #1.  Try #3.

								 DB     HM1,    HM1,  197, 181 ; 196: #3 worse/better than #1.  Try #4.

								 DB     HP2,    HP2,  198, 183 ; 197: #4 worse/better than #1.  Try #5.

								 DB  VM2HM2,  NOADJ, 0FFH,0FFH ; 198: #5 worse/better than #1.  Take best, quit.


								     ;     .   .   .   .   .   C = center.

								     ;     .   ~   ~   ~   .   ~ = tried, but not as good.

								     ;     .   ~   C   ~   1   X = best so far.

								     ;     .   ~   ~   X   2   # = order to try additional candidates.

								     ;     .   .   4   3   5


								 ; Best so far (X) is diagonally below-right of center; try the five candidates

								 ; that wrap around that corner.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     VP1,    VP1,  200, 204 ; 199: #1 worse/better than  X.  Try #2.

								 DB  VP1HM1, VP1HM1,  178, 201 ; 200: #2 worse/better than  X.  Try #3.

								 DB     HM1,    HM1,  202, 181 ; 201: #3 worse/better than #2.  Try #4.

								 DB     HP2,    HP2,  203, 183 ; 202: #4 worse/better than #2.  Try #5.

								 DB     VM1,  NOADJ, 0FFH,0FFH ; 203: #5 worse/better than #2.  Take best, quit.

								 DB  VP1HM1, VP1HM1,  205, 201 ; 204: #2 worse/better than #1.  Try #3.

								 DB     HM1,    HM1,  206, 181 ; 205: #3 worse/better than #1.  Try #4.

								 DB     HP2,    HP2,  207, 183 ; 206: #4 worse/better than #1.  Try #5.

								 DB     VM2,  NOADJ, 0FFH,0FFH ; 207: #5 worse/better than #1.  Take best, quit.


								     ;     4   3   5   .   .   C = center.

								     ;     2   X   ~   ~   .   ~ = tried, but not as good.

								     ;     1   ~   C   ~   .   X = best so far.

								     ;     .   ~   ~   ~   .   # = order to try additional candidates.

								     ;     .   .   .   .   .


								 ; Best so far (X) is diagonally above-left of center; try the five candidates

								 ; that wrap around that corner.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     VM1,    VM1,  209, 213 ; 208: #1 worse/better than  X.  Try #2.

								 DB  VM1HP1, VM1HP1,  184, 210 ; 209: #2 worse/better than  X.  Try #3.

								 DB     HM1,    HM1,  211, 187 ; 210: #3 worse/better than #2.  Try #4.

								 DB     HP2,    HP2,  212, 189 ; 211: #4 worse/better than #2.  Try #5.

								 DB  VP1HM2,  NOADJ, 0FFH,0FFH ; 212: #5 worse/better than #2.  Take best, quit.

								 DB  VM1HP1, VM1HP1,  214, 210 ; 213: #2 worse/better than #1.  Try #3.

								 DB     HM1,    HM1,  215, 187 ; 214: #3 worse/better than #1.  Try #4.

								 DB     HP2,    HP2,  216, 189 ; 215: #4 worse/better than #1.  Try #5.

								 DB  VP2HM2,  NOADJ, 0FFH,0FFH ; 216: #5 worse/better than #1.  Take best, quit.


								     ;     .   .   4   3   5   C = center.

								     ;     .   ~   ~   X   2   ~ = tried, but not as good.

								     ;     .   ~   C   ~   1   X = best so far.

								     ;     .   ~   ~   ~   .   # = order to try additional candidates.

								     ;     .   .   .   .   .


								 ; Best so far (X) is diagonally above-right of center; try the five candidates

								 ; that wrap around that corner.  Columns: adjust-if-worse, adjust-if-better,

								 ; next-state-if-worse, next-state-if-better (same order as states 134..165).

								 DB     VM1,    VM1,  218, 222 ; 217: #1 worse/better than  X.  Try #2.

								 DB  VM1HM1, VM1HM1,  184, 219 ; 218: #2 worse/better than  X.  Try #3.

								 DB     HM1,    HM1,  220, 187 ; 219: #3 worse/better than #2.  Try #4.

								 DB     HP2,    HP2,  221, 189 ; 220: #4 worse/better than #2.  Try #5.

								 DB     VP1,  NOADJ, 0FFH,0FFH ; 221: #5 worse/better than #2.  Take best, quit.

								 DB  VM1HM1, VM1HM1,  223, 219 ; 222: #2 worse/better than #1.  Try #3.

								 DB     HM1,    HM1,  224, 187 ; 223: #3 worse/better than #1.  Try #4.

								 DB     HP2,    HP2,  225, 189 ; 224: #4 worse/better than #1.  Try #5.

								 DB     VP2,  NOADJ, 0FFH,0FFH ; 225: #5 worse/better than #1.  Take best, quit.


								FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR = 226


								 DB  VP1HP1, VP1HP1,  130, 130 ; 226: Redoing ctr, away from limiting edge.


								 DB  ?, ?, ?, ?, ?, ?


								; Table of values to add to SWDs for half pel reference macroblocks, to cause

								; those that are off the edge of the frame to produce artificially high SWDs.

								; (64 bytes;3040:3103)


								; 16 DWORD entries.  In entry i (0..15), byte k is 000H when bit k of i is set

								; and 0FFH when bit k is clear (entry 1 = 0FFFFFF00H, entry 8 = 000FFFFFFH).

								InvalidateBadHalfPelMVs LABEL DWORD


								  DD    0FFFFFFFFH, 0FFFFFF00H, 0FFFF00FFH, 0FFFF0000H

								  DD    0FF00FFFFH, 0FF00FF00H, 0FF0000FFH, 0FF000000H

								  DD    000FFFFFFH, 000FFFF00H, 000FF00FFH, 000FF0000H

								  DD    00000FFFFH, 00000FF00H, 0000000FFH, 000000000H


								; Tables (interleaved) to select case from next table (below these) to drive

								; the weighting of the future and past predictions in the construction of

								; B-frame reference blocks.

								; (448 bytes;3104:3551)


								; 224 interleaved (VertWtSel, HorzWtSel) byte pairs = 448 bytes.  VertWtSel
								; names the even bytes and HorzWtSel the odd bytes (HorzWtSel = VertWtSel + 1).
								; Runs of identical pairs are written with DUP; the pair numbers in the
								; comments count from 0 at VertWtSel.

								VertWtSel LABEL BYTE
								  DB   0                        ; pair   0 (vertical byte)
								HorzWtSel LABEL BYTE
								  DB   240                      ; pair   0 (horizontal byte)

								  DB  49 DUP ( 0, 240)          ; pairs   1.. 49

								  DB   2 DUP ( 1,   0)          ; pairs  50.. 51
								  DB   2 DUP ( 2,  16)          ; pairs  52.. 53
								  DB   2 DUP ( 3,  32)          ; pairs  54.. 55
								  DB   2 DUP ( 4,  48)          ; pairs  56.. 57
								  DB   2 DUP ( 5,  64)          ; pairs  58.. 59
								  DB   2 DUP ( 6,  80)          ; pairs  60.. 61
								  DB   2 DUP ( 7,  96)          ; pairs  62.. 63
								  DB  17 DUP ( 8, 112)          ; pairs  64.. 80
								  DB   2 DUP ( 9, 128)          ; pairs  81.. 82
								  DB   2 DUP (10, 144)          ; pairs  83.. 84
								  DB   2 DUP (11, 160)          ; pairs  85.. 86
								  DB   2 DUP (12, 176)          ; pairs  87.. 88
								  DB   2 DUP (13, 192)          ; pairs  89.. 90
								  DB   2 DUP (14, 208)          ; pairs  91.. 92
								  DB   2 DUP (15, 224)          ; pairs  93.. 94

								  DB  17 DUP ( 0, 240)          ; pairs  95..111
								  DB   0, 240  ; Chroma starts here  (pair 112)
								  DB  30 DUP ( 0, 240)          ; pairs 113..142
								  DB   0, 240  ; Luma ends here      (pair 143)
								  DB   2 DUP ( 0, 240)          ; pairs 144..145

								  DB   2 DUP ( 1,   0)          ; pairs 146..147
								  DB   2 DUP ( 2,  16)          ; pairs 148..149
								  DB   2 DUP ( 3,  32)          ; pairs 150..151
								  DB   2 DUP ( 4,  48)          ; pairs 152..153
								  DB   2 DUP ( 5,  64)          ; pairs 154..155
								  DB   2 DUP ( 6,  80)          ; pairs 156..157
								  DB   2 DUP ( 7,  96)          ; pairs 158..159
								  DB   8, 112                   ; pair  160
								  DB   2 DUP ( 9, 128)          ; pairs 161..162
								  DB   2 DUP (10, 144)          ; pairs 163..164
								  DB   2 DUP (11, 160)          ; pairs 165..166
								  DB   2 DUP (12, 176)          ; pairs 167..168
								  DB   2 DUP (13, 192)          ; pairs 169..170
								  DB   2 DUP (14, 208)          ; pairs 171..172
								  DB   2 DUP (15, 224)          ; pairs 173..174

								  DB  49 DUP ( 0, 240)          ; pairs 175..223


								; Table indexed by VertWtSel and HorzWtSel to get index of weight to apply to

								; future and past predictions in the construction of B-frame reference blocks

								; for frame differencing.

								; (264 bytes;3552:3815)

								;

								; Indexed by VertWtSel[VMV]+HorzWtSel[HMV]+N  to get idx of weight for line N.


								P8F0 =  0*8

								F1P7 =  1*8

								F2P6 =  2*8

								F3P5 =  3*8

								F4P4 =  4*8

								F5P3 =  5*8

								F6P2 =  6*8

								F7P1 =  7*8

								F8P0 =  8*8

								P1F7 =  9*8

								P2F6 = 10*8

								P3F5 = 11*8

								P4F4 = 12*8

								P5F3 = 13*8

								P6F2 = 14*8

								P7F1 = 15*8


								Diff_IdxRefWts LABEL BYTE

								; 33 rows of 8 identical bytes (264 bytes).  Rows alternate between P8F0 and
								; each of the other 15 weight indices in equate order (F1P7 .. F8P0, then
								; P1F7 .. P7F1); the last three rows are all P8F0.

								  DB  8 DUP (P8F0)
								  DB  8 DUP (F1P7)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (F2P6)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (F3P5)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (F4P4)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (F5P3)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (F6P2)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (F7P1)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (F8P0)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (P1F7)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (P2F6)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (P3F5)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (P4F4)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (P5F3)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (P6F2)
								  DB  8 DUP (P8F0)
								  DB  8 DUP (P7F1)
								  DB 24 DUP (P8F0)              ; final three all-P8F0 rows


								BFrmSWDState LABEL BYTE  ; State engine rules for finding best motion vector.

								; (48 bytes; 3816:3863)


								; 1st number:  Horizontal Motion displacement to try, in half pel increments.

								; 2nd number:  Vertical Motion displacement to try, in half pel increments.

								; 3rd number:  Next state to enter if previous best is still best.

								; 4th number:  Next state to enter if this motion is better than previous best.

								;

								; State numbers are byte offsets into this table (4 bytes per state).  The

								; displacements are relative to the current best vector: state 8 uses +4 to

								; get from a best of (-2, 0) to the candidate ( 2, 0).


								   DB    -2,   0,  4,  8   ;  0 -- ( 0, 0) Try (-2, 0)

								   DB     2,   0, 12, 12   ;  4 -- ( 0, 0) Try ( 2, 0)

								   DB     4,   0, 12, 12   ;  8 -- (-2, 0) Try ( 2, 0)

								   DB     0,  -2, 16, 20   ; 12 -- ( N, 0) Try ( N,-2)  (N = {-2,0,2})

								   DB     0,   2, 24, 24   ; 16 -- ( N, 0) Try ( N, 2)

								   DB     0,   4, 24, 24   ; 20 -- ( N,-2) Try ( N, 2)


								; States 24..44 repeat the pattern of states 0..20 with steps of 1 instead of

								; 2, around the best vector (H, V) found so far.  States 40 and 44 both name

								; state 0 as the next state.

								   DB    -1,   0, 28, 32   ; 24 -- ( H, V)  Try (H-1, V)

								   DB     1,   0, 36, 36   ; 28 -- ( H, V)  Try (H+1, V)

								   DB     2,   0, 36, 36   ; 32 -- (H-1, V) Try (H+1, V)

								   DB     0,  -1, 40, 44   ; 36 -- ( H, V)  Try (H, V-1)

								   DB     0,   1,  0,  0   ; 40 -- ( H, V)  Try (H, V+1)

								   DB     0,   2,  0,  0   ; 44 -- (H, V-1) Try (H, V+1)


								; Table used by Quant RLE to navigate the zigzag order of quantized coeffs.

								; Contents of this table are initialized by first entry to MMxEDTQ.  In

								; unlikely event of race condition, it will just get initialized by more

								; than one encoder instance.

								; (128 bytes; 3864:3991)


								NextZigZagCoeff LABEL BYTE


								  DB 128 DUP (0FFH)


								; Table used to initialize the table above.

								; (64 bytes: 3992:4055)


								; 64 bytes: the 63 Qrc symbols from Q01 through Q77 (Q00 is not listed),

								; followed by a single 0 byte.

								InitZigZagCoeff LABEL BYTE


								  DB Q01,Q10,Q20,Q11,Q02,Q03,Q12,Q21,Q30,Q40,Q31,Q22,Q13,Q04,Q05,Q14

								  DB Q23,Q32,Q41,Q50,Q60,Q51,Q42,Q33,Q24,Q15,Q06,Q07,Q16,Q25,Q34,Q43

								  DB Q52,Q61,Q70,Q71,Q62,Q53,Q44,Q35,Q26,Q17,Q27,Q36,Q45,Q54,Q63,Q72

								  DB Q73,Q64,Q55,Q46,Q37,Q47,Q56,Q65,Q74,Q75,Q66,Q57,Q67,Q76,Q77,  0


								; Constants needed by the Quant RLE phase.

								; (128 bytes; 4056:4183)


								Recip2QP LABEL DWORD

								; 32 entries, one per QP value 0..31; each entry is two identical WORDs.
								; Entry 0 is zero.  Every other entry is 04000H / QP with the fraction
								; discarded (QP = 1 -> 04000H, QP = 3 -> 01555H, QP = 31 -> 00210H), so the
								; non-zero entries are generated at assembly time instead of listed by hand.

								  WORD 0H, 0H                   ; QP = 000h

								Recip2QP_Divisor = 1            ; assembly-time counter, used only here

								REPT 31                         ; QP = 001h .. 01Fh
								  WORD 04000H / Recip2QP_Divisor, 04000H / Recip2QP_Divisor
								Recip2QP_Divisor = Recip2QP_Divisor + 1
								ENDM


								; Skip over space to get to where the following tables can go.  They will

								; hit the cache at the same point as a portion of the StateEngine states

								; that aren't used in the heuristic ME mode.

								; (2056 bytes; 4184:6239)


								  DB 2056 DUP (?)   ; Static space place-holder.


								; Table to select base address in next table below to use for particular block

								; of macroblock.  First column provides address of base element of HorzWtSel

								; to use to map horizontal MV to list of weighting indices to use.  Second

								; column is similar, but for Vertical MV.  Third and fourth columns not used.

								; 6 rows; one for each block in a macroblock.

								; (88 bytes; 6240:6327)


								; Rows 1-5 are 16 bytes apart; UpDownBlkPosition is LeftRightBlkPosition + 4,

								; so both labels index the same rows.  Row 6 carries only its two used

								; columns, and BlkEmptyFlag occupies the unused 4th column of row 5.

								LeftRightBlkPosition LABEL DWORD

								  DD HorzWtSel+0-64                                           ; row 1, column 1

								UpDownBlkPosition LABEL DWORD

								  DD                   VertWtSel+0-64,   0DEADBEEFH, 0DEADBEEFH ; row 1, columns 2-4

								  DD HorzWtSel+32-64,  VertWtSel+0-64,   0DEADBEEFH, 0DEADBEEFH ; row 2

								  DD HorzWtSel+0-64,   VertWtSel+32-64,  0DEADBEEFH, 0DEADBEEFH ; row 3

								  DD HorzWtSel+32-64,  VertWtSel+32-64,  0DEADBEEFH, 0DEADBEEFH ; row 4

								  DD HorzWtSel+128,    VertWtSel+128,    0DEADBEEFH             ; row 5 (presumably a chroma block -- confirm)

								BlkEmptyFlag LABEL BYTE  ; sneak this in here

								  DB       16, 0, 32, 0

								  DD HorzWtSel+128,    VertWtSel+128                          ; row 6 (presumably the other chroma block -- confirm)


								; The following table, indexed by MBEdgeType&7, returns a mask which is used to

								; zero-out the motion vectors for predictors that are off the edge of the

								; frame.  The index is a 3 bit value, each bit being set if the macroblock

								; is NOT on the corresponding edge.  1 == left;  2 == right;  4 == top;

								; The value gotten out is (where A==left; B==above; C==above right):

								;    <mask(A) mask(A) mask(C) mask(C) mask(B) mask(B) mask(A) mask(A)>

								; The mask is 0xFF if the corresponding remote block is NOT off the edge, and

								; 0x00 if it is off the edge.

								; (32 bytes: 6328: 6359)


								ValidRemoteVectors LABEL DWORD

								  DWORD 0DEADBEEFH   ;  0: Can't be on left and right edges at once.

								  DWORD 0FF0000FFH   ;  1: Top right corner.

								  DWORD 000000000H   ;  2: Top left corner.

								  DWORD 0FF0000FFH   ;  3: Top edge.

								  DWORD 0DEADBEEFH   ;  4: Can't be on left and right edges at once.

								  DWORD 0FF00FFFFH   ;  5: Right edge.

								  DWORD 000FFFF00H   ;  6: Left edge.

								  DWORD 0FFFFFFFFH   ;  7: Central macroblock.


								; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes

								; to be subtracted with saturation from the predicted motion vector for extended

								; motion vector search.  Since saturation occurs at 0, the values here are

								; such that the motion vectors are biased to the appropriate point for the

								; clamping effect.  The index is a 4 bit value, each bit being set if the

								; macroblock is NOT on the corresponding edge.  1 == left;  2 == right;

								; 4 == top;  8 == bottom.  The 8 values being calculated are as follows:

								;    ; [ 0: 7] -- HMV lower limit for signature search

								;    ; [ 8:15] -- HMV lower limit

								;    ; [16:23] -- HMV upper limit for signature search

								;    ; [24:31] -- HMV upper limit

								;    ; [32:39] -- VMV lower limit for signature search

								;    ; [40:47] -- VMV lower limit

								;    ; [48:55] -- VMV upper limit for signature search

								;    ; [56:63] -- VMV upper limit

								; (88 bytes: 6360:6447)


								EMV_ClampLowerEnd LABEL DWORD

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  0: Can't be on all edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  1: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  2: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  3: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  4: Can't be on left and right edges at once.

								  BYTE   87,  94,  97, 100,      ;  5: Bottom right corner.

								         87,  94,  97, 100

								  BYTE  119, 126,  97, 100,      ;  6: Bottom left corner.

								         87,  94,  97, 100

								  BYTE   87,  94,  97, 100,      ;  7: Bottom edge.

								         87,  94,  97, 100

								  DWORD 0DEADBEEFH, 0DEADBEEFH   ;  8: Can't be on left and right edges at once.

								  BYTE   87,  94,  97, 100,      ;  9: Top right corner.

								        119, 126,  97, 100

								  BYTE  119, 126,  97, 100,      ; 10: Top left corner.

								        119, 126,  97, 100

								  BYTE   87,  94,  97, 100,      ; 11: Top edge.

								        119, 126,  97, 100

								  DWORD 0DEADBEEFH, 0DEADBEEFH   ; 12: Can't be on left and right edges at once.

								  BYTE   87,  94,  97, 100,      ; 13: Right edge.

								         87,  94,  97, 100

								  BYTE  119, 126,  97, 100,      ; 14: Left edge.

								         87,  94,  97, 100

								  BYTE   87,  94,  97, 100,      ; 15: Central macroblock.

								         87,  94,  97, 100


								; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes

								; to be added with saturation to the result of the application of the preceding

								; table, to clamp the upper limit on the motion vector search parameters.

								; Since saturation occurs at 255, the values here are such that the motion

								; vectors are biased to the appropriate point for the clamping effect.

								; (88 bytes: 6448:6535)


								EMV_ClampUpperEnd LABEL DWORD

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  0: Can't be on all edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  1: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  2: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  3: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  4: Can't be on left and right edges at once.

								  BYTE  184, 193, 216, 225,      ;  5: Bottom right corner.

								        184, 193, 216, 225

								  BYTE  216, 225, 184, 193,      ;  6: Bottom left corner.

								        184, 193, 216, 225

								  BYTE  184, 193, 184, 193,      ;  7: Bottom edge.

								        184, 193, 216, 225

								  DWORD 0DEADBEEFH, 0DEADBEEFH   ;  8: Can't be on left and right edges at once.

								  BYTE  184, 193, 216, 225,      ;  9: Top right corner.

								        216, 225, 184, 193

								  BYTE  216, 225, 184, 193,      ; 10: Top left corner.

								        216, 225, 184, 193

								  BYTE  184, 193, 184, 193,      ; 11: Top edge.

								        216, 225, 184, 193

								  DWORD 0DEADBEEFH, 0DEADBEEFH   ; 12: Can't be on left and right edges at once.

								  BYTE  184, 193, 216, 225,      ; 13: Right edge.

								        184, 193, 184, 193

								  BYTE  216, 225, 184, 193,      ; 14: Left edge.

								        184, 193, 184, 193

								  BYTE  184, 193, 184, 193,      ; 15: Central macroblock.

								        184, 193, 184, 193


								; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes

								; to be added without saturation to the result of the application of the

								; preceding table, to return the motion vector search parameters to the

								; proper range for subsequent use.

								; (88 bytes: 6536:6623)


								EMV_RestoreRange LABEL DWORD

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  0: Can't be on all edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  1: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  2: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  3: Can't be on top and bottom edges at once.

								; DWORD 0DEADBEEFH, 0DEADBEEFH   ;  4: Can't be on left and right edges at once.

								  BYTE  120, 255,  88, 225,      ;  5: Bottom right corner.

								        120, 255,  88, 225

								  BYTE  120, 255,  56, 193,      ;  6: Bottom left corner.

								        120, 255,  88, 225

								  BYTE  120, 255,  56, 193,      ;  7: Bottom edge.

								        120, 255,  88, 225

								  DWORD 0DEADBEEFH, 0DEADBEEFH   ;  8: Can't be on left and right edges at once.

								  BYTE  120, 255,  88, 225,      ;  9: Top right corner.

								        120, 255,  56, 193

								  BYTE  120, 255,  56, 193,      ; 10: Top left corner.

								        120, 255,  56, 193

								  BYTE  120, 255,  56, 193,      ; 11: Top edge.

								        120, 255,  56, 193

								  DWORD 0DEADBEEFH, 0DEADBEEFH   ; 12: Can't be on left and right edges at once.

								  BYTE  120, 255,  88, 225,      ; 13: Right edge.

								        120, 255,  56, 193

								  BYTE  120, 255,  56, 193,      ; 14: Left edge.

								        120, 255,  56, 193

								  BYTE  120, 255,  56, 193,      ; 15: Central macroblock.

								        120, 255,  56, 193


								; Tables indexed by indices fetched from Diff_IdxRefWts.  These tables return

								; a multiplier to apply to past or future predictions to construct the

								; B-frame candidate reference blocks.

								; (128 bytes;6624:6751)


								; 16 entries of 8 bytes each (low DWORD first).  Entry k sits at byte offset

								; k*8, which is the value of the weight-index equate named on each row.

								; The number of 0FFH bytes in an entry matches the F digit of that equate.

								FutureWt_FF_or_00 LABEL DWORD


								  DD 000000000H, 000000000H   ; P8F0: no byte set

								  DD 000000000H, 0FF000000H   ; F1P7: byte 7 set

								  DD 000000000H, 0FFFF0000H   ; F2P6: bytes 6-7 set

								  DD 000000000H, 0FFFFFF00H   ; F3P5: bytes 5-7 set

								  DD 000000000H, 0FFFFFFFFH   ; F4P4: bytes 4-7 set

								  DD 0FF000000H, 0FFFFFFFFH   ; F5P3: bytes 3-7 set

								  DD 0FFFF0000H, 0FFFFFFFFH   ; F6P2: bytes 2-7 set

								  DD 0FFFFFF00H, 0FFFFFFFFH   ; F7P1: bytes 1-7 set

								  DD 0FFFFFFFFH, 0FFFFFFFFH   ; F8P0: all 8 bytes set

								  DD 0FFFFFFFFH, 000FFFFFFH   ; P1F7: bytes 0-6 set

								  DD 0FFFFFFFFH, 00000FFFFH   ; P2F6: bytes 0-5 set

								  DD 0FFFFFFFFH, 0000000FFH   ; P3F5: bytes 0-4 set

								  DD 0FFFFFFFFH, 000000000H   ; P4F4: bytes 0-3 set

								  DD 000FFFFFFH, 000000000H   ; P5F3: bytes 0-2 set

								  DD 00000FFFFH, 000000000H   ; P6F2: bytes 0-1 set

								  DD 0000000FFH, 000000000H   ; P7F1: byte 0 set


								MMXMEDATA ENDS


								;=============================================================================


								.CODE EDTQ


								ASSUME cs : FLAT

								ASSUME ds : FLAT

								ASSUME es : FLAT

								ASSUME fs : FLAT

								ASSUME gs : FLAT

								ASSUME ss : FLAT


								EXTERN MMxDoForwardDCT:NEAR

								EXTERN MMxDoForwardDCTx:NEAR

								EXTERN MMxDoForwardDCTy:NEAR

								IFDEF H261

								ELSE

								EXTERN MMxDoBFrameLumaBlocks:NEAR

								EXTERN MMxDoBFrameChromaBlocks:NEAR

								ENDIF


								MMxEDTQ  proc C AMBAS:   DWORD,

								ATarg:   DWORD,

								APrev:   DWORD,

								ABTarg:  DWORD,

								AWtFwd:  DWORD,

								AWtBwd:  DWORD,

								AFrmWd:  DWORD,

								ADoHalf: DWORD,

								ADoBlk:  DWORD,

								ADoSF:   DWORD,

								ADoAP:   DWORD,

								ADoB:    DWORD,

								ADoLuma: DWORD,

								ADoExtMV:DWORD,

								AQP:     DWORD,

								ABQP:    DWORD,

								AB0VecT: DWORD,

								ASpaFilT:DWORD,

								ASpaFilD:DWORD,

								ASWDTot: DWORD,

								ABSWDTot:DWORD,

								ACodStr: DWORD,

								ABCodStr:DWORD


								LocalFrameSize = 1536   ; Space needed for locals


								RegStoSize = 16


								; Arguments:


								MBlockActionStream_arg       = RegStoSize +   4

								TargetFrameBaseAddress_arg   = RegStoSize +   8

								PreviousFrameBaseAddress_arg = RegStoSize +  12

								BTargetFrameBaseAddress_arg  = RegStoSize +  16

								SignatureBaseAddress_arg     = RegStoSize +  20

								WeightForwardMotion_arg      = RegStoSize +  24

								WeightBackwardMotion_arg     = RegStoSize +  28

								FrameWidth                   = RegStoSize +  32

								DoHalfPelEstimation_arg      = RegStoSize +  36

								DoBlockLevelVectors_arg      = RegStoSize +  40

								DoSpatialFiltering_arg       = RegStoSize +  44

								DoAdvancedPrediction_arg     = RegStoSize +  48

								DoBFrame_arg                 = RegStoSize +  52

								DoLumaBlocksInThisPass_arg   = RegStoSize +  56

								DoExtendedMotionVectors_arg  = RegStoSize +  60

								QuantizationLevel            = RegStoSize +  64

								BQuantizationLevel           = RegStoSize +  68

								BFrmZeroVectorThreshold_arg  = RegStoSize +  72

								SpatialFiltThreshold_arg     = RegStoSize +  76

								SpatialFiltDifferential_arg  = RegStoSize +  80

								PSWDTotal                    = RegStoSize +  84

								PBSWDTotal                   = RegStoSize +  88

								CodeStreamCursor_arg         = RegStoSize +  92

								BCodeStreamCursor_arg        = RegStoSize +  96

								EndOfArgList                 = RegStoSize + 100


								StackOffset TEXTEQU <0>

								CONST_384   TEXTEQU <384>


								  ; Save four registers.  4 pushes * 4 bytes = 16 = RegStoSize, which is why
								  ; every argument offset is written as RegStoSize + n.
								  push  esi

								  push  edi

								  push  ebp

								  push  ebx


								; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
								; esi keeps the pre-adjustment esp; arguments are read through it, and it is
								; written to StashESP further on.


								  mov        esi,esp                ; esi = esp after the four pushes.

								   and       esp,0FFFFF000H         ; Round esp down to a 4096-byte boundary,

								  sub        esp,000000FE0H         ; then move 0FE0H below it:  the low 12
								  ;                                 ; bits of esp are now always 020H.

								IFDEF H261


								   mov       ebp,PITCH


								CONST_384   TEXTEQU <ebp>


								  mov        eax,[esi+SpatialFiltThreshold_arg]

								   mov       ebx,[esi+SpatialFiltDifferential_arg]

								  mov        SpatialFiltThreshold,eax

								   mov       SpatialFiltDifferential,ebx

								  mov        ecx,[esi+TargetFrameBaseAddress_arg]

								   mov       ebx,[esi+SignatureBaseAddress_arg]

								  sub        ecx,ebx

								   mov       eax,[esi+TargetFrameBaseAddress_arg]

								  mov        SigToTarget,ecx

								   add       ecx,PITCH*80+64

								  neg        ecx

								  mov        TargetToSig_Debiased,ecx

								   mov       ebx,[esi+PreviousFrameBaseAddress_arg]

								  mov        PreviousFrameBaseAddress,ebx

								   mov       TargetFrameBaseAddress,eax

								  sub        ebx,eax

								   mov       ecx,[esi+QuantizationLevel]

								  mov        TargToRef,ebx

								   mov       eax,[esi+CodeStreamCursor_arg]

								  mov        ebx,ecx

								   mov       CodeStreamCursor,eax

								  shl        ebx,16

								   xor       edx,edx

								  or         ebx,ecx

								   mov       ecx,Recip2QP[ecx*4]

								  mov        QPDiv2,ebx

								   mov       Recip2QPToUse,ecx

								  mov        eax,[esi+DoSpatialFiltering_arg]

								   mov       DoExtendedMotionVectors,edx

								  test       eax,eax

								   je        @f

								  mov        eax,3

								@@:

								  mov        DoSpatialFiltering,al

								   mov       SWDTotal,edx

								  mov        BestMBHalfPelMV,edx

								   mov       ebx,PreviousFrameBaseAddress

								  mov        BlockAbove[0],edx

								   sub       ebx,16

								  mov        edx,[esi+FrameWidth]

								   mov       SpatiallyFilteredMB,ebx

								  imul       edx,-SIZEOF T_MacroBlockActionDescr/16

								  add        edx,2*SIZEOF T_Blk

								   mov       eax,14           ; 14 if restricted MVs and doing heuristic ME.

								  mov        BlockAbove[4],edx

								   mov       DoHeuristicME,eax


								ELSE


								   mov       eax,[esi+DoExtendedMotionVectors_arg]

								  test       eax,eax

								   je        @f

								  mov        eax,7

								@@:

								  mov        DoExtendedMotionVectors,eax

								   mov       eax,[esi+BFrmZeroVectorThreshold_arg]

								  mov        edi,[esi+WeightForwardMotion_arg]

								   mov       BFrmZeroVectorThreshold,eax

								  ; Build the local WeightForwardMotion table and then, by a second trip
								  ; through the same loop, the local WeightBackwardMotion table.
								  ; On entry edi = [esi+WeightForwardMotion_arg].
								  ; Each pass handles 4 bytes per iteration for ecx = 60, 56, ..., 0:
								  ;   source bytes   0.. 63  ->  local table bytes 192..255
								  ;   source bytes  64..127  ->  local table bytes   0.. 63
								  ; Every byte is masked to 6 bits and XORed with 60H.  That equals
								  ; treating the 6 bits as a signed value and adding 96.
								  mov        ecx,60

								   mov       ebx,060606060H    ; 60H in each of the four byte lanes.

								  lea        edx,WeightForwardMotion+128

								@@:

								   mov       eax,[edi+ecx]     ; 4 bytes from the first 64 source bytes.

								  and        eax,03F3F3F3FH    ; Keep the low 6 bits of each byte.

								   mov       ebp,[edi+ecx+64]  ; 4 bytes from the second 64 source bytes.

								  and        ebp,03F3F3F3FH    ; Keep the low 6 bits of each byte.

								   xor       eax,ebx

								  xor        ebp,ebx

								   mov       [edx+ecx+64],eax  ; edx is table+128, so this is table+192+ecx.

								  mov        [edx+ecx-128],ebp ; This is table+ecx.

								   sub       ecx,4

								  mov        ebp,PITCH         ; ebp was scratch above; restore PITCH.  mov
								  ;                            ; leaves flags alone, so jge tests the sub.

								   jge       @b


								  mov        edi,[esi+WeightBackwardMotion_arg]

								   mov       eax,edx           ; Table address used by the pass just done.

								  lea        edx,WeightBackwardMotion+128

								   mov       ecx,60

								  sub        eax,edx           ; Non-zero after the forward pass; zero after
								  ;                            ; the backward pass (edx did not change).

								   jne       @b                ; So the loop runs exactly twice in total.


								CONST_384   TEXTEQU <ebp>


								  mov        ebx,[esi+PreviousFrameBaseAddress_arg]

								   mov       eax,[esi+TargetFrameBaseAddress_arg]

								  mov        PreviousFrameBaseAddress,ebx

								   mov       TargetFrameBaseAddress,eax

								  mov        ecx,[esi+BTargetFrameBaseAddress_arg]

								   sub       ebx,eax

								  mov        TargToRef,ebx

								   sub       eax,ecx

								  mov        BFrameBaseAddress,ecx

								   mov       BFrameToFuture,eax

								  mov        ecx,[esi+TargetFrameBaseAddress_arg]

								   mov       ebx,[esi+SignatureBaseAddress_arg]

								  sub        ecx,ebx

								   mov       edx,[esi+FrameWidth]

								  mov        SigToTarget,ecx

								   add       ecx,PITCH*80+64

								  neg        ecx

								  imul       edx,-SIZEOF T_MacroBlockActionDescr/16

								  mov        TargetToSig_Debiased,ecx

								   mov       ecx,[esi+DoBFrame_arg]

								  add        edx,2*SIZEOF T_Blk

								   xor       cl,1

								  mov        BlockAbove[4],edx

								   mov       IsPlainPFrame,cl

								  mov        ecx,[esi+QuantizationLevel]

								   mov       eax,[esi+CodeStreamCursor_arg]

								  mov        ebx,ecx

								   mov       CodeStreamCursor,eax

								  mov        eax,[esi+BCodeStreamCursor_arg]

								   mov       BCodeStreamCursor,eax

								  shl        ebx,16

								   mov       eax,[esi+DoHalfPelEstimation_arg]

								  or         ebx,ecx

								   mov       ecx,Recip2QP[ecx*4]

								  mov        QPDiv2,ebx

								   mov       Recip2QPToUse,ecx

								  mov        ecx,[esi+BQuantizationLevel]

								   xor       edx,edx

								  mov        ebx,ecx

								  shl        ebx,16

								   mov       BestMBHalfPelMV,edx

								  or         ebx,ecx

								   mov       ecx,Recip2QP[ecx*4]

								  mov        BQPDiv2,ebx

								   mov       BRecip2QPToUse,ecx

								  test       eax,eax

								   je        @f

								  mov        eax,-4

								@@:

								  mov        DoHalfPelME,eax

								   mov       eax,[esi+DoBlockLevelVectors_arg]

								  mov        DoBlockLevelVectors,al

								   mov       eax,[esi+DoAdvancedPrediction_arg]

								  mov        DoAdvancedPrediction,al

								   mov       SWDTotal,edx

								  test       eax,eax

								   lea       eax,[eax+14]     ; 14 if restricted MVs and doing heuristic ME.

								  je         @f

								  xor        eax,eax          ; 0 if unrestricted MVs and doing heuristic ME.

								@@:

								  mov        DoHeuristicME,eax

								   mov       BSWDTotal,edx

								  mov        PendingOBMC,edx

								   mov       BlockAbove[0],edx

								ENDIF

								  mov        eax,01E98E268H

								  mov        EMVLimitsForThisMB,eax

								  ;               ; [ 0: 7] -- HMV lower limit for sig search (biased 128)

								  ;               ; [ 8:15] -- HMV lower limit (signed)

								  ;               ; [16:23] -- HMV upper limit for sig search (biased 128)

								  ;               ; [24:31] -- HMV upper limit (signed)

								   mov       EMVLimitsForThisMB+4,eax ; Same as for HMV.

								  mov        edx,[esi+MBlockActionStream_arg]

								   ; One-time build of the NextZigZagCoeff table.  The build is skipped
								   ; when the entry at index Q77 already reads zero.
								   ; NOTE(review): presumably that entry is non-zero until the chain below
								   ; has been written -- confirm against the data segment.
								   mov       al,NextZigZagCoeff[Q77]

								  test       al,al

								   je        ZigZagCoeffInitialized


								  xor        ecx,ecx           ; ecx = index of the entry to write; start at 0.

								   lea       ebx,InitZigZagCoeff

								  xor        eax,eax           ; Clear upper bytes; only al is loaded below.


								@@:


								  ; Each byte read from InitZigZagCoeff is stored at NextZigZagCoeff[ecx]
								  ; and then becomes the next ecx.  The loop ends right after a zero byte
								  ; has been stored.
								  mov        al,[ebx]

								   inc       ebx

								  mov        NextZigZagCoeff[ecx],al

								   mov       ecx,eax

								  test       eax,eax

								   jne       @b


								ZigZagCoeffInitialized:


								  mov        StashESP,esi

								   mov       eax,[esi+DoLumaBlocksInThisPass_arg]

								  test       eax,eax

								   jne       FirstMacroBlock   ; Jump if doing luma plane


								  jmp        FirstMacroBlock_ChromaProcessing


								; Common tail of chroma processing for one macroblock.  Despite the label
								; name, the inter-coded chroma path also jumps here when it has finished
								; its V block.
								; edx -- MBlockActionStream (current macroblock descriptor).
								IntraCodedChromaProcessingDone:


								IFDEF H261

								ELSE

								  mov        al,IsPlainPFrame      ; Low byte of the DoBFrame argument XOR 1.

								  test       al,al

								   jne       NextMacroBlock_ChromaProcessing   ; Non-zero:  skip B-frame chroma.


								  mov        eax,QPDiv2            ; eax and ebx are loaded with the two
								  ;                                ; quantizer values just before the call.

								   mov       ebx,BQPDiv2


								  call       MMxDoBFrameChromaBlocks

								ENDIF


								NextMacroBlock_ChromaProcessing:


								  mov        bl,[edx].CodedBlocks  ; Read flags of the macroblock just done.

								   sub       edx,-SIZEOF T_MacroBlockActionDescr   ; Advance edx to the next
								   ;                               ; descriptor (subtracting a negative adds).

								  and        bl,040H               ; Check for end-of-stream

								   jne       TrulyDone             ; Bit 40H was set on the macroblock just done.


								FirstMacroBlock_ChromaProcessing:


								  mov        al,[edx].BlockType         ; Chroma handling.  Intra?  Or Inter?

								   mov       ecx,TargetFrameBaseAddress

								  cmp        al,INTRA

								   jne       ChromaIsInterCoded


								  mov        esi,[edx].BlkU.BlkOffset

								   mov       StashBlockType,al

								  add        esi,ecx

								   push      eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>


								  call       MMxDoForwardDCT       ; Block is in target frame;  Pitch is PITCH


								  shl        bl,4

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   mov       esi,[edx].BlkV.BlkOffset

								  mov        [edx].CodedBlocks,al

								   mov       ecx,TargetFrameBaseAddress

								  add        esi,ecx


								  call       MMxDoForwardDCT       ; Block is in target frame;  Pitch is PITCH


								  shl        bl,5

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   pop       ecx                   ; Adjust stack pointer

								StackOffset TEXTEQU <0>

								  mov        [edx].CodedBlocks,al

								   jmp       IntraCodedChromaProcessingDone


								ChromaIsInterCoded:


								  mov        edi,[edx].BlkU.BlkOffset   ; Get address of next macroblock to do.

								   mov       ebx,[edx].BlkU.MVs

								  add        edi,ecx

								   mov       esi,[edx].BlkU.PastRef

								  mov        StashBlockType,al

								IFDEF H261

								   mov       ecx,2+256*1        ; cl==2 tells SpatialLoopFilter code to do one

								   ;                            ; block.  ch==1 causes it to return to here.

								  mov        TargetMacroBlockBaseAddr,edi  ; Store address of U block.

								   cmp       al,INTERSLF

								  je         DoSpatialFilterForChroma


								ReturnFromSpatialFilterForU:


								ENDIF


								  call       DoNonOBMCDifferencing


								                                 ; (Finish differencing the last four lines.)

								  movq       mm4,[edi+ebp*4]     ; T4

								   psrlq     mm1,1

								  movq       mm5,[edi+PITCH*5]

								   psubb     mm4,mm0             ; D4 = T4 - P4

								  movq       mm0,[edi+PITCH*6]

								   psubb     mm5,mm1

								  movq       mm1,[edi+PITCH*7]

								   pand      mm2,mm6

								  pand       mm3,mm6

								   psrlq     mm2,1

								  movq       PelDiffsLine4,mm4   ; Store D4.

								   psubb     mm0,mm2

								  movq       PelDiffsLine5,mm5

								   psrlq     mm3,1

								  movq       PelDiffsLine6,mm0

								   psubb     mm1,mm3

								  push       eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>


								  call       MMxDoForwardDCTx      ; Block is in PelDiffs block;  Pitch is 16


								  shl        bl,4

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   mov       ecx,TargetFrameBaseAddress

								  mov        [edx].CodedBlocks,al

								   pop       edi                   ; Adjust stack pointer

								StackOffset TEXTEQU <0>

								  ; Set up the V block of an inter-coded macroblock, mirroring the U block
								  ; set-up earlier.  ecx holds TargetFrameBaseAddress at this point.
								  mov        edi,[edx].BlkV.BlkOffset   ; Offset of V block in target frame.

								   mov       ebx,[edx].BlkV.MVs

								  add        edi,ecx                    ; edi = address of V block in target.

								   mov       esi,[edx].BlkV.PastRef

								IFDEF H261

								   mov       ecx,2-256*1        ; cl==2 tells SpatialLoopFilter code to do one

								   ;                            ; block.  ch==-1 causes it to return to here.

								  mov        TargetMacroBlockBaseAddr,edi  ; Store address of V block.

								   mov       al,[edx].BlockType ; Reload;  al was reused for CodedBlocks.

								  cmp        al,INTERSLF

								   je        DoSpatialFilterForChroma


								ReturnFromSpatialFilterForV:


								ENDIF


								  call       DoNonOBMCDifferencing


								                                 ; (Finish differencing the last four lines.)

								  movq       mm4,[edi+ebp*4]     ; T4

								   psrlq     mm1,1

								  movq       mm5,[edi+PITCH*5]

								   psubb     mm4,mm0             ; D4 = T4 - P4

								  movq       mm0,[edi+PITCH*6]

								   psubb     mm5,mm1

								  movq       mm1,[edi+PITCH*7]

								   pand      mm2,mm6

								  pand       mm3,mm6

								   psrlq     mm2,1

								  movq       PelDiffsLine4,mm4     ; Store D4.

								   psubb     mm0,mm2

								  movq       PelDiffsLine5,mm5

								   psrlq     mm3,1

								  movq       PelDiffsLine6,mm0

								   psubb     mm1,mm3

								  push       eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>


								  call       MMxDoForwardDCTx      ; Block is in PelDiffs block;  Pitch is 16


								  shl        bl,5

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   pop       ecx                   ; Adjust stack pointer

								StackOffset TEXTEQU <0>

								  mov        [edx].CodedBlocks,al

								   jmp       IntraCodedChromaProcessingDone


								;============================================================================

								;  Here we copy the target macroblock, and interpolate left, right, and both.

								;  We also accumulate the target pels for each block.  Result is four partial

								;  sums in four packed words.  After summing them all up, the final sum will

								;  be the sum of the 64 pels of each block, divided by 2.


								; Luma pass:  step to the next macroblock descriptor, or leave the loop.
								; NOTE(review): the banner comment just above this label describes copying
								; and interpolating the target macroblock.  The instructions that follow
								; only advance the stream and compute addresses, so the banner may be stale.
								NextMacroBlock:


								  mov        bl,[edx].CodedBlocks  ; Read flags of the macroblock just done.

								   sub       edx,-SIZEOF T_MacroBlockActionDescr   ; Advance edx to the next
								   ;                               ; descriptor (subtracting a negative adds).

								  and        bl,040H               ; Check for end-of-stream

								   jne       Done


								; Entry point for the first macroblock of the luma pass (no advance).
								FirstMacroBlock:


								  mov        edi,TargetFrameBaseAddress

								   mov       esi,[edx].BlkY1.BlkOffset   ; Get address of next macroblock to do.

								  add        edi,esi               ; edi = address of target macroblock (Y1).

								   mov       esi,TargToRef         ; Previous frame base minus target frame base.

								  add        esi,edi               ; esi = same position in the previous frame.

								   mov       TargetMacroBlockBaseAddr,edi

								  mov        Addr0MVRef,esi        ; Reference address for the 0 motion vector.


								;============================================================================

								; We calculate the 0-motion SWD.  We use 32 match points per block, and

								; write the result separately for each block.  If the SWD for the 0-motion

								; vector is below a threshold, we don't bother searching for other possibly

								; better motion vectors.

								;

								;  ebp -- PITCH

								;  esi -- Address of ref block.

								;  edi -- Address of target block.

								;  edx -- MBlockActionStream

								;  ecx -- Not used.  Will be linearized MV in non-zero MV search.

								;  ebx -- CurrSWDState, i.e. FirstMEState, times 8

								;  eax -- Scratch

								;  mm7 -- Best SWD for macroblock.

								;  mm0-mm6 Scratch

								;


								   mov       cl,[edx].CodedBlocks        ; Init CBP for macroblock.

								  or         cl,03FH                     ; Indicate all 6 blocks are coded.

								   mov       eax,DoHeuristicME           ; 0  if unrestricted MVs and heur ME.

								   ;                                     ; 14 if restricted MVs and heur ME.

								   ;                                     ; 15 if suppressing heuristic ME.

								  mov        [edx].CodedBlocks,cl

								   js        IntraByDecree


								  xor        ebx,ebx                     ; Avoid partial register stall.

								   xor       ecx,ecx

								  mov        cl,[edx].MBEdgeType         ; 1 left | 2 right | 4 top | 8 bottom

								   pcmpeqd   mm7,mm7                     ; Init previous best SWD to huge.

								  mov        bl,[edx].FirstMEState       ; Test for INTRA-BY-DECREE.

								   sub       eax,ecx                     ; Negative iff should do heuristic ME

								   ;                                     ; for this macroblock.

								  test       bl,bl

								   je        IntraByDecree


								  sar        eax,31

								   psrlq     mm7,2

								  or         ebx,eax                     ; -1 if doing heuristic ME.

								   mov       al,INTER1MV                 ; Speculate INTER, 1 motion vector.

								  mov        [edx].BlockType,al

								   psrld     mm7,14       ; mm7[32:63]:  Previous best SWD = 0x0000FFFF.

								   ;                      ; mm7[ 0:31]:  Prev SWD that we diminish = 0x0003FFFF.

								   ;                      ; Since we can't diminish it below 0x00020000, we

								   ;                      ; won't take the short circuit exit from MblkEstQWA.


								; At this point:

								;  ebp -- PITCH

								;  esi -- Address of upper left block of 0,0 ref area.

								;  edi -- Address of upper left block of target.

								;  edx -- MBlockActionStream

								;  ecx -- Scratch

								;  ebx -- CurrSWDState, i.e. FirstMEState.

								;  eax -- Scratch

								;  mm7 -- Previous best SWD initialized to huge (0xFFFF, 0x3FFFF).

								;  mm0-mm6 -- Scratch


								;============================================================================

								; Compute SWD for macroblock.


								ComputeMBSWD:


								;  Registers at this point:

								;  ebp -- PITCH

								;  esi -- Address of upper left block of candidate ref area.

								;  edi -- Address of upper left block of target.

								;  edx -- MBlockActionStream

								;  ecx -- Scratch

								;  ebx -- CurrSWDState

								;  eax -- Scratch

								;  mm7 -- Previous best SWD.

								;  mm0-mm6 -- Scratch

								;


								  lea        ecx,[ebp+ebp*4]       ; Get PITCH*5

								   lea       eax,[ebp+ebp*2]       ; Get PITCH*3

								  movq       mm0,[esi+PITCH*15]    ; FL A:  Ref MB, lower left block, line 15.

								  psubw      mm0,[edi+PITCH*15]    ; FL B:  Diff for lower left block, line 15.

								  movq       mm6,[esi+PITCH*15+8]  ; FR A

								   psllw     mm0,8                 ; FL C:  Extract diffs for line 15 even pels.

								  psubw      mm6,[edi+PITCH*15+8]  ; FR B

								   pmaddwd   mm0,mm0               ; FL D:  Square of diffs for even pels.

								  movq       mm1,[esi+PITCH*9]     ; 9L A

								   psllw     mm6,8                 ; FR C

								  psubw      mm1,[edi+PITCH*9]     ; 9L B

								   pmaddwd   mm6,mm6               ; FR D

								  movq       mm5,[esi+PITCH*9+8]   ; 9R A

								   psllw     mm1,8                 ; 9L C

								  psubw      mm5,[edi+PITCH*9+8]   ; 9R B

								   pmaddwd   mm1,mm1               ; 9L D

								  movq       mm2,[esi+eax*4]       ; CL a

								   psllw     mm5,8                 ; 9R C

								  psubw      mm2,[edi+eax*4]       ; CL b

								   pmaddwd   mm5,mm5               ; 9R D

								  movq       mm3,[esi+eax*4+8]     ; CR a

								   pmaddwd   mm2,mm2               ; CL c:  Square of diffs for odd pels.

								  psubw      mm3,[edi+eax*4+8]     ; CR b

								   paddusw   mm0,mm1               ; LL +   Accumulate SWD for lower left block.

								  movq       mm1,[esi+eax*1]       ; 3L A

								   pmaddwd   mm3,mm3               ; CR c

								  psubw      mm1,[edi+eax*1]       ; 3L B

								   paddusw   mm6,mm5               ; LR +

								  movq       mm5,[esi+eax*1+8]     ; 3R A

								   psllw     mm1,8                 ; 3L C

								  psubw      mm5,[edi+eax*1+8]     ; 3R B

								   paddusw   mm0,mm2               ; LL +

								  movq       mm2,[esi]             ; 0L a

								   pmaddwd   mm1,mm1               ; 3L D

								  psubw      mm2,[edi]             ; 0L b

								   paddusw   mm6,mm3               ; LR +

								  movq       mm3,[esi+8]           ; 0R a

								   psllw     mm5,8                 ; 3R C

								  psubw      mm3,[edi+8]           ; 0R b

								   pmaddwd   mm5,mm5               ; 3R D

								  movq       mm4,[esi+eax*2]       ; 6L a

								   pmaddwd   mm2,mm2               ; 0L c

								  psubw      mm4,[edi+eax*2]       ; 6L b

								   pmaddwd   mm3,mm3               ; 0R c

								  movq       PartSWDForLLBlk,mm0   ;       Stash SWD for lines 9,12,15, LL blk.

								   paddusw   mm0,mm6               ;       Sum SWD for lines 9,12,15 LL and LR.

								  movq       PartSWDForLRBlk,mm6   ;       Stash SWD for lines 9,12,15, LR blk.

								   pmaddwd   mm4,mm4               ; 6L c

								  movq       mm6,[esi+eax*2+8]     ; 6R a

								   paddusw   mm1,mm2               ; UL +

								  psubw      mm6,[edi+eax*2+8]     ; 6R b

								   paddusw   mm5,mm3               ; UR +

								  movq       mm2,[esi+ebp*1]       ; 1L A

								   pmaddwd   mm6,mm6               ; 6R c

								  psubw      mm2,[edi+ebp*1]       ; 1L B

								   paddusw   mm1,mm4               ; UL +

								  movq       mm3,[esi+ecx*1]       ; 5L A

								   paddusw   mm0,mm1               ;       Sum partial SWD for LL, LR, and UL.

								  psubw      mm3,[edi+ecx*1]       ; 5L B

								   paddusw   mm5,mm6               ; UR +

								  movq       mm6,[esi+ebp*4]       ; 4L a

								   paddusw   mm0,mm5               ;       Sum partial SWD for all blocks.

								  movq       PartSWDForURBlk,mm5   ;       Stash SWD for lines 0,3,6, UR blk.

								   punpckldq mm5,mm0               ;       Get low sum into high bits.

								  psubw      mm6,[edi+ebp*4]       ; 4L b

								   paddusw   mm5,mm0               ;       Total up SWD for every third line.

								  movq       mm0,[esi+ebp*2]       ; 2L a

								   psrlq     mm5,47                ;       Position, and double.

								  psubw      mm0,[edi+ebp*2]       ; 2L b

								   pcmpgtd   mm5,mm7               ;       Is 2 * SWD for 6 lines > prev SWD?

								  pmaddwd    mm0,mm0               ; 2L c

								   psllw     mm2,8                 ; 1L C

								  movdf      eax,mm5

								   pmaddwd   mm2,mm2               ; 1L D

								  test       eax,eax

								   jne       MblkEst_EarlyOut


								  lea        eax,[ecx+ebp*2]       ; PITCH*7

								   psllw     mm3,8                 ; 5L C

								  paddusw    mm1,mm2               ; UL +

								   pmaddwd   mm3,mm3               ; 5L D

								  movq       mm5,[esi+eax*1]       ; 7L A

								  psubw      mm5,[edi+eax*1]       ; 7L B

								   pmaddwd   mm6,mm6               ; 4L c

								  movq       mm2,[esi+PITCH*11+8]  ; BR A

								   psllw     mm5,8                 ; 7L C

								  psubw      mm2,[edi+PITCH*11+8]  ; BR B

								   paddusw   mm1,mm3               ; UL +

								  movq       mm3,[esi+PITCH*13+8]  ; DR A

								   paddusw   mm1,mm0               ; UL +

								  psubw      mm3,[edi+PITCH*13+8]  ; DR B

								   pmaddwd   mm5,mm5               ; 7L D

								  movq       mm0,[esi+ebp*8+8]     ; 8R a

								   paddusw   mm1,mm6               ; UL +

								  psubw      mm0,[edi+ebp*8+8]     ; 8R b

								   psllw     mm2,8                 ; BR C

								  movq       mm4,[esi+ecx*2+8]     ; AR a

								   paddusw   mm1,mm5               ; UL +

								  psubw      mm4,[edi+ecx*2+8]     ; AR b

								   punpckldq mm6,mm1               ;      Get low SWD accum to hi order of mm6.

								  movq       mm5,[esi+eax*2+8]     ; ER a

								   paddusw   mm6,mm1               ;      mm6[48:63] is SWD for upper left blk.

								  psubw      mm5,[edi+eax*2+8]     ; ER b

								   psrlq     mm6,48                ;      mm6 is SWD for upper left block.

								  psubusw    mm7,mm6               ;      Diminish prev best SWD by cand UL blk.

								   pmaddwd   mm2,mm2               ; BR D

								  pmaddwd    mm0,mm0               ; 8R c

								   psllw     mm3,8                 ; DR C

								  movq       mm1,[esi+ebp*1+8]     ; 1R A

								   pmaddwd   mm3,mm3               ; DR D

								  paddusw    mm2,PartSWDForLRBlk   ; LR +

								   pmaddwd   mm4,mm4               ; AR c

								  psubw      mm1,[edi+ebp*1+8]     ; 1R B

								   paddusw   mm2,mm0               ; LR +

								  movq       mm0,[esi+ecx*1+8]     ; 5R A

								   pmaddwd   mm5,mm5               ; ER c

								  psubw      mm0,[edi+ecx*1+8]     ; 5R B

								   paddusw   mm2,mm3               ; LR +

								  movq       mm3,[esi+eax*1+8]     ; 7R A

								   paddusw   mm2,mm4               ; LR +

								  paddusw    mm2,mm5               ; LR +

								   psllw     mm1,8                 ; 1R C

								  psubw      mm3,[edi+eax*1+8]     ; 7R B

								   punpckldq mm5,mm2               ;      Get low SWD accum to hi order of mm5.

								  paddusw    mm5,mm2               ;      mm5[48:63] is SWD for lower right blk.

								   pmaddwd   mm1,mm1               ; 1R D

								  movq       mm2,[esi+ebp*2+8]     ; 2R a

								   psrlq     mm5,48                ;      mm5 is SWD for lower right block.

								  psubusw    mm7,mm5               ;      Diminish prev best SWD by cand LR blk.

								   punpckldq mm6,mm5               ;      mm6[0:31] UL SWD;  mm6[32:63] LR SWD.

								  psubw      mm2,[edi+ebp*2+8]     ; 2R b

								   psllw     mm0,8                 ; 5R C

								  movq       mm5,[esi+ebp*4+8]     ; 4R a

								   pmaddwd   mm0,mm0               ; 5R D

								  psubw      mm5,[edi+ebp*4+8]     ; 4R b

								   psllw     mm3,8                 ; 7R C

								  paddusw    mm1,PartSWDForURBlk   ; UR +

								   pmaddwd   mm3,mm3               ; 7R D

								  paddusw    mm1,mm0               ; UR +

								   pmaddwd   mm2,mm2               ; 2R c

								  movq       mm0,[esi+PITCH*11]    ; BL A

								   pmaddwd   mm5,mm5               ; 4R c

								  psubw      mm0,[edi+PITCH*11]    ; BL B

								   paddusw   mm1,mm3               ; UR +

								  movq       mm3,[esi+ecx*2]       ; AL a

								   paddusw   mm1,mm2               ; UR +

								  psubw      mm3,[edi+ecx*2]       ; AL b

								   paddusw   mm1,mm5               ; UR +

								  pmaddwd    mm3,mm3               ; AL c

								   psllw     mm0,8                 ; BL C

								  movq       mm2,[esi+PITCH*13]    ; DL A

								   pmaddwd   mm0,mm0               ; BL D

								  psubw      mm2,[edi+PITCH*13]    ; DL B

								   punpckldq mm5,mm1               ;      Get low SWD accum to hi order of mm5.

								  movq       mm4,[esi+ebp*8]       ; 8L a

								   paddusw   mm5,mm1               ;      mm5[48:63] is SWD for upper right blk.

								  psubw      mm4,[edi+ebp*8]       ; 8L b

								   psllw     mm2,8                 ; DL C

								  movq       mm1,[esi+eax*2]       ; EL a

								   pmaddwd   mm2,mm2               ; DL D

								  psubw      mm1,[edi+eax*2]       ; EL b

								   pmaddwd   mm4,mm4               ; 8L c

								  paddusw    mm3,PartSWDForLLBlk   ; LL +

								   pmaddwd   mm1,mm1               ; EL c

								  paddusw    mm3,mm0               ; LL +

								   psrlq     mm5,48                ;      mm5 is SWD for upper right block.

								  paddusw    mm3,mm2               ; LL +

								   psubusw   mm7,mm5               ;      Diminish prev best SWD by cand UR blk.

								  paddusw    mm3,mm4               ; LL +

								   movq      mm0,mm7

								  paddusw    mm3,mm1               ; LL +

								   psrlq     mm7,32	           ; Get original Best SWD

								  punpckldq  mm1,mm3

								   pxor      mm2,mm2

								  paddusw    mm1,mm3

								  psrlq      mm1,48

								  punpckldq  mm5,mm1           ; mm5[32:63] SWD for LL.  mm5[0:31] SWD for UR.

								   psubusw   mm0,mm1

								  psubusw    mm7,mm0           ; BestSWD dim (BestSWD dim CandSWD) --> new best.

								   pcmpeqd   mm2,mm0           ; [0:31] == 0 iff cand better, else -1.


								;  Registers at this point:

								;  ebp -- PITCH

								;  edi -- Target MacroBlock Base Address.

								;  esi -- Address of upper left block of candidate ref area.

								;  edx -- MBlockActionStream

								;  ebx -- CurrSWDState

								;  mm7 -- New best SWD for macroblock.

								;  mm6 -- [0:31] SWD for upper left;   [32:63] SWD for lower right.

								;  mm5 -- [0:31] SWD for upper right;  [32:63] SWD for lower left.

								;  mm2 -- [0:31] 0 if cand better, else -1.


								  cmp        ebx,LASTINITIALMESTATE  ; Did we just do zero motion vector?

								   jg        MEForNonZeroMVDone


								  movdf      eax,mm7                 ; SWD for this candidate.

								   punpckldq mm7,mm7                 ; Put new best in mm7[0:31] and mm7[32:63].

								  test       ebx,ebx

								   jns       ZeroMVDoneForNonHeuristicME


								; Dispatch on the heuristic-ME state held in bl.
								;  eax -- SWD of the candidate just evaluated (on the fall-through path).
								;  ebx -- CurrSWDState; negative when doing heuristic ME.
								;  edx -- MBlockActionStream
								; The MMX instructions and the mov instructions here leave the integer
								; flags alone, so each conditional jump tests the cmp or inc noted beside it.
								HeuristicME_EarlyOut:


								  movq       mm0,EMVLimitsForThisMB  ; Speculate no extended motion vectors.

								   pcmpeqb   mm1,mm1                 ; <FFFF FFFF FFFF FFFF>

								  xor        ecx,ecx

								   cmp       bl,-3

								  mov        cl,[edx].MBEdgeType     ; 1 left | 2 right | 4 top | 8 bottom

								   jle       HeuristicME_CaseSigMVDone_or_CaseAboveMVDone   ; Tests cmp bl,-3.


								  sub        eax,NONZEROMVDIFFERENTIAL  ; Case0MVDone stores this biased value
								  ;                                     ; as SWDForNon0MVToBeat.

								   inc       bl                         ; ZF set iff bl was -1.

								  mov        ebx,DoExtendedMotionVectors  ; 7 iff doing extended MVs, else 0.
								  ;                                       ; This overwrites the state in ebx.

								   jne       HeuristicME_CaseLeftMVDone ; Tests inc bl;  bl == -1 falls
								   ;                                    ; through to HeuristicME_Case0MVDone.


								HeuristicME_Case0MVDone:


								  movq       SWDULandLR,mm6

								   pcmpeqb   mm4,mm4                 ; <FFFF FFFF FFFF FFFF>

								  movq       SWDURandLL,mm5

								   psllw     mm4,15                  ; <8000 8000 8000 8000>

								  cmp        eax,ZEROVECTORTHRESHOLD-NONZEROMVDIFFERENTIAL

								  ;                                  ; Compare 0-MV against ZeroVectorThreshold.

								   jl        BelowZeroThresh         ; Jump if 0-MV is good enough.


								  mov        SWDForNon0MVToBeat,eax

								   and       ebx,ecx                 ; Elim flag for bottom row. 0 iff no ExtMV.

								  mov        eax,BlockAbove[4]

								   je        NotExtendedMVs          ; Jump if not doing extended MVs?


								                                     ; Below:  A==left;  B==above;  C==above rt.

								  movdt      mm3,ValidRemoteVectors[ebx*4]              ; <mask(A) (C) (B) (A)>

								   movq      mm2,mm4                 ; <8000 8000 8000 8000>


								IF SIZEOF T_MacroBlockActionDescr-128

								**** error:  Due to assembler weakness, can't use spaces here, so SIZEOF

								**** T_MacroBlockActionDescr is replaced by constant.  If assembly error

								**** occurs, the constant has been changed, and the three instructions in

								**** the next 10 lines have to change.

								ENDIF

								IF SIZEOF T_Blk-16

								**** error:  Due to assembler weakness, can't use spaces here, so SIZEOF T_Blk

								**** is replaced by constant.  If assembly error occurs, the constant has been

								**** changed, and the three instructions in the next 10 lines have to change.

								ENDIF

								  movdt      mm0,[edx-128].BestFullPelMBMVs             ; <x    x    Av,h x   >

								   punpcklbw mm3,mm3                                    ; mask for both MV parts

								  movdt      mm1,[edx+eax-2*16+128].BestFullPelMBMVs    ; <x    x    Cv,h x   >

								   psrlw     mm2,8                                      ; <0080 0080 0080 0080>

								  por        mm4,mm2                                    ; <8080 ...> bias value.

								   punpcklwd mm1,mm0                                    ; <Av,h Cv,h x    x   >

								  punpcklwd  mm0,[edx+eax-2*16].BestFullPelMBMVs        ; <Bv,h Av,h x x >

								   ;

								  punpckhdq  mm0,mm1                 ; <Av,h Cv,h Bv,h Av,h>

								   ;

								  pand       mm0,mm3                 ; Set to 0 any off edge.

								   and       ebx,4                   ; If zero, we're on the top edge.

								  paddb      mm0,mm4                 ; <Av,h Cv,h Bv,h Av,h> biased

								   je        @f                      ; If on top edge, cause LEFT to be taken.

								  movq       mm1,mm0                 ; <Av,h Cv,h Bv,h Av,h>

								   psrlq     mm0,16                  ; <x    Av,h Cv,h Bv,h>

								  psubusb    mm0,mm1                 ; <x    floor(A-C) floor(C-B) floor(B-A)>

								   ;

								  paddb      mm0,mm1                 ; <x    max(A,C) max(C,B) max(B,A)>

								   ;

								  movq       mm1,mm0                 ; <x    max(A,C) max(C,B) max(B,A)>

								   psrlq     mm0,16                  ; <x    x        max(A,C) max(C,B)>

								  pxor       mm1,mm0                 ; Part of median calc.

								   psrlq     mm0,16                  ; <x    x        x        max(A,C)>

								  pxor       mm0,mm1                 ; <x x x median(A,B,C)> biased by +128.

								   ;


								@@:


								  punpcklbw  mm0,mm0                 ; 2 copies of median predictor MVs.

								   pcmpeqb   mm1,mm1

								  punpcklwd  mm0,mm0                 ; 4 copies.  Will now calc the following:

								  ;                                  ; [ 0: 7] -- HMV lower limit for sig search

								  ;                                  ; [ 8:15] -- HMV lower limit

								  ;                                  ; [16:23] -- HMV upper limit for sig search

								  ;                                  ; [24:31] -- HMV upper limit

								  ;                                  ; [32:39] -- VMV lower limit for sig search

								  ;                                  ; [40:47] -- VMV lower limit

								  ;                                  ; [48:55] -- VMV upper limit for sig search

								  ;                                  ; [56:63] -- VMV upper limit

								   ;

								  psubusb    mm0,EMV_ClampLowerEnd[ecx*8-40]

								   psllw     mm1,3                   ; <FF F8 FF F8 FF F8 FF F8> i.e.  Mask to

								   ;                                 ; set sig srch range to mult of 8.

								  paddusb    mm0,EMV_ClampUpperEnd[ecx*8-40]


								  psubb      mm0,EMV_RestoreRange[ecx*8-40]


								NotExtendedMVs:


								  movq       SWD0MVURandLL,mm5

								   pand      mm0,mm1                 ; Set sig search at multiples of four.

								  movq       SWD0MVULandLR,mm6

								   pcmpeqb   mm2,mm2                 ; Set cand as worse than 0MV, in case skip.

								  movq       EMVLimitsForThisMB,mm0

								  and        cl,1

								   je        HeuristicME_SkipLeftMV


								  mov        BestOfFourStartingPoints,esi

								   mov       ebx,-2                  ; Indicate trying MV of MB to left.

								  movsx      ecx,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBVMV

								  movsx      eax,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBHMV


								; ClampHeurMECandidateToRange -- common tail for the heuristic ME candidates
								; (MV of MB to left, MV of MB above, MV of best signature).
								;   In:  ecx -- candidate VMV, sign-extended (same units as EMVLimitsForThisMB).
								;        eax -- candidate HMV, sign-extended.
								;        ebx -- tag of the candidate being tried; set by the jump sites and
								;               not touched here.
								;   Out: esi -- address of the clamped candidate reference; control goes to
								;               ComputeMBSWD, or to MblkEst_EarlyOut (esi == Addr0MVRef,
								;               eax == -1) when the clamped MV is the zero MV.
								ClampHeurMECandidateToRange:


								  movsx      esi,PB EMVLimitsForThisMB+5  ; VMV lower limit.

								  cmp        ecx,esi

								   jl        ClampVMV_1                   ; Below range: esi holds lower limit.


								  movsx      esi,PB EMVLimitsForThisMB+7  ; VMV upper limit.

								  cmp        ecx,esi

								   jle       @f                           ; In range: keep candidate VMV.


								; Reached with esi == lower limit (from jl) or esi == upper limit (fallthrough).
								ClampVMV_1:


								  mov        ecx,esi


								@@:


								  movsx      esi,PB EMVLimitsForThisMB+1  ; HMV lower limit.

								  cmp        eax,esi

								   jl        ClampHMV_1                   ; Below range: esi holds lower limit.


								  movsx      esi,PB EMVLimitsForThisMB+3  ; HMV upper limit.

								  cmp        eax,esi

								   jle       @f                           ; In range: keep candidate HMV.


								; Reached with esi == lower limit (from jl) or esi == upper limit (fallthrough).
								ClampHMV_1:


								  mov        eax,esi


								@@:


								; Linearize:  eax = HMV/2 + VMV*3*64 = HMV/2 + VMV*PITCH/2  (PITCH == 384).
								  sar        eax,1

								   lea       ecx,[ecx+ecx*2]

								IF PITCH-384

								*** error:  The magic here assumes a pitch of 384.

								ENDIF

								  shl        ecx,6

								   mov       esi,Addr0MVRef

								  add        eax,ecx                      ; Clamped Linearized Motion Vector

								   ;

								  sub        eax,1                        ; Borrow (CF) only if Lin MV was 0.

								   jc        MblkEst_EarlyOut             ; Jump if Lin MV is zero.


								  lea        esi,[esi+eax+1]              ; Candidate reference address.

								   jmp       ComputeMBSWD


								HeuristicME_SkipLeftMV:


								  mov        BestOfFourStartingPoints,esi

								   mov       cl,[edx].MBEdgeType   ; 1 left | 2 right | 4 top | 8 bottom


								HeuristicME_CaseLeftMVDone:


								  movdf      eax,mm2               ; eax == 0 iff cand better, else -1.

								  mov        ebx,BlockAbove[4]

								   and       cl,4

								  movq       SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).

								   punpckldq mm7,mm7               ; Put new best in mm7[0:31] and mm7[32:63].

								  movq       SWDURandLL[eax*8],mm5

								   pcmpeqb   mm2,mm2               ; Set cand as worse than prev, in case skip.

								  mov        BestOfFourStartingPoints[eax*4],esi

								   je        HeuristicME_SkipAboveMV


								  movsx      ecx,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBVMV

								  movsx      eax,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBHMV

								  mov        ebx,-3                  ; Indicate trying MV of MB above.

								   jmp       ClampHeurMECandidateToRange


								HeuristicME_CaseSigMVDone_or_CaseAboveMVDone:

								HeuristicME_SkipAboveMV:


								  movdf      eax,mm2               ; eax == 0 iff cand better, else -1.

								  jne        HeuristicME_CaseSigMVDone


								HeuristicME_CaseAboveMVDone:


								  mov        cl,4

								   lea       ebx,C0001000100010001

								  movq       SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).

								   pxor      mm0,mm0

								  movq       SWDURandLL[eax*8],mm5

								   pxor      mm1,mm1

								  mov        BestOfFourStartingPoints[eax*4],esi

								   lea       esi,TargetSigContribForRowPairs

								  movdf      BestMBFullPelSWD,mm7  ; Stash SWD for best full pel MB MV.

								   pcmpeqb   mm7,mm7               ; W:<0xFFFF  0xFFFF  0xFFFF  0xFFFF>


								; ebp -- Pitch

								; edi -- Address of target macroblock.

								; esi -- Address at which to store target macroblock's signature contributions.

								; cl  -- Loop counter.

								; mm0 -- Accumulator for target MB's sig contrib for first four even columns.

								; mm1 -- Accumulator for target MB's sig contrib for last four even columns.


								  movq       mm2,[edi]             ; B:<P07 P06 P05 P04 P03 P02 P01 P00>

								   pcmpeqb   mm5,mm5               ; W:<0xFFFF  0xFFFF  0xFFFF  0xFFFF>

								  paddb      mm2,[edi+ebp*1]       ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>

								   psrlw     mm5,8                 ; W:<0x00FF  0x00FF  0x00FF  0x00FF>


								@@:


								  movq       mm3,[edi+ebp*2]       ; B:<P27 P26 P25 P24 P23 P22 P21 P20>

								   movq      mm4,mm2               ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>

								  paddb      mm3,[edi+PITCH*3]     ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>

								   psrlw     mm2,8                 ; W:<P07+P17 P05+P15 P03+P13 P01+P11>

								  pmaddwd    mm2,[ebx]             ; D:<P07+P17+P05+P15 P03+P13+P01+P11>

								   movq      mm7,mm5               ; W:<0x00FF  0x00FF  0x00FF  0x00FF>

								  pand       mm5,mm3               ; W:<P26+P36 P24+P34 P22+P32 P20+P30>

								   psrlw     mm3,8                 ; W:<P27+P37 P25+P35 P23+P33 P21+P31>

								  pmaddwd    mm3,[ebx]             ; D:<P27+P37+P25+P35 P23+P33+P21+P31>

								   paddw     mm0,mm5               ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>

								  movq       mm5,[edi+ebp*2+8]     ; B:<P2F P2E P2D P2C P2B P2A P29 P28>

								   pand      mm4,mm7               ; W:<P06+P16 P04+P14 P02+P12 P00+P10>

								  paddb      mm5,[edi+PITCH*3+8]   ; B:<P2F+P3F P2E+P3E P2D+P3D P2C+P3C ...>

								   paddw     mm0,mm4               ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>

								  movq       mm4,[edi+8]           ; B:<P0F P0E P0D P0C P0B P0A P09 P08>

								   movq      mm6,mm7               ; W:<0x00FF  0x00FF  0x00FF  0x00FF>

								  paddb      mm4,[edi+ebp*1+8]     ; B:<P0F+P1F P0E+P1E P0D+P1D P0C+P1C ...>

								   pand      mm7,mm5               ; W:<P2E+P3E P2C+P3C P2A+P3A P28+P38>

								  pand       mm6,mm4               ; W:<P0E+P1E P0C+P1C P0A+P1A P08+P18>

								   psrlw     mm5,8                 ; W:<P2F+P3F P2D+P3D P2B+P3B P29+P39>

								  pmaddwd    mm5,[ebx]             ; D:<P2F+P3F+P2D+P3D P2B+P3B+P29+P39>

								   psrlw     mm4,8                 ; W:<P0F+P1F P0D+P1D P0B+P1B P09+P19>

								  pmaddwd    mm4,[ebx]             ; D:<P0F+P1F+P0D+P1D P0B+P1B+P09+P19>

								   paddw     mm1,mm7               ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>

								  paddw      mm1,mm6               ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>

								   lea       edi,[edi+ebp*4]       ; Advance input cursor

								  paddw      mm3,mm5               ; D:<P2F+P3F+P2D+P3D+P27+P37+P25+P35

								   ;                               ;    P2B+P3B+P29+P39+P23+P33+P21+P31>

								   pcmpeqb   mm5,mm5               ; Next W:<0xFFFF  0xFFFF  0xFFFF  0xFFFF>

								  paddw      mm4,mm2               ; D:<P0F+P1F+P0D+P1D+P07+P17+P05+P15

								   ;                               ;    P0B+P1B+P09+P19+P03+P13+P01+P11>

								   punpckldq mm7,mm3               ; D:<P0B+P1B+P09+P19+P03+P13+P01+P11 junk>

								  paddw      mm7,mm3               ; [32:47]:<sum of odd pels of lines 0 and 1>

								   punpckldq mm6,mm4               ; W:<P2B+P3B+P29+P39+P23+P33+P21+P31 junk>

								  movq       mm2,[edi]             ; Next B:<P07 P06 P05 P04 P03 P02 P01 P00>

								   paddw     mm6,mm4               ; [32:47]:<sum of odd pels of lines 2 and 3>

								  paddb      mm2,[edi+ebp*1]       ; Next B:<P07+P17 P06+P16 P05+P15 ...>

								   punpckhwd mm6,mm7               ; [0:31] W:<Line_0&1_odd  Line_2&3_odd>

								  mov        MBlockActionStream,edx

								   dec       cl

								  movdf      [esi],mm6             ; Save W:<Line_0&1_odd  Line_2&3_odd>

								   psrlw     mm5,8                 ; Next W:<0x00FF  0x00FF  0x00FF  0x00FF>

								  lea        esi,[esi+4]           ; Advance output cursor

								   jne       @b


								; ebp -- Pitch

								; edi -- Address of candidate reference MB's signature contribs.

								; esi -- Address at which target MB's signature contribs were stored, plus 16.

								; edx -- Scratch.

								; ecx -- Count down number of lines of signatures to try.

								; ebx -- Increment to get from end of one line of signatures to start of next.

								; al  -- Count down number of signatures to try in a line.

								; ah  -- Reinits counter of signatures to try in a line.

								; mm0 -- Target MB's sig contrib for first four even columns.

								; mm1 -- Target MB's sig contrib for last four even columns.

								; mm2 -- Target MB's sig contrib for first four pairs of rows, odd columns.

								; mm3 -- Amount and address of best signature seen so far.


								IF PITCH-384

								*** error:  The magic here assumes a pitch of 384.

								ENDIF

								  xor        eax,eax

								   mov       ecx,TargetToSig_Debiased

								  mov        al,EMVLimitsForThisMB+4 ; Lower vert lim for sig srch (half pels)

								   xor       ebx,ebx

								  add        edi,ecx

								   mov       bl,EMVLimitsForThisMB+0 ; Lower horz lim for sig srch (half pels)

								  shr        ebx,1

								   lea       ecx,[eax+eax*2]

								  shl        ecx,6

								   add       edi,ebx

								  add        edi,ecx

								   xor       ecx,ecx

								  add        ebx,ebx

								   mov       cl,EMVLimitsForThisMB+6 ; Upper vert lim for sig srch (half pels)

								  sub        ecx,eax

								   mov       al,EMVLimitsForThisMB+2 ; Upper horz lim for sig srch (half pels)

								  shr        ecx,3                   ; Number of lines of sigs to do, minus 1.

								   sub       eax,ebx

								  shr        eax,3                   ; Number of columns of sigs to do.

								   lea       ebx,[ebp-1+080000000H]

								  sub        ebx,eax                 ; 1/4th amt to add to move to next line.

								   mov       ah,al

								  inc        ah                      ; To reinit cntr for line.

								  movq       mm2,[esi-16]

								   pcmpeqd   mm3,mm3                 ; Set winning signature artificially high.

								  movdt      mm4,[edi]

								   psrld     mm3,2

								  punpckldq  mm4,[edi+4]         ; ref sig contribs of left even cols.


								; TryNextSignature -- one iteration scores one candidate signature position.
								; The score is the sum of squared differences between the target's and the
								; reference's column sums (mm0, mm1 vs [edi], [edi+8]) and row-pair sums
								; (mm2, [esi-8] vs the four loads at edi+PITCH*{2,6,10,14}).  The smallest
								; score seen and its address are kept in mm3 as <amount address>.
								; mm4 must hold the left even-column sums for [edi] on entry; the bottom of
								; the loop reloads it for the next candidate.
								TryNextSignature:


								  movdt      mm5,[edi+8]

								   psubw     mm4,mm0             ; diffs for sums of left even columns.

								  punpckldq  mm5,[edi+12]        ; ref sig contribs of right even cols.

								   pmaddwd   mm4,mm4             ; Squared differences.

								  movdt      mm6,[edi+ebp*2]     ; Sums for first two pairs of rows.

								   psubw     mm5,mm1             ; diffs for sums of right even columns.

								  punpckldq  mm6,[edi+PITCH*6]   ; Sums for second two pairs of rows.

								   pmaddwd   mm5,mm5             ; Squared differences.

								  movdt      mm7,[edi+PITCH*10]  ; Sums for third two pairs of rows.

								   psubw     mm6,mm2             ; Words: diffs for sums of first 4 pairs rows.

								  punpckldq  mm7,[edi+PITCH*14]  ; Sums for last two pairs of rows.

								   pmaddwd   mm6,mm6             ; Squared differences.

								  psubw      mm7,[esi-8]         ; Words: diffs for sums of last 4 pairs rows.

								   paddd     mm4,mm5             ; Accumulate squared differences.

								  sub        al,1                ; Decrement count of sigs left in this line.

								   pmaddwd   mm7,mm7             ; Squared differences.

								  sbb        edx,edx             ; -1 if done with line, else 0.

								   paddd     mm6,mm4             ; Accumulate squared differences.

								  and        edx,ebx             ; 1/4 Amt to add to goto next line, else 0.

								   paddd     mm7,mm6             ; Accumulate squared differences.

								  movdt      mm5,edi             ; Address of this signature

								   punpckldq mm6,mm7             ; <low_order_accumulator junk>

								  paddd      mm7,mm6             ; <full_signature_amt junk>

								   psllq     mm5,32              ; <Addr_of_this_signature     0>

								  lea        edi,[edi+edx*4+4]   ; advance signature position to next cand.

								   punpckhdq mm5,mm7             ; <cand_signature_amt cand_signature_addr>

								; ebx was built with bit 31 set, so edx (== ebx or 0) still carries the
								; end-of-line flag in its sign bit; the *4 in the lea above discards it.
								  sar        edx,31              ; -1 if done with line, else 0.

								   pcmpgtd   mm7,mm3             ; <0xFFFFFFFF if cand not better    junk>

								  movdt      mm4,[edi]           ; Preload left even col sums of next cand.

								   punpckhdq mm7,mm7             ; <0xFFFFFFFFFFFFFFFF if cand not better>

								  punpckldq  mm4,[edi+4]

								   pand      mm3,mm7             ; 1st_best if cand not better, else 0.

								  and        dl,ah               ; Num cols in a line if done with line, else 0.

								   pandn     mm7,mm5             ; cand if better than 1st_best, else 0.

								  add        al,dl               ; Reinit col count if finishing with line.

								   por       mm3,mm7             ; Better of cand and 1st_best.

								  sbb        ecx,0               ; Decrement line count if just finished line.

								   jge       TryNextSignature


								  movdf      ecx,mm3                ; Fetch address of best signature.

								   pcmpeqb   mm2,mm2                ; Set cand as worse than prev, in case skip.

								  mov        edi,TargetMacroBlockBaseAddr

								   mov       ebx,-4                 ; Indicate trying MV of best signature.

								  sub        ecx,edi

								   mov       eax,SigToTarget

								  movdt      mm7,BestMBFullPelSWD   ; Reload SWD for best full pel MB MV.

								  lea        esi,[ecx+eax]          ; Linearized motion vector

								   add       eax,ecx                ; Linearized motion vector

								  sar        esi,8                  ; Full pel vert lin offset div 256.

								   mov       edx,MBlockActionStream ; Reload pointer to MBA descriptor.

								  shl        eax,25

								   punpckldq mm7,mm7

								  movsx      ecx,UnlinearizedVertMV[esi]  ; Get full pel vert MV component.

								  sar        eax,24                 ; Full pel HMV.

								   jmp       ClampHeurMECandidateToRange


								; HeuristicME_CaseSigMVDone -- last heuristic candidate has been scored.
								; Records it if it won, then takes the winner in BestOfFourStartingPoints as
								; the starting point for the next search stage.  Each MV component is tested
								; against [lower limit + 4, upper limit - 4]:
								;   * both inside:  continue at esi+PITCH+1 with state
								;                   FIRST_HEURISTIC_EXHAUSTIVE, keeping the best SWD in mm7.
								;   * otherwise:    the offending component is replaced by the bound it was
								;                   tested against, esi is rebuilt from Addr0MVRef, the state
								;                   is FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR, and mm7 is reset
								;                   to the "huge" value built in mm0.
								; All paths leave via ComputeMBSWD.
								HeuristicME_CaseSigMVDone:

								HeuristicME_SkipSigMV:


								  movdf      eax,mm2                        ; eax == 0 iff cand better, else -1.

								   pcmpeqd   mm0,mm0                        ; Init previous best SWD to huge.

								  mov        ecx,Addr0MVRef                 ; Start to calc linearized MV.

								   mov       bh,EMVLimitsForThisMB+1        ; HMV lower limit.

								  mov        BestOfFourStartingPoints[eax*4],esi

								   add       bh,4                           ; bh = HMV lower limit + 4.

								  movq       SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).

								   psrlq     mm0,2

								  movq       SWDURandLL[eax*8],mm5

								   psrld     mm0,14

								  mov        eax,BestOfFourStartingPoints

								   mov       bl,EMVLimitsForThisMB+5        ; VMV lower limit.

								  mov        esi,eax

								   sub       eax,ecx                        ; Linearized motion vector

								  mov        ecx,eax                        ; Linearized motion vector

								   add       al,al                          ; Full pel HMV.

								  cmp        al,bh

								   jl        ClampHMV_2                     ; Signed: HMV < lower limit + 4.


								  mov        bh,EMVLimitsForThisMB+3        ; HMV upper limit

								  sub        bh,4                           ; bh = HMV upper limit - 4.

								  cmp        al,bh

								   jle       NoClampHMV_2


								; HMV is out of the inner range; bh holds the bound to use instead.
								ClampHMV_2:


								  sar        ecx,8                          ; Full pel vert lin offset div 256.

								   add       bl,4                           ; bl = VMV lower limit + 4.

								; NOTE(review): movzx zero-extends bh, although it was just compared as a
								; signed byte and ClampHeurMECandidateToRange sign-extends the same
								; EMVLimitsForThisMB bytes with movsx.  If bh can be negative here, the
								; later "sar eax,1" yields bh/2+128 rather than bh/2 -- confirm intended.
								  movzx      eax,bh

								  movsx      ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.

								  cmp        cl,bl

								   jl        @f


								  mov        bl,EMVLimitsForThisMB+7        ; VMV upper limit.

								   movq      mm7,mm0                        ; New center: forget previous best SWD.

								  sub        bl,4                           ; bl = VMV upper limit - 4.

								  cmp        cl,bl

								   jle       NoClampVMV_2


								; VMV out of the inner range; bl holds the bound to use instead.
								@@:


								  movsx      ecx,bl

								  movq       mm7,mm0                        ; Needed for the jl path above, which
								  ;                                         ; skips the earlier reset of mm7.


								NoClampVMV_2:


								; Relinearize as in ClampHeurMECandidateToRange: HMV/2 + VMV*3*64.
								  sar        eax,1

								   lea       ecx,[ecx+ecx*2]

								  shl        ecx,6

								   mov       ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR  ; New state number.

								  mov        esi,Addr0MVRef

								   add       eax,ecx               ; Linearized motion vector.

								  add        esi,eax

								   jmp       ComputeMBSWD


								; HMV is inside the inner range; al still holds it.  Now test the VMV.
								NoClampHMV_2:


								  sar        ecx,8                          ; Full pel vert lin offset div 256.

								   add       bl,4                           ; bl = VMV lower limit + 4.

								  mov        ah,bl

								  movsx      ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.

								  cmp        cl,ah

								   jl        @f


								  mov        ah,EMVLimitsForThisMB+7        ; VMV upper limit.

								   lea       esi,[esi+ebp+1]                ; Speculative: used only if no clamp;
								   ;                                        ; the clamp path below reloads esi.

								  sub        ah,4                           ; ah = VMV upper limit - 4.

								   mov       ebx,FIRST_HEURISTIC_EXHAUSTIVE ; New state number.

								  cmp        cl,ah

								   jle       ComputeMBSWD


								; VMV out of the inner range; ah holds the bound to use instead.
								@@:


								  movsx      ecx,ah

								; NOTE(review): same zero-extension question as at ClampHMV_2 -- al was
								; compared as a signed byte above; confirm movzx is intended.
								  movzx      eax,al

								  sar        eax,1

								   lea       ecx,[ecx+ecx*2]

								  shl        ecx,6

								   mov       ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR  ; New state number.

								  mov        esi,Addr0MVRef

								   add       eax,ecx               ; Linearized motion vector.

								  add        esi,eax

								   movq      mm7,mm0               ; New center: forget previous best SWD.

								  jmp        ComputeMBSWD


								ZeroMVDoneForNonHeuristicME:


								  movq       SWDULandLR,mm6

								  movq       SWDURandLL,mm5

								  cmp        eax,ZEROVECTORTHRESHOLD ; Compare 0-MV against ZeroVectorThreshold.

								   jl        BelowZeroThresh         ; Jump if 0-MV is good enough.


								  xor        ecx,ecx

								   sub       eax,NONZEROMVDIFFERENTIAL

								  mov        cl,StateEngineFirstRule[ebx]     ; MV adjustment.

								   mov       bl,StateEngineFirstRule[ebx+10]  ; New state number.

								  shl        ecx,11

								   mov       SWDForNon0MVToBeat,eax

								  movq       SWD0MVULandLR,mm6

								  movq       SWD0MVURandLL,mm5

								  lea        esi,[esi+ecx-PITCH*8]

								   jmp       ComputeMBSWD


								MEForNonZeroMVDone:


								  movdf      eax,mm2           ; eax == 0 iff cand better, else -1.


								MblkEst_EarlyOut:


								  xor        ecx,ecx

								   test      ebx,ebx

								  movq       SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).

								   pcmpeqb   mm2,mm2                 ; Set cand as worse than 0MV.

								  mov        cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.

								   js        HeuristicME_EarlyOut


								  add        esi,ecx               ; Adjust ref addr for horz motion.

								   mov       bl,StateEngine[eax+ebx*4+3] ; 0:239 -> New state number;

								   ;                                     ; 240:255 -> flags which 1/2 pel to do.

								  shr        ecx,4

								   punpckldq mm7,mm7               ; Put new best in mm7[0:31] and mm7[32:63].

								  movq       SWDURandLL[eax*8],mm5

								   pxor      mm6,mm6               ; Speculatively zero to prep for half pel ME.

								  add        esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.

								   cmp       bl,240                ; Terminal state?

								  jb         ComputeMBSWD


								  mov        eax,esi

								   mov       ecx,Addr0MVRef               ; Start to calc linearized MV.

								  sub        eax,ecx                      ; Linearized Motion Vector

								   ;

								  mov        ecx,eax

								   ;

								  sar        eax,8                        ; Full pel vert lin offset div 256.

								   and       cl,07FH                      ; Full pel HMV

								  add        cl,cl

								   ;

								  mov        ch,UnlinearizedVertMV[eax]   ; Get full pel vert MV component.

								IFDEF H261

								ELSE

								   mov       eax,DoHalfPelME ; 0 if not, -4 if so.

								  test       eax,eax

								   je        SkipHalfPelMBME


								  cmp        cl,EMVLimitsForThisMB+1      ; Skip half pel ME if at edge of range

								   jle       SkipHalfPelMBME


								  cmp        cl,EMVLimitsForThisMB+3

								   jge       SkipHalfPelMBME


								  cmp        ch,EMVLimitsForThisMB+5

								   jle       SkipHalfPelMBME


								  cmp        ch,EMVLimitsForThisMB+7

								   jge       SkipHalfPelMBME


								; Half pel macroblock ME.  Calls HalfPelMotionEstimation, adds the
								; InvalidateBadHalfPelMVs adjustment to the SWDs it leaves in mm3, saves
								; the full pel MV in the MB action descriptor, then starts the decision
								; tree by comparing the left and right half pel SWDs.
								;
								; Registers:

								;  ebp -- PITCH

								;  esi -- Address of best full pel reference macroblock

								;  edx -- MBlockActionStream

								;  ecx -- Nothing presently.

								;  edi -- Address of target macroblock.

								;  ebx -- 240 + Flags to indicate which half pel ME to do:

								;         1 --> right;   2 --> left;   4 --> down;   8 --> up

								;  eax -- Count from -4 to -1 for blocks of macroblock.

								;  mm0:mm7 -- Scratch


								  movdf      BestMBFullPelSWD,mm7   ; Stash SWD for best full pel MB MV.

								   pxor      mm7,mm7                ; Prep accumulator for half pel ME.


								  call       HalfPelMotionEstimation


								  movdt      mm7,InvalidateBadHalfPelMVs[eax*4] ; Need to inflate SWDs for

								  ;                                             ; MVs that go off frame edge.

								  mov        eax,esi

								   mov       ebx,Addr0MVRef               ; Start to calc linearized MV.

								  sub        eax,ebx                      ; Linearized Motion Vector

								   punpcklbw mm7,mm7                      ; Expand adjustment to words.

								  mov        ecx,eax                      ; Linearized Motion Vector

								   paddusw   mm7,mm3                      ; Now have SWDs for half pel MBME.

								  sar        eax,8                        ; Full pel vert lin offset div 256.

								   and       cl,07FH                      ; Full pel HMV

								  add        cl,cl

								   movq      mm6,mm7                      ; Copy; high dword is used for down/up.

								  mov        [edx].BestFullPelMBHMV,cl    ; Save HMV

								   mov       ch,UnlinearizedVertMV[eax]   ; Get full pel vert MV component.

								  movdf      eax,mm7                      ; eax[ 0:15] -- SWD for rightward ref.

								  ;                                       ; eax[16:31] -- SWD for leftward ref.

								   psrlq     mm6,32                       ; mm6[0:31] -- down/up SWDs.

								  mov        [edx].BestFullPelMBVMV,ch    ; Save VMV

								   mov       ebx,eax

								  shr        eax,16                       ; eax -- SWD for leftward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for rightward ref.

								  cmp        eax,ebx

								   jg        MBME_RightBetterThanLeft     ; Tie goes to left.


								MBME_LeftBetterThanRight:


								  cmp        eax,BestMBFullPelSWD

								   jge       MBME_CtrIsBestHMV


								MBME_LeftBestHMV:


								  movdf      ebx,mm6                      ; ebx[ 0:15] -- SWD for downward ref.

								  ;                                       ; ebx[16:31] -- SWD for upward ref.

								  mov        BestHalfPelHorzSWD,eax

								   mov       eax,ebx

								  shr        eax,16                       ; eax -- SWD for upward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for downward ref.

								  cmp        eax,ebx

								   jg        MBME_LeftBestHMV_DownBetterThanUp


								MBME_LeftBestHMV_UpBetterThanDown:


								  cmp        eax,BestMBFullPelSWD

								   jge       MBME_LeftIsBest


								MBME_LeftBestHMV_UpBestVMV:


								  sub        esi,PITCH+1                  ; Try ref 1/2 pel left and up

								   mov       BestHalfPelVertSWD,eax

								  mov        al,4


								  call       HalfPelMotionEstimationBothWays


								  mov        eax,BestHalfPelVertSWD

								   lea       esi,[esi+ebp*1+1]            ; Back to center.

								  cmp        eax,ebx

								   jle       MBME_UpBetterThanUpLeft


								MBME_UpLeftBetterThanUp:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       MBME_LeftIsBest


								MBME_UpLeftIsBest:


								  dec        cl                           ; Back up the horz MV one to the left.

								   lea       eax,[esi-PITCH-1]            ; Best is ref 1/2 pel left and up

								  dec        ch                           ; Back up the vert MV one up.

								   jmp       MBME_HalfPelSearchDone


								MBME_UpBetterThanUpLeft:


								  cmp        eax,BestHalfPelHorzSWD

								   jg        MBME_LeftIsBest


								MBME_UpIsBest:


								  mov        ebx,eax

								   dec       ch                           ; Back up the vert MV one up.

								  lea        eax,[esi-PITCH]              ; Best is ref 1/2 pel up

								   jmp       MBME_HalfPelSearchDone


								MBME_LeftBestHMV_DownBetterThanUp:


								  cmp        ebx,BestMBFullPelSWD

								   jge       MBME_LeftIsBest


								MBME_LeftBestHMV_DownBestVMV:


								  dec        esi                          ; Try ref 1/2 pel left and down

								   mov       BestHalfPelVertSWD,ebx

								  mov        al,4


								  call       HalfPelMotionEstimationBothWays


								  mov        eax,BestHalfPelVertSWD

								   inc       esi                          ; Back to center.

								  cmp        eax,ebx

								   jle       MBME_DownBetterThanDownLeft


								MBME_DownLeftBetterThanDown:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       MBME_LeftIsBest


								MBME_DownLeftIsBest:


								  dec        cl                           ; Back up the horz MV one to the left.

								   lea       eax,[esi-1]                  ; Best is ref 1/2 pel left and down

								  inc        ch                           ; Advance the vert MV one down.

								   jmp       MBME_HalfPelSearchDone


								MBME_DownBetterThanDownLeft:


								  cmp        eax,BestHalfPelHorzSWD

								   jle       MBME_DownIsBest


								MBME_LeftIsBest:


								  dec        cl                           ; Back up the horz MV one to the left.

								   lea       eax,[esi-1]                  ; Best is ref 1/2 pel left.

								  mov        ebx,BestHalfPelHorzSWD

								   jmp       MBME_HalfPelSearchDone


								MBME_RightBetterThanLeft:


								  cmp        ebx,BestMBFullPelSWD

								   jge       MBME_CtrIsBestHMV


								MBME_RightBestHMV:


								  movdf      eax,mm6                      ; eax[ 0:15] -- SWD for downward ref.

								  ;                                       ; eax[16:31] -- SWD for upward ref.

								  mov        BestHalfPelHorzSWD,ebx

								   mov       ebx,eax

								  shr        eax,16                       ; eax -- SWD for upward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for downward ref.

								  cmp        eax,ebx

								   jg        MBME_RightBestHMV_DownBetterThanUp


								MBME_RightBestHMV_UpBetterThanDown:


								  cmp        eax,BestMBFullPelSWD

								   jge       MBME_RightIsBest


								MBME_RightBestHMV_UpBestVMV:


								  sub        esi,ebp                      ; Try ref 1/2 pel right and up

								   mov       BestHalfPelVertSWD,eax

								  mov        al,4


								  call       HalfPelMotionEstimationBothWays


								  mov        eax,BestHalfPelVertSWD

								   lea       esi,[esi+ebp*1]              ; Back to center.

								  cmp        eax,ebx

								   jle       MBME_UpBetterThanUpRight


								MBME_UpRightBetterThanUp:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       MBME_RightIsBest


								MBME_UpRightIsBest:


								  inc        cl                           ; Advance the horz MV one to right.

								   lea       eax,[esi-PITCH]              ; Best is ref 1/2 pel right and up

								  dec        ch                           ; Back up the vert MV one up.

								   jmp       MBME_HalfPelSearchDone


								MBME_UpBetterThanUpRight:


								  cmp        eax,BestHalfPelHorzSWD

								   jle       MBME_UpIsBest


								MBME_RightIsBest:


								  mov        ebx,BestHalfPelHorzSWD

								   inc       cl                           ; Advance the horz MV one to right.

								  mov        eax,esi

								   jmp       MBME_HalfPelSearchDone


								MBME_RightBestHMV_DownBetterThanUp:


								  cmp        ebx,BestMBFullPelSWD

								   jge       MBME_RightIsBest


								MBME_RightBestHMV_DownBestVMV:


								  mov        BestHalfPelVertSWD,ebx

								   mov       al,4


								  call       HalfPelMotionEstimationBothWays


								  mov        eax,BestHalfPelVertSWD

								  cmp        eax,ebx

								   jle       MBME_DownBetterThanDownRight


								MBME_DownRightBetterThanDown:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       MBME_RightIsBest


								MBME_DownRightIsBest:


								  inc        cl                           ; Advance the horz MV one to right.

								   mov       eax,esi

								  inc        ch                           ; Advance vert MV one down.

								   jmp       MBME_HalfPelSearchDone


								MBME_DownBetterThanDownRight:


								  cmp        eax,BestHalfPelHorzSWD

								   jg        MBME_RightIsBest


								MBME_DownIsBest:


								  mov        ebx,eax

								   inc       ch                           ; Advance vert MV one down.

								  mov        eax,esi

								   jmp       MBME_HalfPelSearchDone


								MBME_CtrIsBestHMV:


								  movdf      eax,mm6                      ; eax[ 0:15] -- SWD for downward ref.

								  ;                                       ; eax[16:31] -- SWD for upward ref.

								  mov        ebx,eax

								  shr        eax,16                       ; eax -- SWD for upward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for downward ref.

								  cmp        eax,ebx

								   jge       MBME_CtrBestHMV_DownBetterThanUp


								MBME_CtrBestHMV_UpBetterThanDown:


								  mov        ebx,BestMBFullPelSWD

								  cmp        eax,ebx

								   jge       MBME_CenterIsBest


								; Up is best.


								  mov        ebx,eax

								   dec       ch                           ; Back up the vert MV one up.

								  lea        eax,[esi-PITCH]              ; Best is ref 1/2 pel up

								   jmp       MBME_HalfPelSearchDone


								MBME_CtrBestHMV_DownBetterThanUp:


								; Center is the best horizontal position and the downward half pel candidate

								; is at least as good as the upward one.  ebx arrives holding the downward

								; SWD; it wins only if it is strictly below the best full pel SWD.


								  mov        eax,ebx                      ; eax -- SWD for downward ref.

								   mov       ebx,BestMBFullPelSWD         ; ebx -- SWD to report if center wins.

								  cmp        eax,ebx

								   jge       MBME_CenterIsBest            ; Ties keep the full pel center.


								; Down is best.


								  mov        ebx,eax                      ; ebx -- SWD of the winner (down).

								   inc       ch                           ; Advance the vert MV one down.

								  mov        eax,esi                      ; Ref addr stays at esi (Up uses esi-PITCH).

								   jmp       MBME_HalfPelSearchDone


								ENDIF


								SkipHalfPelMBME:


								  mov        [edx].BestFullPelMBHMV,cl    ; Save HMV

								  movdf      ebx,mm7                      ; SWD for best full pel MB MV.

								  mov        [edx].BestFullPelMBVMV,ch    ; Save VMV


								MBME_CenterIsBest:


								  mov        eax,esi


								MBME_HalfPelSearchDone:


								  mov        BestMBHalfPelSWD,ebx

								   mov       BestMBHalfPelMV,cl           ; Save HMV

								  mov        BestMBHalfPelRefAddr,eax

								   mov       BestMBHalfPelMV+1,ch         ; Save VMV


								IFDEF H261

								ELSE ; H263

								; Decide whether block level (4 MV) motion estimation is attempted for this

								; macroblock.  Any failed test below goes to NoBlockMotionVectors.  The tests:

								;   - DoBlockLevelVectors must equal 1;

								;   - bit 7 of CodedBlocks must be clear;

								;   - the full pel MB HMV must lie in [limit+2, limit-2] using the bytes at

								;     EMVLimitsForThisMB+1 (lower) and +3 (upper);

								;   - likewise for the VMV using the bytes at +5 (lower) and +7 (upper);

								;   - BestMBHalfPelSWD must exceed BLOCKMOTIONTHRESHOLD.

								; Side product: ebx is left holding a multiple of 128 derived from the VMV,

								; and ecx holds BestMBHalfPelSWD, both consumed by the code that follows.


								  mov        bl,EMVLimitsForThisMB+1     ; Lower limit comparison.

								   mov       al,DoBlockLevelVectors      ; Are we doing block level MVs?

								  dec        al                          ; ZF set only if the flag was 1.

								   jne       NoBlockMotionVectors


								  mov        cl,[edx].CodedBlocks        ; Fetch coded block pattern.

								   add       bl,2                        ; Lower horz limit + 2.

								  and        cl,080H                     ; Isolate bit 7.

								   jne       NoBlockMotionVectors        ; Skip Block ME if forced intra.


								  mov        al,[edx].BestFullPelMBHMV   ; Compare full pel HMV against limits.

								   mov       cl,EMVLimitsForThisMB+3     ; Upper horz limit.

								  cmp        al,bl

								   jl        NoBlockMotionVectors        ; Signed: HMV < lower+2.


								  mov        bl,EMVLimitsForThisMB+5     ; Lower vert limit.

								   sub       cl,2                        ; Upper horz limit - 2.

								  cmp        al,cl                       ; Upper limit comparison.

								   jg        NoBlockMotionVectors        ; Signed: HMV > upper-2.


								  mov        al,[edx].BestFullPelMBVMV   ; Compare full pel VMV against limits.

								   add       bl,2                        ; Lower vert limit + 2.

								  mov        cl,EMVLimitsForThisMB+7     ; Upper vert limit.

								   cmp       al,bl

								  mov        ebx,PD [edx].BestFullPelMBVMV-3 ; Dword load; VMV is the top byte.  Flags kept.

								   jl        NoBlockMotionVectors        ; Signed: VMV < lower+2.


								  sar        ebx,18                      ; Sign-extended VMV now starts at bit 6.

								   sub       cl,2                        ; Upper vert limit - 2.

								  cmp        al,cl                       ; Upper limit comparison.

								   jg        NoBlockMotionVectors        ; Signed: VMV > upper-2.


								  mov        ecx,BestMBHalfPelSWD        ; Jump if SWD for MB MV <= thresh.

								IF PITCH-384

								*** error:  The magic here assumes a pitch of 384.

								ENDIF

								   and       ebx,0FFFFFF80H              ; VMV*128, i.e. (VMV>>1)*128 once bits 0:6 are cleared.

								  cmp        ecx,BLOCKMOTIONTHRESHOLD

								   jle       NoBlockMotionVectors


								;==========================================================================

								; Starting from the best full pel macroblock motion vector calculated above, we

								; search for the best block motion vectors.

								;

								;  ebp -- PITCH

								;  esi -- Address of ref block.

								;  edi -- Address of target block.

								;  edx -- Induction variable over luma blocks in MBlockAction Descriptor.

								;  ecx -- Scratch

								;  ebx -- CurrSWDState

								;  eax -- Scratch

								;  mm7 -- Best SWD for current block

								;  mm6 -- unused.

								;  mm5 -- Best SWD for right block of pair worked on by inner loop.

								;  mm0-mm4 Scratch

								;


								; Set up for block level ME.  On entry ebx is the multiple of 128 derived from

								; the MB VMV and ecx is BestMBHalfPelSWD (both from the limit checks above).

								; 1) Per-block SWD quadwords are stored at BlkLvlSWD+16 of each luma block as

								;    saturating differences of mm7 and the three HalfPelMBMESWDAccum entries.

								; 2) esi is pointed at the MB level full pel reference, one line down.

								; 3) LimitForSWDForBlkMV = min(3/4*BestMBHalfPelSWD, 3/4*SWDForNon0MVToBeat)

								;    - BLOCKMVDIFFERENTIAL; if that is not positive, block ME is abandoned.


								  movq       mm0,HalfPelMBMESWDAccum+8

								  movq       mm1,HalfPelMBMESWDAccum+16

								   psubusw   mm7,mm0                       ; mm7 - Accum[+8]   --> blk 4.

								  movq       mm2,HalfPelMBMESWDAccum+0

								   psubusw   mm0,mm1                       ; Accum[+8] - Accum[+16] --> blk 2.

								  movq       [edx].BlkY4.BlkLvlSWD+16,mm7

								   psubusw   mm1,mm2                       ; Accum[+16] - Accum[+0] --> blk 3.

								  movq       [edx].BlkY2.BlkLvlSWD+16,mm0

								  movq       [edx].BlkY3.BlkLvlSWD+16,mm1

								  movq       [edx].BlkY1.BlkLvlSWD+16,mm2  ; Accum[+0] as is --> blk 1.


								  movsx      eax,[edx].BestFullPelMBHMV

								  sar        eax,1                         ; HMV / 2.

								   lea       ebx,[ebx+ebx*2]               ; 128 * 3 = 384 = PITCH per unit.

								  mov        esi,Addr0MVRef

								   add       ebx,ebp                       ; Plus one more line (V+1).

								  mov        Addr0MVRefBlk,esi             ; 0-MV ref for blk 1.

								   add       esi,eax                       ; Apply horz offset.

								  lea        ecx,[ecx+ecx*2]               ; Best MBMV SWD times 3.

								   add       esi,ebx                       ; Try V+1 first

								  shr        ecx,2                         ; Best MBMV SWD * 3/4.

								   mov       eax,SWDForNon0MVToBeat

								  mov        BestBlockRefAddrVP1,esi       ; Stash BestBlockRefAddr

								   sub       ecx,BLOCKMVDIFFERENTIAL       ; Best MBMV SWD * 3/4 - Differential.

								  lea        eax,[eax+eax*2-BLOCKMVDIFFERENTIAL*4] ; Non0MBMVSWDToBeat*3-4*Diff.

								   mov       LimitForSWDForBlkMV,ecx

								; NOTE(review): the shr below is a logical shift.  If Non0MBMVSWDToBeat*3 is

								; smaller than 4*BLOCKMVDIFFERENTIAL, eax is negative here and becomes a large

								; positive value, so the signed jg keeps the ecx limit instead of rejecting.

								; Confirm whether sar was intended or whether that range cannot occur.

								  shr        eax,2                         ; Non0MBMVSWDToBeat * 3/4 - Differential.

								   mov       ebx,FIRSTBLOCKMESTATE

								  cmp        eax,ecx

								   jg        @f                            ; Keep ecx limit if it is the smaller.


								  mov        LimitForSWDForBlkMV,eax

								   mov       ecx,eax


								@@:


								  movdt      mm5,SWDURandLL     ; Get SWD for best MB level full pel MVs, blk 2.

								  test       ecx,ecx

								   jle       NoBlockMotionVectors          ; No positive budget: give up on blk MVs.

								  movdt      mm7,SWDULandLR     ; Get SWD for best MB level full pel MVs, blk 1.

								  movdf      SWDForBlock2Or4,mm5


								;============================================================================

								; Compute SWD for block.


								DoBlkMEForNextBlk:

								ComputeBlkSWD:


								; Compute the SWD of the 8x8 candidate at esi against the target at edi, fold

								; it into the running best in mm7, then step the search state machine.

								; Lines 1,3,5,7 are shifted left 8 before squaring (even pels); lines 0,2,4,6

								; are squared unshifted (odd pels, per the line 0 comment below).  Loops back

								; to ComputeBlkSWD until the new state is >= 240.


								  movq       mm0,[esi+ebp*1]

								  psubw      mm0,[edi+ebp*1]   ; Get diff for line 1.

								  movq       mm1,[esi+PITCH*3] ; Ref MB, upper left block, Line 3.

								   psllw     mm0,8             ; Extract diffs for line 1 even pels.

								  psubw      mm1,[edi+PITCH*3] ; Diff for line 3.

								   pmaddwd   mm0,mm0           ; Square of diffs for even pels of line 1.

								  movq       mm2,[esi+PITCH*5]

								   psllw     mm1,8

								  psubw      mm2,[edi+PITCH*5]

								   pmaddwd   mm1,mm1

								  movq       mm3,[esi+PITCH*7]

								   psllw     mm2,8

								  psubw      mm3,[edi+PITCH*7]

								   pmaddwd   mm2,mm2

								  movq       mm4,[esi]         ; Ref MB, upper left blk, Line 0.

								   psllw     mm3,8

								  psubw      mm4,[edi]         ; Diff for line 0.

								   paddusw   mm0,mm1           ; Accumulate SWD (lines 1 and 3).

								  movq       mm1,[esi+ebp*2]

								   pmaddwd   mm3,mm3

								  psubw      mm1,[edi+ebp*2]

								   paddusw   mm0,mm2

								  movq       mm2,[esi+ebp*4]

								   pmaddwd   mm4,mm4           ; Square of diffs for odd pels of line 0.

								  psubw      mm2,[edi+ebp*4]

								   paddusw   mm0,mm3

								  movq       mm3,[esi+PITCH*6]

								   pmaddwd   mm1,mm1

								  psubw      mm3,[edi+PITCH*6]

								   pmaddwd   mm2,mm2

								  paddusw    mm0,mm4

								   pmaddwd   mm3,mm3

								  paddusw    mm0,mm1

								   ;

								  paddusw    mm0,mm2

								   ;

								  paddusw    mm0,mm3

								   ;

								  punpckldq  mm1,mm0           ; Get low order SWD accum to high order of mm1.

								   movq      mm4,mm7           ; Get original Best SWD for block


								  paddusw    mm1,mm0           ; mm1[48:63] is SWD for block.

								   pxor      mm2,mm2

								  psrlq      mm1,48            ; mm1 is SWD for block.

								   ;

								  psubusw    mm4,mm1           ; BestSWD dim CandSWD: nonzero iff cand is lower.

								   xor       ecx,ecx           ; Clear ecx so the cl load below is zero-extended.

								  pcmpeqd    mm2,mm4           ; mm2[0:31] == 0 iff cand better, else -1.

								   psubusw   mm7,mm4           ; BestSWD dim (BestSWD dim CandSWD) --> new best.

								  ;

								   ;

								  movdf      eax,mm2           ; eax == 0 iff cand better, else -1.

								   ;


								;  Registers at this point:

								;  ebp -- PITCH

								;  esi -- Address of block of candidate ref area.

								;  edi -- Address of target block.

								;  edx -- Induction variable over luma blocks in MBlockAction Descriptor.

								;  ecx -- Zero.

								;  ebx -- CurrSWDState.

								;  eax -- 0 iff candidate SWD better, else -1.

								;  mm7 -- New best SWD for current block

								;  mm6 -- Unused.


								  movq       [edx].BlkY1.BlkLvlSWD,mm7   ; Save best blk level SWD.

								   pxor      mm6,mm6                     ; Spec zero to prep for half pel ME.

								  mov        cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.

								   mov       bl,StateEngine[eax+ebx*4+3] ; New state number; 255 means done.

								  add        esi,ecx                     ; Adjust ref addr for horz motion.

								   mov       eax,DoHalfPelME             ; 0 if not, -4 if so.

								  shr        ecx,4                       ; High nibble indexes the vert adjustment.

								   cmp       bl,240                      ; Terminal state?  (Any state >= 240.)

								  jae        @f


								  add        esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.

								   jmp       ComputeBlkSWD


								@@:

								  add        esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.

								   add       eax,4                       ; ZF set iff DoHalfPelME was -4.

								  mov        ecx,esi                     ; Copy of best full pel ref addr.  Flags kept.

								   jne       SkipHalfPelBlkME


								; Registers:

								;  ebp -- PITCH

								;  esi -- Address of best full pel reference macroblock

								;  edx -- Induction variable over luma blocks in MBlockAction Descriptor.

								;  ecx -- Copy of esi.

								;  edi -- Address of target block.

								;  ebx -- Scratch

								;  eax -- Set to 0 to cause HalfPelMotionEstimation to quit after one block.

								;  mm0:mm7 -- Scratch


								  mov        ebx,BestBlockRefAddrVP1

								   add       ecx,ebp

								  cmp        ebx,ecx

								   jne       FullPelBlkMEMovedFromCenter


								  movdf      BestBlkFullPelSWD,mm7        ; Stash SWD for best full pel MB MV.

								  movq       mm3,[edx].BlkY1.BlkLvlSWD+16 ; SWDs: H+1, H-1, V+1, V-1.

								  jmp        FullPelBlkMEDidNotMoveFromCenter


								FullPelBlkMEMovedFromCenter:


								  movdf      BestBlkFullPelSWD,mm7   ; Stash SWD for best full pel MB MV.

								   pxor      mm7,mm7                 ; Prep accumulator for half pel ME.


								  call       HalfPelMotionEstimation


								  lea        esi,[esi+ebp*8+8]            ; Fix reference pointer.

								   lea       edi,[edi+ebp*8+8]            ; Fix target pointer.


								FullPelBlkMEDidNotMoveFromCenter:


								; Convert the best full pel ref address back to a motion vector (cl = horz,

								; ch = vert) and start the half pel decision by comparing the left and right

								; SWDs held in the low dword of mm3.  The high dword (vertical SWDs) is

								; shifted down in mm3 for the code that follows.


								  mov        eax,esi

								   mov       ebx,Addr0MVRefBlk            ; Start to calc linearized MV.

								  sub        ecx,ebx                      ; Linearized Motion Vector

								   sub       eax,ebx                      ; Linearized Motion Vector

								  sar        eax,8                        ; Full pel vert lin offset div 256.

								   and       cl,07FH                      ; Full pel HMV

								  movdf      ebx,mm3                      ; ebx[ 0:15] -- SWD for rightward ref.

								  ;                                       ; ebx[16:31] -- SWD for leftward ref.

								   psrlq     mm3,32                       ; Vertical SWDs now in mm3[0:31].

								  mov        ch,UnlinearizedVertMV[eax]   ; Get full pel vert MV component.

								   mov       eax,ebx

								  shr        eax,16                       ; eax -- SWD for leftward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for rightward ref.

								  cmp        eax,ebx

								   jg        BlkME_RightBetterThanLeft    ; Left SWD larger, so right is better.


								BlkME_LeftBetterThanRight:


								  add        cl,cl

								   mov       ebx,BestBlkFullPelSWD

								  cmp        eax,ebx

								   jge       BlkME_CtrIsBestHMV


								BlkME_LeftBestHMV:


								  movdf      ebx,mm3                      ; ebx[ 0:15] -- SWD for downward ref.

								  ;                                       ; ebx[16:31] -- SWD for upward ref.

								  mov        BestHalfPelHorzSWD,eax

								   mov       eax,ebx

								  shr        eax,16                       ; eax -- SWD for upward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for downward ref.

								  cmp        eax,ebx

								   jg        BlkME_LeftBestHMV_DownBetterThanUp


								BlkME_LeftBestHMV_UpBetterThanDown:


								  cmp        eax,BestBlkFullPelSWD

								   jge       BlkME_LeftIsBest


								BlkME_LeftBestHMV_UpBestVMV:


								  sub        esi,PITCH+1                  ; Try ref 1/2 pel left and up

								   mov       BestHalfPelVertSWD,eax

								  mov        al,1


								  call       HalfPelMotionEstimationBothWays


								  lea        edi,[edi+ebp*8+8]

								   mov       eax,BestHalfPelVertSWD

								  lea        esi,[esi+PITCH*9+9]          ; Back to center.

								   cmp       eax,ebx

								  jle        BlkME_UpBetterThanUpLeft


								BlkME_UpLeftBetterThanUp:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       BlkME_LeftIsBest


								BlkME_UpLeftIsBest:


								  dec        cl                           ; Back up the horz MV one to the left.

								   lea       eax,[esi-PITCH-1]            ; Best is ref 1/2 pel left and up

								  dec        ch                           ; Back up the vert MV one up.

								   jmp       BlkME_HalfPelSearchDone


								BlkME_UpBetterThanUpLeft:


								  cmp        eax,BestHalfPelHorzSWD

								   jg        BlkME_LeftIsBest


								BlkME_UpIsBest:


								  dec        ch                           ; Back up the vert MV one up.

								   mov       ebx,eax

								  lea        eax,[esi-PITCH]              ; Best is ref 1/2 pel up

								   jmp       BlkME_HalfPelSearchDone


								BlkME_LeftBestHMV_DownBetterThanUp:


								  cmp        ebx,BestBlkFullPelSWD

								   jge       BlkME_LeftIsBest


								BlkME_LeftBestHMV_DownBestVMV:


								  dec        esi                          ; Try ref 1/2 pel left and down

								   mov       BestHalfPelVertSWD,ebx

								  mov        al,1


								  call       HalfPelMotionEstimationBothWays


								  lea        edi,[edi+ebp*8+8]

								   mov       eax,BestHalfPelVertSWD

								  lea        esi,[esi+ebp*8+9]            ; Back to center.

								   cmp       eax,ebx

								  jle        BlkME_DownBetterThanDownLeft


								BlkME_DownLeftBetterThanDown:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       BlkME_LeftIsBest


								BlkME_DownLeftIsBest:


								  dec        cl                           ; Back up the horz MV one to the left.

								   lea       eax,[esi-1]                  ; Best is ref 1/2 pel left and down

								  inc        ch                           ; Advance the vert MV one down.

								   jmp       BlkME_HalfPelSearchDone


								BlkME_DownBetterThanDownLeft:


								  cmp        eax,BestHalfPelHorzSWD

								   jle       BlkME_DownIsBest


								BlkME_LeftIsBest:


								  dec        cl                           ; Back up the horz MV one to the left.

								   lea       eax,[esi-1]                  ; Best is ref 1/2 pel left.

								  mov        ebx,BestHalfPelHorzSWD

								   jmp       BlkME_HalfPelSearchDone


								BlkME_RightBetterThanLeft:


								  add        cl,cl

								   mov       eax,BestBlkFullPelSWD

								  cmp        eax,ebx

								   jle       BlkME_CtrIsBestHMV


								BlkME_RightBestHMV:


								  movdf      eax,mm3                    ; eax[ 0:15] -- SWD for downward ref.

								  ;                                       ; eax[16:31] -- SWD for upward ref.

								  mov        BestHalfPelHorzSWD,ebx

								   mov       ebx,eax

								  shr        eax,16                       ; eax -- SWD for upward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for downward ref.

								  cmp        eax,ebx

								   jg        BlkME_RightBestHMV_DownBetterThanUp


								BlkME_RightBestHMV_UpBetterThanDown:


								  cmp        eax,BestBlkFullPelSWD

								   jge       BlkME_RightIsBest


								BlkME_RightBestHMV_UpBestVMV:


								  sub        esi,ebp                      ; Try ref 1/2 pel right and up

								   mov       BestHalfPelVertSWD,eax

								  mov        al,1


								  call       HalfPelMotionEstimationBothWays


								  lea        edi,[edi+ebp*8+8]

								   mov       eax,BestHalfPelVertSWD

								  lea        esi,[esi+PITCH*9+8]          ; Back to center.

								   cmp       eax,ebx

								  jle        BlkME_UpBetterThanUpRight


								BlkME_UpRightBetterThanUp:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       BlkME_RightIsBest


								BlkME_UpRightIsBest:


								  inc        cl                           ; Advance the horz MV one to right.

								   lea       eax,[esi-PITCH]              ; Best is ref 1/2 pel right and up

								  dec        ch                           ; Back up the vert MV one up.

								   jmp       BlkME_HalfPelSearchDone


								BlkME_UpBetterThanUpRight:


								  cmp        eax,BestHalfPelHorzSWD

								   jle       BlkME_UpIsBest


								BlkME_RightIsBest:


								  mov        ebx,BestHalfPelHorzSWD

								   inc       cl                           ; Advance the horz MV one to right.

								  mov        eax,esi

								   jmp       BlkME_HalfPelSearchDone


								BlkME_RightBestHMV_DownBetterThanUp:


								  cmp        ebx,BestBlkFullPelSWD

								   jge       BlkME_RightIsBest


								BlkME_RightBestHMV_DownBestVMV:


								  mov        BestHalfPelVertSWD,ebx

								   mov       al,1


								  call       HalfPelMotionEstimationBothWays


								  lea        edi,[edi+ebp*8+8]

								   mov       eax,BestHalfPelVertSWD

								  lea        esi,[esi+ebp*8+8]            ; Back to center.

								   cmp       eax,ebx

								  jle        BlkME_DownBetterThanDownRight


								BlkME_DownRightBetterThanDown:


								  cmp        ebx,BestHalfPelHorzSWD

								   jge       BlkME_RightIsBest


								BlkME_DownRightIsBest:


								  inc        cl                           ; Advance the horz MV one to right.

								   mov       eax,esi

								  inc        ch                           ; Advance vert MV one down.

								   jmp       BlkME_HalfPelSearchDone


								BlkME_DownBetterThanDownRight:


								  cmp        eax,BestHalfPelHorzSWD

								   jg        BlkME_RightIsBest


								BlkME_DownIsBest:


								  inc        ch                           ; Advance vert MV one down.

								   mov       ebx,eax

								  mov        eax,esi

								   jmp       BlkME_HalfPelSearchDone


								BlkME_CtrIsBestHMV:


								  movdf      eax,mm3                    ; eax[ 0:15] -- SWD for downward ref.

								  ;                                       ; eax[16:31] -- SWD for upward ref.

								  mov        ebx,eax

								  shr        eax,16                       ; eax -- SWD for upward ref.

								   and       ebx,00000FFFFH               ; ebx -- SWD for downward ref.

								  cmp        eax,ebx

								   jge       BlkME_CtrBestHMV_DownBetterThanUp


								BlkME_CtrBestHMV_UpBetterThanDown:


								  mov        ebx,BestBlkFullPelSWD

								  cmp        eax,ebx

								   jge       BlkME_CenterIsBest


								; Up is best.


								  mov        ebx,eax

								   dec       ch                           ; Back up the vert MV one up.

								  lea        eax,[esi-PITCH]              ; Best is ref 1/2 pel up

								   jmp       BlkME_HalfPelSearchDone


								BlkME_CtrBestHMV_DownBetterThanUp:


								; Center is the best horizontal position and the downward half pel candidate

								; is at least as good as the upward one.  ebx arrives holding the downward

								; SWD; it wins only if it is strictly below the best full pel block SWD.


								  mov        eax,ebx                      ; eax -- SWD for downward ref.

								   mov       ebx,BestBlkFullPelSWD        ; ebx -- SWD to report if center wins.

								  cmp        eax,ebx

								   jge       BlkME_CenterIsBest           ; Ties keep the full pel center.


								; Down is best.


								  mov        ebx,eax                      ; ebx -- SWD of the winner (down).

								   inc       ch                           ; Advance the vert MV one down.

								  mov        eax,esi                      ; Ref addr stays at esi (Up uses esi-PITCH).

								   jmp       BlkME_HalfPelSearchDone


								SkipHalfPelBlkME:


								  mov        eax,esi

								   mov       ebx,Addr0MVRefBlk            ; Start to calc linearized MV.

								  sub        ecx,ebx                      ; Linearized Motion Vector

								   sub       eax,ebx                      ; Linearized Motion Vector

								  sar        eax,8                        ; Full pel vert lin offset div 256.

								   and       cl,07FH                      ; Full pel HMV

								  add        cl,cl

								   ;

								  mov        ch,UnlinearizedVertMV[eax]   ; Get full pel vert MV component.

								   ;

								  movdf      ebx,mm7                      ; SWD for best full pel block MV.


								BlkME_CenterIsBest:


								  mov        eax,esi


								BlkME_HalfPelSearchDone:


								; Record the result for the current block (ebx = SWD, eax = ref address,

								; cl/ch = MV), charge the SWD against the remaining budget in

								; LimitForSWDForBlkMV, and advance to the next luma block.  Leaves via

								; BlkEst_EarlyOut as soon as the budget goes negative; falls out of the

								; bottom once all four blocks have been done.


								  mov        [edx].BlkY1.BlkLvlSWD,ebx

								   mov       [edx].BlkY1.PastRef,eax

								  mov        [edx].BlkY1.PHMV,cl        ; Save HMV

								   mov       eax,LimitForSWDForBlkMV    ; Does block's SWD put us over limit?

								  mov        [edx].BlkY1.PVMV,ch        ; Save VMV

								   sub       eax,ebx                    ; Remaining budget after this block.

								  jl         BlkEst_EarlyOut


								; NOTE(review): BestBlockRefAddrVP1 is only rewritten on the blk 3 path below,

								; so while blk 2 / blk 4 are searched it still holds the blk 1 / blk 3 address.

								; The center test ahead of the half pel search compares against it; confirm

								; that is intended.


								  mov        LimitForSWDForBlkMV,eax ; Remember how much is left for other blks.

								   mov       esi,BestBlockRefAddrVP1

								  add        edi,8                      ; Move to blk 2 or 4, V+4.

								   mov       ecx,Addr0MVRefBlk          ; Calc addr of 0MV ref for this blk.

								  add        esi,8                      ; Move to blk 2 or 4, V+4.

								   add       ecx,8

								  mov        Addr0MVRefBlk,ecx

								   add       edx,SIZEOF T_Blk           ; Increment to next block.

								  test       dl,SIZEOF T_Blk            ; Just finished blk 1 or 3?

								  movdt      mm7,SWDForBlock2Or4        ; Flags from the test survive these loads.

								  mov        ebx,FIRSTBLOCKMESTATE

								   jne       DoBlkMEForNextBlk          ; If so, go do blk 2 or 4.


								  lea        esi,[esi+ebp*8-8]          ; Move to blk 3

								   lea       ecx,[ecx+ebp*8-16]         ; 0MV ref: down 8 lines, back 16 pels.

								  mov        BestBlockRefAddrVP1,esi

								   lea       edi,[edi+ebp*8-16]         ; Target: down 8 lines, back 16 pels.

								  movdt      mm5,SWDULandLR+4           ; Get SWD for best MB level MVs, blk 4.

								  movdt      mm7,SWDURandLL+4           ; Get SWD for best MB level MVs, blk 3.

								  movdf      SWDForBlock2Or4,mm5

								  test       dl,2*SIZEOF T_Blk          ; Just finishing blk 2?

								  mov        Addr0MVRefBlk,ecx

								   jne       DoBlkMEForNextBlk          ; If so, go do blk 3.


								;==============================================================================

								; Block motion vectors are best.

								;

								; Reached when all four blocks stayed within the SWD budget.  edx has been

								; advanced past the four luma blocks, hence the -4*SIZEOF T_Blk bias.  The

								; per-block SWDs are copied back into SWDULandLR (blk 1, blk 4) and

								; SWDURandLL (blk 2, blk 3).  The block type becomes INTER4MV only if the low

								; 16 bits of the four MVs fields are not all identical.


								  mov        esi,[edx-4*SIZEOF T_Blk].BlkY1.BlkLvlSWD

								   mov       edi,[edx-4*SIZEOF T_Blk].BlkY4.BlkLvlSWD

								  mov        SWDULandLR,esi

								   mov       SWDULandLR+4,edi

								  mov        esi,[edx-4*SIZEOF T_Blk].BlkY3.BlkLvlSWD

								   mov       edi,[edx-4*SIZEOF T_Blk].BlkY2.BlkLvlSWD

								  mov        eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs

								   mov       ebx,[edx-4*SIZEOF T_Blk].BlkY2.MVs

								  mov        ecx,eax

								   xor       eax,ebx                    ; Y1 ^ Y2

								  xor        ecx,[edx-4*SIZEOF T_Blk].BlkY3.MVs ; Y1 ^ Y3

								   xor       ebx,[edx-4*SIZEOF T_Blk].BlkY4.MVs ; Y2 ^ Y4

								  mov        SWDURandLL,edi

								   or        eax,ebx

								  sub        edx,4*SIZEOF T_Blk         ; Restore MacroBlockActionStream ptr.

								   or        eax,ecx                    ; Zero in low 16 bits iff all four MVs match.

								  test       eax,0FFFFH

								   mov       SWDURandLL+4,esi           ; Store does not disturb the flags.

								  je         MotionVectorSettled        ; All four equal: block type left as is.


								  mov        al,INTER4MV               ; Set type for MB to INTER-coded, 4 MVs.

								  mov        [edx].BlockType,al

								   jmp       MotionVectorSettled


								BlkEst_EarlyOut:


								; Block ME exceeded its SWD budget part way through.  Clear the block index

								; bits from edx to get back to the start of the macroblock's descriptor, then

								; fall into the MB level MV versus zero MV decision.

								; presumably relies on the descriptor being aligned to 4*SIZEOF T_Blk -- TODO confirm.


								  and        edx,-1-3*SIZEOF T_Blk

								   mov       ecx,BestMBHalfPelSWD       ; Get total SWD for macroblock MV.


								BlockMVNotBigEnoughGain:               ; Try MB-level motion vector.


								  cmp        ecx,SWDForNon0MVToBeat

								   jge       NonZeroMVNotBigEnoughGain  ; MB MV SWD >= threshold: use the zero MV.


								ENDIF ; H263


								  mov        ebx,BestMBHalfPelMV

								   mov       esi,BestMBHalfPelRefAddr         ; Reload BestMBHalfPelRefAddr


								NonZeroMBLevelMVBest:


								; Non-zero macroblock level motion vector is best.


								  mov        [edx].BlkY1.MVs,ebx

								   mov       [edx].BlkY2.MVs,ebx

								  mov        [edx].BlkY3.MVs,ebx

								   mov       [edx].BlkY4.MVs,ebx

								  mov        [edx].BlkY1.PastRef,esi

								   lea       ecx,[esi+ebp*8]

								  mov        [edx].BlkY3.PastRef,ecx

								   add       esi,8

								  mov        [edx].BlkY2.PastRef,esi

								   add       ecx,8

								  mov        [edx].BlkY4.PastRef,ecx

								   jmp       MotionVectorSettled


								NoBlockMotionVectors:


								; Block level MVs were not attempted.  Choose between the best MB level MV and

								; the zero MV: the MB level MV is taken when SWDForNon0MVToBeat >= its SWD,

								; otherwise control falls into NonZeroMVNotBigEnoughGain.

								; NOTE(review): a tie selects the MB level MV here, whereas the compare at

								; BlockMVNotBigEnoughGain sends a tie to the zero MV.  Confirm which is meant.


								  mov        ecx,BestMBHalfPelSWD       ; Get total SWD for macroblock MV.

								   mov       eax,SWDForNon0MVToBeat

								  cmp        eax,ecx

								   mov       ebx,BestMBHalfPelMV        ; Loads below do not disturb the flags.

								  mov        esi,BestMBHalfPelRefAddr

								   jge       NonZeroMBLevelMVBest


								NonZeroMVNotBigEnoughGain:


								  mov        esi,Addr0MVRef             ; 0-MV ref block.

								  movq       mm6,SWD0MVULandLR

								  movq       mm5,SWD0MVURandLL

								  movq       SWDULandLR,mm6

								  movq       SWDURandLL,mm5


								BelowZeroThresh:


								  mov        [edx].BlkY1.PastRef,esi   ; Save address of ref block, all blks.

								   lea       eax,[esi+8]

								  mov        [edx].BlkY2.PastRef,eax

								   lea       eax,[esi+ebp*8]

								  mov        [edx].BlkY3.PastRef,eax

								   add       eax,8

								  mov        [edx].BlkY4.PastRef,eax

								   xor       eax,eax

								  mov        [edx].BlkY1.MVs,eax       ; Set horz and vert MVs to 0 in all blks.

								   mov       [edx].BlkY2.MVs,eax

								  mov        [edx].BlkY3.MVs,eax

								   mov       [edx].BestFullPelMBHMV,al

								  mov        [edx].BlkY4.MVs,eax

								   mov       [edx].BestFullPelMBVMV,al

								  mov        BestMBHalfPelMV,eax


								MotionVectorSettled:


								IFDEF H261


								;===============================================================================

								; For H261, we've settled on the best motion vector.  Now we need to determine

								; if spatial filtering should be done.

								;

								;  ebp -- PITCH

								;  esi -- Address of block of ref area.

								;  edi -- Address of spatially filtered block.

								;  edx -- MBlockActionStream

								;  ecx -- Loop counter.

								;  ebx -- Address of constant 0x7F in all 8 bytes.

								;  eax -- Scratch

								;  mm7 -- Mask to extract bytes 0 and 7.  (High bit of bytes 1:6 must be off).

								;  mm6 -- All bytes -1.

								;  mm5 -- Mask to extract bytes 1:6 and clear bit 8 thereof.


								  movdf      esi,mm7                 ; Restore non-SLF SWD for macroblock.

								  cmp        esi,SpatialFiltThreshold

								   jle       SkipSpatialFiltering


								  mov        ecx,DoSpatialFiltering   ; Are we doing spatial filtering?

								   mov       esi,[edx].BlkY1.PastRef

								  test       cl,cl

								   je        SkipSpatialFiltering


								DoSpatialFilterForChroma:

								DoSpatialFilterForLuma:


								  movq       mm5,C7F7F7F7F7F7F7F7F   ; Mask to extract bytes 1:6.

								  movdf      BestMBFullPelSWD,mm7    ; Stash SWD for best full pel MB MV.

								   psllq     mm5,16

								  psrlq      mm5,8

								   pcmpeqb   mm7,mm7

								  pxor       mm7,mm5                 ; Mask to extract bytes 0 and 7.

								   mov       edi,SpatiallyFilteredMB

								  lea        eax,[esi+ebp*4]

								   lea       ebx,C7F7F7F7F7F7F7F7F ; Address of this useful constant.


								SpatialFilterLoop:


								  movq       mm0,[esi]      ; 0a: <P7 P6 P5 P4 P3 P2 P1 P0>

								   pcmpeqb   mm6,mm6        ;     To add one to all bytes.

								  movq       mm4,mm0        ; 0b: <P7 P6 P5 P4 P3 P2 P1 P0>

								   psllq     mm0,16         ; 0c: <P5 P4 P3 P2 P1 P0  0  0>

								  movq       mm3,[esi+ebp*1]; 1a

								   paddb     mm0,mm4        ; 0d: <P7+P5 P6+P4 ... P3+P1 P2+P0 jnk  jnk >

								  movq       mm1,mm3        ; 1b

								   psrlq     mm0,9          ; 0e: <0  (P7+P5)/2 ... (P2+P0)/2 jnk>  (dirty)


								SpatialFilterLoop_BlockToRight:


								  pand       mm0,mm5        ; 0f: <0  (P7+P5)/2 ... (P2+P0)/2    0>  (clean)

								   psllq     mm1,16         ; 1c

								  paddb      mm0,mm4        ; 0g: <jnk   (P7+2P6+P5)/2 ...  (P2+2P1+P0)/2 jnk>

								   paddb     mm1,mm3        ; 1d

								  psubb      mm0,mm6        ; 0h: <jnk (P7+2P6+P5+2)/2 ... (P2+2P1+P0+2)/2 jnk>

								   psrlq     mm1,9          ; 1e

								  psrlq      mm0,1          ; 0i: <jnk (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2 jnk>

								   pand      mm4,mm7        ; 0j: <P7  0  0  0  0  0  0 P0>

								  pand       mm0,mm5        ; 0k: < 0 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2  0>

								   pand      mm1,mm5        ; 1f

								  por        mm0,mm4        ; 0l: <P7 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/4 P0>

								   paddb     mm1,mm3        ; 1g

								  movq       mm2,[esi+ebp*2]; 2a

								   psubb     mm1,mm6        ; 1h

								  movq       [edi],mm0      ; 0m: Store line 0 of filtered block.  This is R0.

								   movq      mm4,mm2        ; 2b

								  psrlq      mm1,1          ; 1i

								   pand      mm3,mm7        ; 1j

								  pand       mm1,mm5        ; 1k

								   psllq     mm2,16         ; 2c

								  por        mm1,mm3        ; 1l: This is R1

								   paddb     mm2,mm4        ; 2d

								  psubb      mm1,mm6               ; 1A: R1+1

								   psrlq     mm2,9                 ; 2e

								  pand       mm2,mm5               ; 2f

								   paddb     mm0,mm1               ; 1B: R0+R1+1

								  paddb      mm2,mm4               ; 2g

								   psrlq     mm0,1                 ; 1C: (R0+R1+1)/2  (dirty)

								  pand       mm0,[ebx]             ; 1D: (R0+R1+1)/2  (clean)

								   psubb     mm2,mm6               ; 2h

								  psrlq      mm2,1                 ; 2i

								   pand      mm4,mm7               ; 2j

								  movq       mm3,[esi+PITCH*3]     ; 3a

								   pand      mm2,mm5               ; 2k

								  por        mm2,mm4               ; 2l:  This is R2.

								   movq      mm4,mm3               ; 3b

								  paddb      mm1,mm2               ; 1E & 2B: R1+R2+1

								   psllq     mm3,16                ; 3c

								  psrlq      mm1,1                 ; 1F & 2C: (R1+R2+1)/2  (dirty)

								   paddb     mm3,mm4               ; 3d

								  pand       mm1,[ebx]             ; 1G & 2D: (R1+R2+1)/2  (clean)

								   psrlq     mm3,9                 ; 3e

								  paddb      mm0,mm1               ; 1H:  (R0+2R1+R2+2)/2

								   pand      mm3,mm5               ; 3f

								  psrlq      mm0,1                 ; 1I:  (R0+2R1+R2+2)/4  (dirty)

								   paddb     mm3,mm4               ; 3g

								  pand       mm0,[ebx]             ; 1J:  (R0+2R1+R2+2)/4  (clean)

								   psubb     mm3,mm6               ; 3h

								  psrlq      mm3,1                 ; 3i

								   pand      mm4,mm7               ; 3j

								  movq       [edi+ebp*1],mm0       ; 1K: Store line 1 of filtered block.

								   pand      mm3,mm5               ; 3k

								  movq       mm0,[eax]             ; 4a

								   por       mm3,mm4               ; 3l

								  psubb      mm3,mm6               ; 3A: R3+1

								   movq      mm4,mm0               ; 4b

								  paddb      mm2,mm3               ; 2E & 3B: R2+R3+1

								   psllq     mm0,16                ; 4c

								  psrlq      mm2,1                 ; 2F & 3C: (R2+R3+1)/2  (dirty)

								   paddb     mm0,mm4               ; 4d

								  pand       mm2,[ebx]             ; 2G & 3D: (R2+R3+1)/2  (clean)

								   psrlq     mm0,9                 ; 4e

								  paddb      mm1,mm2               ; 2H:  (R1+2R2+R3+2)/2

								   pand      mm0,mm5               ; 4f

								  psrlq      mm1,1                 ; 2I:  (R1+2R2+R3+2)/4  (dirty)

								   paddb     mm0,mm4               ; 4g

								  pand       mm1,[ebx]             ; 2J:  (R1+2R2+R3+2)/4  (clean)

								   psubb     mm0,mm6               ; 4h

								  psrlq      mm0,1                 ; 4i

								   pand      mm4,mm7               ; 4j

								  movq       [edi+ebp*2],mm1       ; 2K: Store line 2 of filtered block.

								   pand      mm0,mm5               ; 4k

								  movq       mm1,[eax+ebp*1]       ; 5a

								   por       mm0,mm4               ; 4l

								  movq       mm4,mm1               ; 5b

								   psllq     mm1,16                ; 5c

								  paddb      mm3,mm0               ; 3E & 4B: R3+R4+1

								   paddb     mm1,mm4               ; 5d

								  add        esi,8

								   psrlq     mm3,1                 ; 3F & 4C: (R3+R4+1)/2  (dirty)

								  pand       mm3,[ebx]             ; 3G & 4D: (R3+R4+1)/2  (clean)

								   psrlq     mm1,9                 ; 5e

								  paddb      mm2,mm3               ; 3H:  (R2+2R3+R4+2)/2

								   pand      mm1,mm5               ; 5f

								  psrlq      mm2,1                 ; 3I:  (R2+2R3+R4+2)/4  (dirty)

								   paddb     mm1,mm4               ; 5g

								  pand       mm2,[ebx]             ; 3J:  (R2+2R3+R4+2)/4  (clean)

								   psubb     mm1,mm6               ; 5h

								  psrlq      mm1,1                 ; 5i

								   pand      mm4,mm7               ; 5j

								  movq       [edi+PITCH*3],mm2     ; 3K: Store line 3 of filtered block.

								   pand      mm1,mm5               ; 5k

								  movq       mm2,[eax+ebp*2]       ; 6a

								   por       mm1,mm4               ; 5l

								  psubb      mm1,mm6               ; 5A: R5+1

								   movq      mm4,mm2               ; 6b

								  paddb      mm0,mm1               ; 4E & 5B: R4+R5+1

								   psllq     mm2,16                ; 6c

								  psrlq      mm0,1                 ; 4F & 5C: (R4+R5+1)/2  (dirty)

								   paddb     mm2,mm4               ; 6d

								  pand       mm0,[ebx]             ; 4G & 5D: (R4+R5+1)/2  (clean)

								   psrlq     mm2,9                 ; 6e

								  paddb      mm3,mm0               ; 4H:  (R3+2R4+R5+2)/2

								   pand      mm2,mm5               ; 6f

								  psrlq      mm3,1                 ; 4I:  (R3+2R4+R5+2)/4  (dirty)

								   paddb     mm2,mm4               ; 6g

								  pand       mm3,[ebx]             ; 4J:  (R3+2R4+R5+2)/4  (clean)

								   psubb     mm2,mm6               ; 6h

								  psrlq      mm2,1                 ; 6i

								   sub       cl,2                  ;     Loop control

								  movq       [edi+ebp*4],mm3       ; 4K: Store line 4 of filtered block.

								   pand      mm4,mm7               ; 6j

								  movq       mm3,[eax+PITCH*3]     ; 7a

								   pand      mm2,mm5               ; 6k

								  por        mm2,mm4               ; 6l

								   movq      mm4,mm3               ; 7b

								  paddb      mm1,mm2               ; 5E & 6B: R5+R6+1

								   psllq     mm3,16                ; 7c

								  psrlq      mm1,1                 ; 5F & 6C: (R5+R6+1)/2  (dirty)

								   paddb     mm3,mm4               ; 7d

								  pand       mm1,[ebx]             ; 5G & 6D: (R5+R6+1)/2  (clean)

								   psrlq     mm3,9                 ; 7e

								  paddb      mm0,mm1               ; 5H:  (R4+2R5+R6+2)/2

								   pand      mm3,mm5               ; 7f

								  psrlq      mm0,1                 ; 5I:  (R4+2R5+R6+2)/4  (dirty)

								   paddb     mm3,mm4               ; 7g

								  pand       mm0,[ebx]             ; 5J:  (R4+2R5+R6+2)/4  (clean)

								   psubb     mm3,mm6               ; 7h

								  psrlq      mm3,1                 ; 7i

								   pand      mm4,mm7               ; 7j

								  movq       [edi+PITCH*5],mm0     ; 5K: Store line 5 of filtered block.

								   pand      mm3,mm5               ; 7k

								  psubb      mm2,mm6               ; 7A: R6+1

								   por       mm3,mm4               ; 7l

								  paddb      mm2,mm3               ; 6E: R6+R7+1

								   lea       eax,[esi+ebp*4]

								  movq       mm0,[esi]             ; 0a:  for next iteration

								   psrlq     mm2,1                 ; 6F: (R6+R7+1)/2  (dirty)

								  pand       mm2,[ebx]             ; 6G: (R6+R7+1)/2  (clean)

								   movq      mm4,mm0               ; 0b:  for next iteration

								  movq       [edi+PITCH*7],mm3     ; 7m: Store line 7 of filtered block.

								   paddb     mm1,mm2               ; 6H: (R5+2R6+R7+2)/2

								  lea        edi,[edi+8]           ;     Advance output cursor.

								   psrlq     mm1,1                 ; 6I: (R5+2R6+R7+2)/4  (dirty)

								  pand       mm1,[ebx]             ; 6J: (R5+2R6+R7+2)/4  (clean)

								   psllq     mm0,16                ; 0c:  for next iteration

								  movq       mm3,[esi+ebp*1]       ; 1a:  for next iteration

								   paddb     mm0,mm4               ; 0d:  for next iteration

								  movq       [edi+PITCH*6-8],mm1   ; 6K: Store line 6 of filtered block.

								   movq      mm1,mm3               ; 1b:  for next iteration

								  psrlq      mm0,9                 ; 0e:  for next iteration

								   jg        SpatialFilterLoop_BlockToRight


								  lea        esi,[esi+ebp*8-16]

								   lea       eax,[eax+ebp*8-16]

								  lea        edi,[edi+ebp*8-16]

								   mov       cl,4

								  jl         SpatialFilterLoop


								SpatialFilterDone:

								; The spatial filter has finished.  Reload the target and filtered-reference
								; pointers, then dispatch on the sign of ch:
								;   ch > 0  -->  ReturnFromSpatialFilterForU
								;   ch < 0  -->  ReturnFromSpatialFilterForV
								;   ch = 0  -->  fall through to the luma SWD loop (ComputeSWDforSLFBlock).


								  mov        edi,TargetMacroBlockBaseAddr

								   mov       esi,SpatiallyFilteredMB

								  test       ch,ch             ; Sets the flags used by BOTH the jg and the jl below.

								   jg        ReturnFromSpatialFilterForU


								;  Registers at this point:

								;  ebp -- PITCH

								;  esi -- Address of upper left block of spatially filtered candidate ref area.

								;  edi -- Address of upper left block of target.

								;  edx -- MBlockActionStream

								;  ecx -- Scratch

								;  ebx -- Scratch

								;  eax -- Loop control

								;  mm0-mm4 -- Scratch

								;  mm5,mm6 -- SWD for each block

								;  mm7 -- SWD for macroblock

								;


								  movq       mm0,[esi+ebp*1]   ; Preload line 1 of filtered ref for the SWD loop.

								   pxor      mm7,mm7           ; Zero the macroblock SWD accumulator.

								  mov        al,3              ; Loop control for the first row of blocks.

								   jl        ReturnFromSpatialFilterForV  ; Flags still from "test ch,ch" (movq/pxor/mov leave flags alone).


								; Compute the SWD between the spatially filtered reference macroblock (esi)
								; and the target macroblock (edi), one 8x8 luma block at a time.
								;
								; Sampling:  lines 1,3,5,7 are shifted left by 8 before squaring (even pels);
								;            lines 0,2,4,6 are squared without the shift.
								;
								; Loop control (al):  first row enters with al=3 (3 -> 1 -> -1), second row
								; with al=4 (4 -> 2 -> 0).  "sub al,2" sets the flags;  jg repeats for the
								; block to the right, jl (taken only after the first row) starts the second
								; row.  The movq/lea/mov instructions between the sub and the jumps do not
								; alter the flags.
								;
								; Per-block SWDs are shifted through mm6 (previous block in the high dword,
								; current block in the low dword).  At the end of each row mm6 is copied to
								; mm5, and inside the loop the previous mm5 is copied to mm4.  When the loop
								; exits:
								;     mm4 = <SWD blk1 : SWD blk2>   (top row)
								;     mm5 = <SWD blk3 : SWD blk4>   (bottom row;  mm6 holds the same value)
								;     mm7 = SWD for the whole macroblock
								;
								; FIX:  the stores to SWDURandLL (block 2) and SWDULandLR (block 1) used to
								; read mm6, which holds the bottom row at that point, so blocks 1 and 2 were
								; given the SWDs of blocks 3 and 4.  They now read mm4, which holds the top row.

								ComputeSWDforSLFBlock:


								  psubw      mm0,[edi+ebp*1]   ; Get diff for line 1.


								ComputeSWDforSLFBlock_BlkToRight:


								  movq       mm1,[esi+PITCH*3] ; Ref MB, Line 3.

								   psllw     mm0,8             ; Extract diffs for line 1 even pels.

								  psubw      mm1,[edi+PITCH*3] ; Diff for line 3.

								   pmaddwd   mm0,mm0           ; Square of diffs for even pels of line 1.

								  movq       mm2,[esi+PITCH*5]

								   psllw     mm1,8

								  psubw      mm2,[edi+PITCH*5]

								   pmaddwd   mm1,mm1

								  movq       mm3,[esi+PITCH*7]

								   psllw     mm2,8

								  psubw      mm3,[edi+PITCH*7]

								   pmaddwd   mm2,mm2

								  movq       mm4,[esi]         ; Ref MB, upper left blk, Line 0.

								   psllw     mm3,8

								  psubw      mm4,[edi]         ; Diff for line 0.

								   paddusw   mm0,mm1           ; Accumulate SWD (lines 1 and 3).

								  movq       mm1,[esi+ebp*2]

								   pmaddwd   mm3,mm3

								  psubw      mm1,[edi+ebp*2]

								   paddusw   mm0,mm2

								  movq       mm2,[esi+ebp*4]

								   pmaddwd   mm4,mm4           ; Square of diffs for odd pels of line 0.

								  psubw      mm2,[edi+ebp*4]

								   paddusw   mm0,mm3

								  movq       mm3,[esi+PITCH*6]

								   pmaddwd   mm1,mm1

								  psubw      mm3,[edi+PITCH*6]

								   pmaddwd   mm2,mm2

								  paddusw    mm4,mm0

								   pmaddwd   mm3,mm3

								  paddusw    mm4,mm1

								   add       esi,8             ; Advance to the block to the right.

								  paddusw    mm4,mm2

								   add       edi,8

								  movq       mm0,[esi+ebp*1]   ; Preload line 1 of the next block.

								   paddusw   mm4,mm3

								  psubw      mm0,[edi+ebp*1]   ; Get diff for line 1.

								   punpckldq mm1,mm4           ; Get low order SWD accum to high order of mm1.

								  paddusw    mm1,mm4           ; mm1[48:63] is SWD for block.

								   psllq     mm6,32            ; Shift previous block's SWD left.

								  psrlq      mm1,48            ; mm1 is SWD for block.

								   sub       al,2              ; Loop control.

								  paddusw    mm7,mm1           ; Accumulate SWD for macroblock.

								   por       mm6,mm1           ; Save current block's SWD.

								  movq       mm4,mm5           ; Keep the previous row's block SWDs in mm4.

								   jg        ComputeSWDforSLFBlock_BlkToRight


								  movq       mm0,[esi+PITCH*9-16]    ; Line 1 of left block in next row.

								   movq      mm5,mm6                 ; mm5 = SWDs for the row just finished.

								  lea        edi,[edi+ebp*8-16]      ; Down one row of blocks, back to left column.

								   lea       esi,[esi+ebp*8-16]

								  mov        al,4

								   jl        ComputeSWDforSLFBlock   ; Flags still from "sub al,2".


								  mov        ebx,BestMBFullPelSWD    ; Restore non-SLF SWD for macroblock.

								   mov       eax,SpatialFiltDifferential

								  sub        ebx,eax                 ; SLF must beat non-SLF by this differential.

								   sub       edi,PITCH*16+16

								  movdf      eax,mm7                 ; SLF SWD for macroblock.

								  cmp        eax,ebx

								   jge       SpatialFilterNotAsGood


								; The spatially filtered reference wins.  Record the per-block SWDs, mark
								; the macroblock INTERSLF, and point the four luma blocks' PastRef at the
								; corresponding blocks of SpatiallyFilteredMB.

								  movdf      SWDULandLR+4,mm5        ; SWD for block 4 (lower right).

								   psrlq     mm5,32

								  movdf      SWDURandLL+4,mm5        ; SWD for block 3 (lower left).

								  movdf      SWDURandLL,mm4          ; SWD for block 2 (upper right).

								   psrlq     mm4,32

								  movdf      SWDULandLR,mm4          ; SWD for block 1 (upper left).

								  mov        al,INTERSLF

								   mov       ebx,SpatiallyFilteredMB

								  mov        [edx].BlockType,al

								   sub       esi,PITCH*8-8           ; esi was base+PITCH*16;  now base+PITCH*8+8.

								  mov        [edx].BlkY4.PastRef,esi

								   mov       [edx].BlkY1.PastRef,ebx

								  sub        esi,8                   ; base+PITCH*8.

								   add       ebx,8                   ; base+8.

								  mov        [edx].BlkY3.PastRef,esi

								   mov       [edx].BlkY2.PastRef,ebx


								SkipSpatialFiltering:

								SpatialFilterNotAsGood:

								ENDIF ; H261


								;===============================================================================

								; We've settled on the motion vector that will be used if we do indeed code the

								; macroblock with inter-coding.  We need to determine if some or all of the

								; blocks can be forced as empty (copy).  If all the blocks can be forced

								; empty, we force the whole macroblock to be empty.
								;
								; For each luma block whose SWD is not greater than EMPTYTHRESHOLD (signed
								; compare), the block's bit in CodedBlocks is cleared and its SWD is treated
								; as zero.  ebx accumulates the sum of the SWDs of the blocks left coded.
								; Only when all four luma blocks remain coded is the sum stored in
								; [edx].SWD and compared with INTERCODINGTHRESHOLD to decide whether the
								; Intra SWD must also be calculated.


								  mov        esi,EMPTYTHRESHOLD         ; Get threshold for forcing block empty.

								   mov       ebx,SWDULandLR             ; Get SWD for block 1.

								  mov        al,[edx].CodedBlocks

								   cmp       ebx,esi                    ; Is SWD > threshold?

								  jg         @f


								  and        al,0FEH                    ; If not, indicate block 1 is NOT coded.

								   xor       ebx,ebx                    ; An empty block contributes no SWD.


								@@:


								  mov        ecx,SWDURandLL             ; Get SWD for block 2.

								  cmp        ecx,esi

								   jg        @f


								  and        al,0FDH                    ; Block 2 is NOT coded.

								   xor       ecx,ecx


								@@:


								  add        ebx,ecx

								   mov       ecx,SWDURandLL+4           ; Get SWD for block 3.

								  cmp        ecx,esi

								   jg        @f


								  and        al,0FBH                    ; Block 3 is NOT coded.

								   xor       ecx,ecx


								@@:


								  add        ebx,ecx

								   mov       ecx,SWDULandLR+4           ; Get SWD for block 4.

								  cmp        ecx,esi

								   jg        @f


								  and        al,0F7H                    ; Block 4 is NOT coded.

								   xor       ecx,ecx


								@@:


								  mov        [edx].CodedBlocks,al  ; Store coded block pattern.

								   and       al,00FH               ; Keep only the four luma bits in al.

								  add        ebx,ecx               ; ebx = sum of SWDs of blocks still coded.

								   cmp       al,00FH               ; Are any blks marked empty?

								  jne        InterBest             ; If some blks are empty, can't code as Intra


								  mov        edi,TargetMacroBlockBaseAddr

								   mov       [edx].SWD,ebx

								  cmp        ebx,INTERCODINGTHRESHOLD  ; Is InterSWD below inter-coding thresh?

								   jae       CalculateIntraSWD         ; No (unsigned compare):  consider Intra.


								InterBestX:


								  mov        ebx,[edx].SWD         ; Reload the Inter SWD stored above.


								InterBest:


								  mov        ecx,SWDTotal          ; Add to total for this macroblock class.

								  add        ecx,ebx

								IFDEF H261

								  mov        SWDTotal,ecx

								ELSE ;H263

								   mov       bl,DoAdvancedPrediction

								  mov        SWDTotal,ecx

								   test      bl,bl

								  jne        OBMCDifferencing      ; Advanced Prediction selected:  take the OBMC path.

								ENDIF


								;============================================================================

								; Perform differencing for the non-empty luma blocks of an Inter-coded

								; macroblock.  This is the non-OBMC case;  i.e. Advanced Prediction is

								; not selected.

								;

								;  ebp -- PITCH

								;  esi -- Address of reference block.

								;  edi -- Address of target block.

								;  edx -- MBlockActionStream.  Used as cursor over luma blocks.

								;  ecx -- Not in use.

								;  ebx -- Scratch.  Used to test half pel MV resolution.

								;  eax[0:3] -- Coded block pattern for luma blocks.
								;
								; Each of the four luma blocks follows the same sequence:  skip if its
								; CodedBlocks bit is clear;  otherwise call DoNonOBMCDifferencing, finish
								; the differences for lines 4..7 inline, call MMxDoForwardDCTx, and subtract
								; bl (shifted to the block's bit position) from CodedBlocks.
								; NOTE(review):  bl after MMxDoForwardDCTx is presumably 1 when the block
								; turned out empty and 0 otherwise -- confirm against MMxDoForwardDCTx.


								  mov        cl,INTER1MV

								   mov       ebx,TargetMacroBlockBaseAddr

								  mov        StashBlockType,cl

								   test      al,1                        ; Don't diff block 1 if marked empty.

								  mov        edi,ebx                     ; Block 1 is at the macroblock base.

								   je        @f


								  mov        ebx,[edx].BlkY1.MVs

								   mov       esi,[edx].BlkY1.PastRef


								  call       DoNonOBMCDifferencing


								                                 ; (Finish differencing the last four lines.)

								  movq       mm4,[edi+ebp*4]     ; T4

								   psrlq     mm1,1

								  movq       mm5,[edi+PITCH*5]   ; T5

								   psubb     mm4,mm0             ; D4 = T4 - P4

								  movq       mm0,[edi+PITCH*6]   ; T6

								   psubb     mm5,mm1             ; D5 = T5 - P5

								  movq       mm1,[edi+PITCH*7]   ; T7

								   pand      mm2,mm6

								  pand       mm3,mm6

								   psrlq     mm2,1

								  movq       PelDiffsLine4,mm4   ; Store D4.

								   psubb     mm0,mm2             ; D6 = T6 - P6

								  movq       PelDiffsLine5,mm5   ; Store D5.

								   psrlq     mm3,1

								  movq       PelDiffsLine6,mm0   ; Store D6.

								   psubb     mm1,mm3             ; D7 = T7 - P7.  Left in mm1;  not stored here.

								  push       eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>


								  call       MMxDoForwardDCTx      ; Block is in PelDiffs block;  Pitch is 16


								  mov        al,[edx].CodedBlocks

								  sub        al,bl                 ; Adjust bit 0 (block 1) by bl from the DCT.

								   mov       ebx,TargetMacroBlockBaseAddr

								  mov        [edx].CodedBlocks,al

								   pop       edi                   ; Adjust stack pointer

								StackOffset TEXTEQU <0>


								@@:


								  lea        edi,[ebx+8]                 ; Get address of next block to do (block 2).

								   test      al,2                        ; Don't diff block 2 if marked empty.

								  je         @f


								  mov        ebx,[edx].BlkY2.MVs

								   mov       esi,[edx].BlkY2.PastRef


								  call       DoNonOBMCDifferencing


								                                 ; (Finish differencing the last four lines.)

								  movq       mm4,[edi+ebp*4]     ; T4

								   psrlq     mm1,1

								  movq       mm5,[edi+PITCH*5]   ; T5

								   psubb     mm4,mm0             ; D4 = T4 - P4

								  movq       mm0,[edi+PITCH*6]   ; T6

								   psubb     mm5,mm1             ; D5 = T5 - P5

								  movq       mm1,[edi+PITCH*7]   ; T7

								   pand      mm2,mm6

								  pand       mm3,mm6

								   psrlq     mm2,1

								  movq       PelDiffsLine4,mm4   ; Store D4.

								   psubb     mm0,mm2             ; D6 = T6 - P6

								  movq       PelDiffsLine5,mm5   ; Store D5.

								   psrlq     mm3,1

								  movq       PelDiffsLine6,mm0   ; Store D6.

								   psubb     mm1,mm3             ; D7 = T7 - P7.  Left in mm1;  not stored here.

								  push       eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>


								  call       MMxDoForwardDCTx      ; Block is in PelDiffs block;  Pitch is 16


								  shl        bl,1                  ; Move bl from the DCT to bit 1 (block 2).

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   mov       ebx,TargetMacroBlockBaseAddr

								  mov        [edx].CodedBlocks,al

								   pop       edi                   ; Adjust stack pointer

								StackOffset TEXTEQU <0>


								@@:


								  lea        edi,[ebx+ebp*8]             ; Get address of next block to do (block 3).

								   test      al,4                        ; Don't diff block 3 if marked empty.

								  je         @f


								  mov        ebx,[edx].BlkY3.MVs

								   mov       esi,[edx].BlkY3.PastRef


								  call       DoNonOBMCDifferencing


								                                 ; (Finish differencing the last four lines.)

								  movq       mm4,[edi+ebp*4]     ; T4

								   psrlq     mm1,1

								  movq       mm5,[edi+PITCH*5]   ; T5

								   psubb     mm4,mm0             ; D4 = T4 - P4

								  movq       mm0,[edi+PITCH*6]   ; T6

								   psubb     mm5,mm1             ; D5 = T5 - P5

								  movq       mm1,[edi+PITCH*7]   ; T7

								   pand      mm2,mm6

								  pand       mm3,mm6

								   psrlq     mm2,1

								  movq       PelDiffsLine4,mm4   ; Store D4.

								   psubb     mm0,mm2             ; D6 = T6 - P6

								  movq       PelDiffsLine5,mm5   ; Store D5.

								   psrlq     mm3,1

								  movq       PelDiffsLine6,mm0   ; Store D6.

								   psubb     mm1,mm3             ; D7 = T7 - P7.  Left in mm1;  not stored here.

								  push       eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>


								  call       MMxDoForwardDCTx      ; Block is in PelDiffs block;  Pitch is 16


								  shl        bl,2                  ; Move bl from the DCT to bit 2 (block 3).

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   mov       ebx,TargetMacroBlockBaseAddr

								  mov        [edx].CodedBlocks,al

								   pop       edi                   ; Adjust stack pointer

								StackOffset TEXTEQU <0>


								@@:


								  lea        edi,[ebx+ebp*8+8]           ; Get address of next block to do (block 4).

								   test      al,8                        ; Don't diff block 4 if marked empty.

								  je         NonOBMCDifferencingDone


								  mov        ebx,[edx].BlkY4.MVs

								   mov       esi,[edx].BlkY4.PastRef


								  call       DoNonOBMCDifferencing


								                                 ; (Finish differencing the last four lines.)

								  movq       mm4,[edi+ebp*4]     ; T4

								   psrlq     mm1,1

								  movq       mm5,[edi+PITCH*5]   ; T5

								   psubb     mm4,mm0             ; D4 = T4 - P4

								  movq       mm0,[edi+PITCH*6]   ; T6

								   psubb     mm5,mm1             ; D5 = T5 - P5

								  movq       mm1,[edi+PITCH*7]   ; T7

								   pand      mm2,mm6

								  pand       mm3,mm6

								   psrlq     mm2,1

								  movq       PelDiffsLine4,mm4   ; Store D4.

								   psubb     mm0,mm2             ; D6 = T6 - P6

								  movq       PelDiffsLine5,mm5   ; Store D5.

								   psrlq     mm3,1

								  movq       PelDiffsLine6,mm0   ; Store D6.

								   psubb     mm1,mm3             ; D7 = T7 - P7.  Left in mm1;  not stored here.

								  push       eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>


								  call       MMxDoForwardDCTx      ; Block is in PelDiffs block;  Pitch is 16


								  shl        bl,3                  ; Move bl from the DCT to bit 3 (block 4).

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   pop       edi                   ; Adjust stack pointer

								  mov        [edx].CodedBlocks,al


								StackOffset TEXTEQU <0>

								; All four luma blocks have been handled.  For H261 go straight to the next
								; macroblock.  For H263, a nonzero IsPlainPFrame also goes straight to the
								; next macroblock;  otherwise MMxDoBFrameLumaBlocks is called first.

								NonOBMCDifferencingDone:


								IFDEF H261

								ELSE

								   mov       al,IsPlainPFrame

								  test       al,al

								   jne       NextMacroBlock               ; Plain P frame:  no B-frame work.


								  movq       mm6,C0101010101010101

								   pxor      mm7,mm7                      ; Initialize SWD accumulator


								  call       MMxDoBFrameLumaBlocks


								ENDIF

								  jmp        NextMacroBlock


								;============================================================================

								;  Register usage in the following internal function.  This function does

								;  half pel motion estimation for whole macroblocks, or individual blocks.

								;

								;  ebp -- PITCH

								;  esi -- Address of best full pel reference macroblock.  For MBME unchanged

								;         at exit.  For BlkME, adjusted by -8-8*PITCH.

								;  edi -- Address of target macroblock.  For MBME unchanged at exit.  For BlkME,

								;         adjusted by -8-8*PITCH.

								;  edx -- MBlockActionStream

								;  ecx -- Reserved.

								;  ebx -- For MBME:  240 + Flags to indicate which half pel ME to do:

								;                    1 --> right;   2 --> left;   4 --> down;   8 --> up

								;         For BlkME: Garbage

								;  eax -- Count from -4 to -1 for blocks of macroblock.  0 for single block.

								;  mm7 -- Initialized to zero.

								;  mm6 -- Initialized to zero.

								;  mm0:mm7 -- Scratch

								;  mm3[ 0:15] -- SWD for ref 1/2 pel rightward

								;  mm3[16:31] -- SWD for ref 1/2 pel leftward

								;  mm3[32:47] -- SWD for ref 1/2 pel downward

								;  mm3[48:63] -- SWD for ref 1/2 pel upward
								;
								;  Loop control.  The top two bits of bl count 2-line steps within a block:
								;  "add bl,080H" carries out on every second step (half a block done), and
								;  the following "add bl,040H" sets the sign bit on the second half, so four
								;  steps (8 lines) are done per block;  "sub bl,080H" then restores bl to its
								;  low nibble.  The MMX instructions between each add and its jnc/jns do not
								;  alter the flags.  "add eax,2" steps down a column of blocks (-4,-2 then
								;  -3,-1);  its flags feed both the jl and, across the flag-preserving lea
								;  instructions, the je that starts the right-hand column.
								;
								;  Each block's four SWDs (mm3) are stored at HalfPelMBMESWDAccum[eax*8+32].
								;  At exit eax holds the low nibble of bl, zero extended.


								StackOffset TEXTEQU <4>

								HalfPelMotionEstimation:


								  and       bl,15              ; Keep the flag nibble;  top bits become the line counter.


								HalfPelMBMEForUpperBlock:

								HalfPelMEForFirst2LinesOfBlock:


								  movq       mm0,[esi-PITCH]   ; <P^7 P^6 P^5 P^4 P^3 P^2 P^1 P^0>

								  movq       mm1,[esi]         ; <P07 P06 P05 P04 P03 P02 P01 P00>

								  movq       mm4,[edi+ebp*1]   ; <T17 T16 T15 T14 T13 T12 T11 T10>

								   paddb     mm0,mm1           ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>


								HalfPelMEForNext2LinesOfBlock:


								  movq       mm2,[esi+ebp*1]   ; <P17 P16 P15 P14 P13 P12 P11 P10>

								   psrlw     mm0,1             ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>

								  movq       mm5,mm1           ; <P07 P06 P05 P04 P03 P02 P01 P00>

								   psllw     mm4,8             ; <T16 0 T14 0 T12 0 T10 0>


								HalfPelMBMEForLowerBlock:


								  psubw      mm0,[edi]         ; <(P^7+P07)/2-T07 junk (P^5+P05)/2-T05 junk ...>

								   paddb     mm5,mm2           ; <P07+P17 P06+P16 P05+P15 P04+P14 ...>

								  pmullw     mm1,C0101010101010101  ; <(P07+P06)*256+P06 ...>

								   psllw     mm5,8             ; <(P06+P16) 0 (P04+P14) 0 ...>

								  pmaddwd    mm0,mm0           ; Square diff for line 0 odd pels, upward ref.

								   psrlw     mm5,1             ; <(P06+P16)/2 0 (P04+P14)/2 0 ...>

								  movq       mm3,[edi]         ; <T07 T06 T05 T04 T03 T02 T01 T00>

								   psubw     mm4,mm5           ; <T16-(P06+P16)/2 junk ...>

								  pmaddwd    mm4,mm4           ; Square diff for line 1 even pels, upward ref.

								   psrlw     mm1,1             ; <(P07+P06)*128+P06/2 ...>

								  psllw      mm3,8             ; <T06 0 T04 0 T02 0 T00 0>

								   lea       edi,[edi+ebp*2]   ; Advance Target cursor

								  psubw      mm3,mm1           ; <T06-(P07+P06)/2 junk T04-(P05+P04)/2 junk ...>

								   lea       esi,[esi+ebp*2]   ; Advance Reference cursor

								  psubw      mm1,[edi-PITCH*2] ; <(P07+P06)/2-T07 junk (P05+P04)/2-T05 junk ...>

								   pmaddwd   mm3,mm3           ; Square diff for line 0 even pels, rightward ref.

								  pmaddwd    mm1,mm1           ; Square diff for line 0 odd pels, leftward ref.

								   paddusw   mm0,mm4           ; SSD for line 0 and 1, upward ref.

								  pand       mm0,CFFFF0000FFFF0000 ; Extract SSD for line 0 and 1, upward ref.

								   movq      mm4,mm2           ; <P17 P16 P15 P14 P13 P12 P11 P10>

								  paddusw    mm6,mm0           ; Accumulate SSD for line 0 and 1, upward ref.

								   psrlq     mm4,8             ; <  0 P17 P16 P15 P14 P13 P12 P11>

								  pand       mm1,CFFFF0000FFFF0000 ; Extract SSD for line 0, leftward ref.

								   psrld     mm3,16            ; Extract SSD for line 0, rightward ref.

								  pmullw     mm4,C0200010101010101  ; <P17*256*2 (P16+P15)*256+P15 ...>

								   paddw     mm3,mm1           ; SSD for line 0, leftward and rightward refs.

								  movq       mm1,[esi]         ; <P27 P26 P25 P24 P23 P22 P21 P20>

								   movq      mm0,mm2           ; <P17 P16 P15 P14 P13 P12 P11 P10>

								  paddusw    mm7,mm3           ; Accumulate SSD for line 0, left and right refs.

								   paddb     mm2,mm1           ; <P17+P27 P16+P26 P15+P25 P14+P24 ...>

								  movq       mm3,mm0           ; <P17 P16 P15 P14 P13 P12 P11 P10>

								   psrlw     mm4,1             ; <P17 (P16+P15)*128+P15/2 ...>

								  psubw      mm4,[edi-PITCH*1] ; <P17-T17 junk (P16+P15)/2-T15 junk ...>

								   psllq     mm3,8             ; <P16 P15 P14 P13 P12 P11 P10   0>

								  pmullw     mm3,C0101010101010002  ; <(P16+P15)*256+P15 ... P10*256*2>

								   psrlw     mm2,1             ; <(P17+P27)/2 junk (P15+P25)/2 junk ...>

								  movq       StashMM6,mm6      ; Free mm6 for scratch use;  accumulator is re-read below.

								   pmaddwd   mm4,mm4           ; Square diff for line 1 odd pels, rightward ref.

								  movq       mm6,[edi-PITCH*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>

								   psrlw     mm3,1             ; <(P16+P15)*128+P15/2 ... P10*256>

								  psubw      mm2,[edi-PITCH*1] ; <(P17+P27)/2-T17 junk (P15+P25)/2-T15 junk ...>

								   psllw     mm6,8             ; <T16 0 T14 0 T12 0 T10 0>

								  psubw      mm3,mm6           ; <(P16+P15)/2-T16 junk ... P10-T10>

								   psrld     mm4,16            ; Extract SSD for line 1, rightward ref.

								  movq       mm6,[edi-PITCH*2] ; <T07 T06 T05 T04 T03 T02 T01 T00>

								   pmaddwd   mm3,mm3           ; Square diff for line 1 even pels, leftward ref.

								  pmaddwd    mm2,mm2           ; Square diff for line 1 odd pels, downward ref.

								   psllw     mm6,8             ; <T06 0 T04 0 T02 0 T00 0>

								  paddusw    mm7,mm4           ; Accumulate SSD for line 1, rightward ref.

								   psubw     mm6,mm5           ; <T06-(P06+P16)/2 junk ...>

								  pand       mm3,CFFFF0000FFFF0000 ; Extract SSD for line 1, leftward ref.

								   pmaddwd   mm6,mm6           ; Square diff for line 0 even pels, downward ref.

								  add        bl,080H           ; Line-pair counter.  Carry out on every second pass.

								   psrld     mm2,16            ; Extract SSD for line 1, downward ref.

								  paddusw    mm2,StashMM6      ; Accumulate SSD for line 1, downward ref.

								   paddusw   mm7,mm3           ; Accumulate SSD for line 1, leftward ref.

								  movq       mm4,[edi+ebp*1]   ; <T17 T16 T15 T14 T13 T12 T11 T10>

								   psrld     mm6,16            ; Extract SSD for line 0, downward ref.

								  paddusw    mm6,mm2           ; Accumulate SSD for line 0, downward ref.

								   paddb     mm0,mm1           ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>

								  punpckldq  mm5,mm6           ; Speculatively start to accum partial SWDs.

								   jnc       HalfPelMEForNext2LinesOfBlock  ; Iterate twice, for half a block.


								  punpckldq  mm3,mm7

								   add       bl,040H           ; Half-block counter.  Sign set after the second half.

								  paddusw    mm5,mm6

								   jns       HalfPelMEForNext2LinesOfBlock  ; Iterate twice, for a whole block.


								  paddusw    mm3,mm7

								   psrlw     mm0,1             ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>

								  movq       mm2,[esi+ebp*1]   ; <P17 P16 P15 P14 P13 P12 P11 P10>

								   punpckhdq mm3,mm5           ; mm3[ 0:15] -- SWD for ref 1/2 pel rightward

								   ;                           ; mm3[16:31] -- SWD for ref 1/2 pel leftward

								   ;                           ; mm3[32:47] -- SWD for ref 1/2 pel downward

								   ;                           ; mm3[48:63] -- SWD for ref 1/2 pel upward

								  movq       mm5,mm1           ; <P07 P06 P05 P04 P03 P02 P01 P00>

								   sub       bl,080H           ; Restore bl to its low nibble for the next block.

								  movq       HalfPelMBMESWDAccum[eax*8+32],mm3  ; Save this block's four SWDs.

								   psllw     mm4,8             ; <T16 0 T14 0 T12 0 T10 0>

								  add        eax,2

								   jl        HalfPelMBMEForLowerBlock       ; Iterate twice for 2 blocks.


								  lea        edi,[edi-PITCH*16+8]           ; Top of the column to the right.

								   lea       esi,[esi-PITCH*16+8]

								  lea        eax,[eax-3]                    ; lea keeps the flags from "add eax,2".

								   je        HalfPelMBMEForUpperBlock       ; Iterate twice for macroblock.


								  sub        edi,16

								   xor       eax,eax

								  sub        esi,16

								   mov       al,bl             ; Return the low nibble of bl in eax.

								  ret


								StackOffset TEXTEQU <0>


								;============================================================================

								;  Register usage in the following internal function.  This function does

								;  half pel motion estimation in both directions for whole macroblocks, or

								;  individual blocks.

								;

								;  ebp -- PITCH

								;  esi -- Address of best full pel reference macroblock.  For MBME unchanged

								;         at exit.  For BlkME, adjusted by -8-8*PITCH.

								;  edi -- Address of target macroblock.  For MBME unchanged at exit.  For BlkME,

								;         adjusted by -8-8*PITCH.

								;  edx -- MBlockActionStream

								;  ecx -- Reserved.  Contains motion vectors.

								;  ebx -- Returns SWD for this reference block or macroblock.

								;  al  -- Count from 4 to 1 for blocks of macroblock.  1 for blk only.

								;  mm0:mm6 -- Scratch

								;  mm7 -- Reserved.  Contains SWDs for four 1/2 pel refs at main compass points.

								;  mm4 -- Returns SWD for this reference block or macroblock.
								;
								;  Loop control.  The top two bits of al count 2-line steps within a block:
								;  "add al,080H" carries out on every second step, and "add al,040H" sets
								;  the sign bit on the second half, so four steps (8 lines) are done per
								;  block.  "sub al,082H" then removes the counter bits and steps the block
								;  count down by 2 (4,2 for the left column;  3,1 for the right column).
								;  Its flags feed both the jg and, across the flag-preserving lea and mov
								;  instructions, the je that starts the right-hand column.
								;
								;  mm6 is zeroed only at entry, so the SSD accumulates across all the
								;  blocks processed by one call.


								StackOffset TEXTEQU <4>

								HalfPelMotionEstimationBothWays:


								  movq       mm3,C0101010101010101

								   pxor      mm6,mm6                ; Zero out SSD accumulator.


								HalfPelMBMEForUpperBlockBothWays:

								HalfPelMEForFirst2LinesOfBlockBothWays:


								  movq       mm0,[esi]         ; <P07 P06 P05 P04 P03 P02 P01 P00>


								HalfPelMEForNext2LinesOfBlockBothWays:

								HalfPelMBMEForLowerBlockBothWays:


								  movq       mm1,[esi+ebp*1]   ; <P17 P16 P15 P14 P13 P12 P11 P10>

								   pmullw    mm0,mm3           ; <(P07+P06)*256+P06 ...>

								  movq       mm2,[esi+ebp*2]   ; <P27 P26 P25 P24 P23 P22 P21 P20>

								   pmullw    mm3,mm1           ; <(P17+P16)*256+P16 ...>

								  movq       mm4,mm2           ; <P27 P26 P25 P24 P23 P22 P21 P20>

								   psrlq     mm2,8             ; <  0 P27 P26 P25 P24 P23 P22 P21>

								  pmullw     mm2,C0200010101010101 ; <P27*256*2 (P26+P25)*256+P25 ...>

								   psrlq     mm1,8             ; <  0 P17 P16 P15 P14 P13 P12 P11>

								  pmullw     mm1,C0200010101010101 ; <P17*256*2 (P16+P15)*256+P15 ...>

								   psrlw     mm3,2             ; <(P17+P16)/4 junk ...> (w/ 2 frac bits)

								  movq       mm5,[edi]         ; <T07 T06 T05 T04 T03 T02 T01 T00>

								   psrlw     mm0,2             ; <(P07+P06)/4 junk ...> (w/ 2 frac bits)

								  paddw      mm3,mm0           ; <(P07+P06+P17+P16)/4 junk ...>

								   psrlw     mm2,2             ; <P27/2 junk (P26+P25)/4 junk ...>

								  psubw      mm2,[edi+ebp*1]   ; <P27/2-T17 junk (P26+P25)/4-T15 junk ...>

								   psrlw     mm1,2             ; <P17/2 junk (P16+P15)/4 junk ...>

								  paddw      mm2,mm1     ; <(P17+P27)/2-T17 junk (P16+P15+P26+P25)/4-T15 junk ...>

								   psllw     mm5,8             ; <T06   0 T04   0 T02   0 T00   0>

								  psubw      mm5,mm3           ; <T06-(P07+P06+P17+P16)/4 junk ...>

								   pmaddwd   mm2,mm2           ; Square diffs for odd pels of line 1.

								  pmaddwd    mm5,mm5           ; Square diffs for even pels of line 0.

								   movq      mm0,mm4           ; <P27 P26 P25 P24 P23 P22 P21 P20>

								  lea        edi,[edi+ebp*2]   ; Advance target cursor.

								   lea       esi,[esi+ebp*2]   ; Advance reference cursor.

								  paddusw    mm6,mm2           ; Accumulate SSD for odd pels of line 1.

								   add       al,080H           ; Line-pair counter.  Carry out on every second pass.

								  movq       mm3,C0101010101010101  ; Reload multiplier for the next pass.

								   paddusw   mm6,mm5           ; Accumulate SSD for even pels of line 0.

								  punpckldq  mm4,mm6           ; Speculatively start to accum partial SWDs.

								   jnc       HalfPelMEForNext2LinesOfBlockBothWays  ; Twice, for half a block.


								  add        al,040H           ; Half-block counter.  Sign set after the second half.

								   paddusw   mm4,mm6            ; After whole block, SSD is in mm4[48:63].

								  psrlq      mm4,48

								   jns       HalfPelMEForNext2LinesOfBlockBothWays  ; Twice, for a whole block.


								  movdf      ebx,mm4           ; Return the SWD in ebx.

								  sub        al,082H           ; Remove counter bits and step block count by 2.

								   jg        HalfPelMBMEForLowerBlockBothWays  ; Iterate twice for 2 blocks.


								  lea        edi,[edi-PITCH*16+8]              ; Top of the column to the right.

								   lea       esi,[esi-PITCH*16+8]

								  mov        al,3                              ; mov keeps the flags from "sub al,082H".

								   je        HalfPelMBMEForUpperBlockBothWays  ; Iterate twice for macroblock.


								  sub        edi,16

								   sub       esi,16

								  ret


								StackOffset TEXTEQU <0>


								;============================================================================

								;  Register usage in the following internal function.  This function is also

								;  called to do frame differencing for chroma blocks.

								;

								;  ebp -- PITCH

								;  esi -- Address of reference block.

								;  edi -- Address of target block.

								;  edx -- Unavailable.  In use by caller.

								;  ecx -- Not in use.

								;  ebx -- Motion vectors for the block.  bl[0] indicates whether half-pel

								;         horizontal interpolation is required;  bh[0] same for vertical.

								;         This register is then used for scratch purposes.

								;  eax -- Unavailable.  In use by caller.

								;  mm0-mm5 -- Scratch

								;  mm6 -- 8 bytes of 0xFE

								;  mm7 -- 8 bytes of -1


								StackOffset TEXTEQU <4>


								; Entry point for non-OBMC differencing of one 8-wide block.  Builds the
								; constants mm7 (8 bytes of -1) and mm6 (8 bytes of 0xFE), then (H263 build
								; only) dispatches on bit 0 of bl (horizontal half-pel) and bit 0 of bh
								; (vertical half-pel).  The interpolating paths are entered by a jump, not
								; a call, so their final "ret" returns to this function's caller.
								; In the full-pel path handled here, the differences T - P for lines 0..3
								; are stored to PelDiffsLine0..3, and reference lines 4..7 are returned in
								; mm0..mm3 with mm1..mm3 doubled.
								; NOTE(review): the doubling mirrors the unfinished state in which the
								; interpolating helpers return mm1..mm3; presumably the caller finishes all
								; cases with the same pand/psrlq steps -- confirm against the caller.
								; NOTE(review): paddb doubling discards bit 7 of each byte, so this appears
								; to assume pels fit in 7 bits -- TODO confirm.
								DoNonOBMCDifferencing: ; Internal Function


								  pcmpeqb    mm7,mm7             ; mm7 = 8 bytes of -1.

								   pcmpeqb   mm6,mm6             ; mm6 = all ones; doubled below to 0xFE.

								IFDEF H261

								ELSE ;H263

								  shr        bl,1                ; CF = bl[0], the horizontal half-pel flag.


								   jc        NonOBMCDiff_Horz

								ENDIF


								  movq       mm1,[esi+ebp*1]     ; BC . . .  R0Dn

								   paddb     mm6,mm6             ; mm6 = 8 bytes of 0xFE.

								IFDEF H261

								ELSE ;H263

								  shr        bh,1                ; CF = bh[0], the vertical half-pel flag.

								   jc        NonOBMCDiff_Vert

								ENDIF


								  psubb      mm1,[edi+ebp*1]     ; P1 - T1

								   pxor      mm4,mm4

								  movq       mm0,[edi]           ; T0

								   psubb     mm4,mm1             ; D1 = T1 - P1

								  psubb      mm0,[esi]           ; D0 = T0 - P0

								  movq       mm2,[edi+ebp*2]     ; T2

								  movq       mm3,[edi+PITCH*3]   ; T3

								  psubb      mm2,[esi+ebp*2]     ; D2 = T2 - P2

								  psubb      mm3,[esi+PITCH*3]   ; D3 = T3 - P3

								  movq       PelDiffsLine0,mm0   ; Store D0.

								  movq       PelDiffsLine1,mm4   ; Store D1.

								  movq       PelDiffsLine2,mm2   ; Store D2.

								  movq       PelDiffsLine3,mm3   ; Store D3.

								  movq       mm3,[esi+PITCH*7]   ; P7

								  movq       mm2,[esi+PITCH*6]   ; P6

								   paddb     mm3,mm3             ; Double so that return will fix it.

								  movq       mm1,[esi+PITCH*5]   ; P5

								   paddb     mm2,mm2             ; Double so that return will fix it.

								  movq       mm0,[esi+ebp*4]     ; P4

								   paddb     mm1,mm1             ; Double so that return will fix it.

								  ret


								IFDEF H261

								ELSE ;H263

								; Vertical half-pel case, reached by a jump from DoNonOBMCDifferencing
								; with mm1 = reference line 1, mm6 = 8 bytes of 0xFE, mm7 = 8 bytes of -1.
								; Calls Get4LinesOfPred_InterpVert for lines 0..3, completes the steps that
								; helper leaves to its caller, stores T - P for lines 0..3 in
								; PelDiffsLine0..3, then sets up mm0/mm1/mm7 and falls through into
								; Get4MoreLinesOfPred_InterpVert, whose "ret" returns to the caller of
								; DoNonOBMCDifferencing with lines 4..7 in the helper's unfinished state.
								NonOBMCDiff_Vert:                ; 0123   Detail for 0


								  movq       mm0,[esi]           ; C. .   R0Up

								   psubb     mm1,mm7             ; DD .   R0Dn+1


								  call       Get4LinesOfPred_InterpVert


								  movq       mm5,[edi]           ; T0

								   psrlq     mm1,1               ;  O .

								  movq       mm7,[edi+ebp*1]     ; T1 (mm7 is scratch until restored below)

								   psubb     mm5,mm0             ; D0 = T0 - P0

								  movq       mm0,mm4             ; Ref line 4 becomes "Up" line for next 4 lines.

								   psubb     mm7,mm1             ; D1 = T1 - P1

								  movq       mm1,[edi+ebp*2]     ; T2

								   pand      mm2,mm6             ;  .N.

								  movq       mm4,[edi+PITCH*3]   ; T3

								  pand       mm3,mm6             ;  . N

								   psrlq     mm2,1               ;  .O.

								  movq       PelDiffsLine0,mm5   ; Store D0.

								   psubb     mm1,mm2             ; D2 = T2 - P2

								  movq       PelDiffsLine1,mm7   ; Store D1.

								   psrlq     mm3,1               ;  . O

								  movq       PelDiffsLine2,mm1   ; Store D2.

								   psubb     mm4,mm3             ; D3 = T3 - P3

								  movq       mm1,[esi+ebp*1]     ; BC . . .  R0Dn

								   pcmpeqb   mm7,mm7             ; Restore mm7 = 8 bytes of -1.

								  movq       PelDiffsLine3,mm4   ; Store D3.

								   psubb     mm1,mm7             ; DD . . .  R0Dn+1

								; jmp        Get4MoreLinesOfPred_InterpVert
								; (No jump is needed:  execution falls through to that label below.)


								;===========================================================================

								; Internal function to get 4 lines of prediction, interpolating in the

								; vertical direction.  The first 3 lines of the function are scheduled into

								; the caller's space, and so are commented out here.  For 8 lines of prediction,

								; a second call, to the second entry point, is called after consuming the

								; outputs of the first function call.  Certain registers must remain intact

								; to convey information from the first call to the second.

								;

								; ebp -- PITCH

								; edi -- Points to target block.

								; esi -- Points to Upper left corner of 8 column, 9 row block that will be

								;        interpolated vertically to generate prediction.

								; edx -- Reserved (MBlockActionStream)

								; ecx -- Not in use.

								; ebx -- Will be used.

								; eax -- Reserved.

								; mm6 -- 8 bytes of 0xFE.

								; mm7 -- 8 bytes of -1.

								; mm0-mm5 -- Scratch.


								StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>

								; Both entry labels mark the same address.  On entry mm0 holds the upper
								; reference line and mm1 the next line plus 1 in every byte (the caller has
								; executed the three commented-out instructions below).
								; On return:
								;   mm0 -- Finished prediction for line 0.
								;   mm1 -- Line 1 sum, already masked with mm6;  still needs psrlq 1.
								;   mm2 -- Line 2 sum;  still needs pand mm6 and psrlq 1.
								;   mm3 -- Line 3 sum;  still needs pand mm6 and psrlq 1.
								;   mm4 -- Raw reference line 4 (the "Up" line for the next four lines).
								;   esi -- Advanced by four lines.
								; Masking with 0xFE before the 64-bit shift clears bit 0 of every byte, so
								; no bit crosses from one byte into its neighbor.
								; NOTE(review): sums are formed with byte adds, so this appears to assume
								; pels small enough that two pels plus 1 fit in a byte -- TODO confirm.
								Get4LinesOfPred_InterpVert:      ; 0123   Details for line 0

								; movq       mm1,[esi+ebp*1]     ; BC .   R0Dn

								; movq       mm0,[esi]           ; C. .   R0Up

								;  psubb     mm1,mm7             ; DD .   R0Dn+1

								Get4MoreLinesOfPred_InterpVert:

								  movq       mm2,[esi+ebp*2]     ;  BC.

								   paddb     mm0,mm1             ; E. .   R0Up+R0Dn+1

								  movq       mm3,[esi+PITCH*3]   ;  .BC

								   paddb     mm1,mm2             ;  E .

								  movq       mm4,[esi+ebp*4]     ;  . BC

								   psubb     mm3,mm7             ;  .DD

								  paddb      mm2,mm3             ;  .E.

								   pand      mm0,mm6             ; F. .   Pre-clean

								  paddb      mm3,mm4             ;    E

								   pand      mm1,mm6             ;  F .

								  lea        esi,[esi+ebp*4]     ;       Advance to next four lines.

								   psrlq     mm0,1               ; G. .   P0 = (R0Up + R0Dn + 1) / 2

								; pand       mm2,mm6             ;   G.

								;  psrlq     mm1,1               ;  H .

								; pand       mm3,mm6             ;    G

								;  psrlq     mm2,1               ;   H.

								; psrlq      mm3,1               ;    H

								  ret

								StackOffset TEXTEQU <4>


								;===========================================================================


								; Horizontal half-pel case, reached by a jump from DoNonOBMCDifferencing
								; with mm6 and mm7 both all ones.  Finishes building mm6 = 0xFE, then tests
								; bh[0]:  if vertical interpolation is needed too, control goes to
								; NonOBMCDiff_Both.  Otherwise Get4LinesOfPred_InterpHorz is called for
								; lines 0..3, the steps it leaves to its caller are completed, T - P for
								; lines 0..3 is stored in PelDiffsLine0..3, and execution falls through
								; into Get4MoreLinesOfPred_InterpHorz, whose "ret" returns to the caller of
								; DoNonOBMCDifferencing with lines 4..7 in the helper's unfinished state.
								NonOBMCDiff_Horz:


								  movq       mm5,[esi+1]         ; A. .  <R08 R07 R06 R05 R04 R03 R02 R01>

								   paddb     mm6,mm6             ; . .      8 bytes of 0xFE

								  shr        bh,1                ; CF = bh[0], the vertical half-pel flag.

								   jc        NonOBMCDiff_Both


								; NOTE(review): nothing visible in this path reads mm7 after this load;
								; it may only serve to touch the target line early -- confirm.
								  movq       mm7,[edi+PITCH*3]   ; T3


								  call       Get4LinesOfPred_InterpHorz


								  movq       mm4,[edi]           ; T0

								   psrlq     mm1,1               ;  O .

								  movq       mm5,[edi+ebp*1]     ; T1

								   psubb     mm4,mm0             ; D0 = T0 - P0

								  movq       mm0,[edi+ebp*2]     ; T2

								   psubb     mm5,mm1             ; D1 = T1 - P1

								  movq       mm1,[edi+PITCH*3]   ; T3

								   pand      mm2,mm6             ;  .N.

								  pand       mm3,mm6             ;  . N

								   psrlq     mm2,1               ;  .O.

								  movq       PelDiffsLine0,mm4   ; Store D0.

								   psubb     mm0,mm2             ; D2 = T2 - P2

								  movq       PelDiffsLine1,mm5   ; Store D1.

								   psrlq     mm3,1               ;  . O

								  movq       PelDiffsLine2,mm0   ; Store D2.

								   psubb     mm1,mm3             ; D3 = T3 - P3

								  movq       mm5,[esi+1]         ; <R48 R47 R46 R45 R44 R43 R42 R41>

								   ;

								  movq       PelDiffsLine3,mm1   ; Store D3.

								   ;


								;===========================================================================

								; Internal function to get 4 lines of prediction, interpolating in the

								; horizontal direction.  The first line of the function is scheduled into

								; the caller's space, and so are commented out here.  For 8 lines of prediction,

								; a second call, to the second entry point, is called after consuming the

								; outputs of the first function call.  Certain registers must remain intact

								; to convey information from the first call to the second.

								;

								; ebp -- PITCH

								; edi -- Points to target block.

								; esi -- Points to Upper left corner of 9 column, 8 row block that will be

								;        interpolated horizontally to generate prediction.

								; edx -- Reserved (MBlockActionStream)

								; ecx -- Not in use.

								; ebx -- Will be used.

								; eax -- Reserved.

								; mm6 -- 8 bytes of 0xFE.

								; mm0-mm5 -- Will be used.


								StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>

								; Both entry labels mark the same address.  On entry mm5 holds the eight
								; bytes at [esi+1] (the caller has executed the commented-out load below).
								; For each of four lines, the eight bytes at column 1 are added to a copy
								; of themselves shifted up one byte, then a Pel_Rnd entry selected by the
								; byte at column 0 is added (per the step-F comments, this supplies the
								; missing R00 term plus the rounding 1s).
								; On return:
								;   mm0 -- Finished prediction for line 0.
								;   mm1 -- Line 1 sum, already masked with mm6;  still needs psrlq 1.
								;   mm2 -- Line 2 sum;  still needs pand mm6 and psrlq 1.
								;   mm3 -- Line 3 sum;  still needs pand mm6 and psrlq 1.
								;   esi -- Advanced by four lines.
								;   ebx, mm4, mm5 -- Overwritten.
								; NOTE(review): sums are formed with byte adds, so this appears to assume
								; pels small enough that two pels plus 1 fit in a byte -- TODO confirm.
								Get4LinesOfPred_InterpHorz:

								Get4MoreLinesOfPred_InterpHorz:


								; movq       mm5,[esi+1]         ; A. .  <R08 R07 R06 R05 R04 R03 R02 R01>

								  xor        ebx,ebx             ;  . .  Clear upper bits;  bl is used as a table index.

								   movq      mm0,mm5             ; B. .  <R08 R07 R06 R05 R04 R03 R02 R01>

								  mov        bl,[esi]            ; C. .  R00

								   psllq     mm5,8               ; D. .  <R07 R06 R05 R04 R03 R02 R01   0>

								  movq       mm1,[esi+ebp*1+1]   ;  A .

								   paddb     mm0,mm5             ; E. .  <R08+R07   ... R02+R01   R01      >

								  paddb      mm0,Pel_Rnd[ebx*8]  ; F. .  <R08+R07+1 ... R02+R01+1 R01+R00+1>

								   movq      mm4,mm1             ;  B .

								  mov        bl,[esi+ebp*1]      ;  C .

								   psllq     mm4,8               ;  D .

								  movq       mm2,[esi+ebp*2+1]   ;   A.

								   paddb     mm1,mm4             ;  E .

								  paddb      mm1,Pel_Rnd[ebx*8]  ;  F .

								   movq      mm5,mm2             ;   B.

								  mov        bl,[esi+ebp*2]      ;   C.

								   psllq     mm5,8               ;   D.

								  movq       mm3,[esi+PITCH*3+1] ;    A

								   paddb     mm2,mm5             ;   E.

								  paddb      mm2,Pel_Rnd[ebx*8]  ;   F.

								   movq      mm4,mm3             ;    B

								  mov        bl,[esi+PITCH*3]    ;    C

								   psllq     mm4,8               ;    D

								  paddb      mm3,mm4             ;    E

								   pand      mm0,mm6             ; G. .  pre-cleaned

								  paddb      mm3,Pel_Rnd[ebx*8]  ;    F

								   psrlq     mm0,1               ; H. .  P0=<(R08+R07+1)/2 ... (R01+R00+1)/2>

								  lea        esi,[esi+ebp*4]     ;       Advance to next four lines.

								   pand      mm1,mm6             ;  G .

								; pand       mm2,mm6             ;   G.

								;  psrlq     mm1,1               ;  H .

								; pand       mm3,mm6             ;    G

								;  psrlq     mm2,1               ;   H.

								; psrlq      mm3,1               ;    H

								  ret

								StackOffset TEXTEQU <4>


								; The steps commented out above are scheduled into the mem-ops the caller has

								; to do at the point of return.  As though these ops were done, the registers

								; look as follows:

								;  mm0 -- Prediction for line 0.

								;  mm1 -- Prediction for line 1.

								;  mm2 -- Prediction for line 2.

								;  mm3 -- Prediction for line 3.

								;  mm6 -- 8 bytes of 0xFE.  Must be this when computing pred for next 4 lines.

								;=============================================================================


								; Half-pel interpolation in both directions, reached by a jump from
								; NonOBMCDiff_Horz with mm5 = the eight bytes at [esi+1] and mm6 = 8 bytes
								; of 0xFE.  Calls Get4LinesOfPred_InterpBoth for lines 0..3, completes the
								; steps it leaves to its caller, stores T - P for lines 0..3 in
								; PelDiffsLine0..3, prepares mm0/mm5/mm7 for the next four lines, and jumps
								; to Get4MoreLinesOfPred_InterpBoth, whose "ret" returns to the caller of
								; DoNonOBMCDifferencing with lines 4..7 in the helper's unfinished state.
								NonOBMCDiff_Both:


								  call       Get4LinesOfPred_InterpBoth


								  movq       mm7,[edi]           ; T0

								   psrlq     mm1,1               ;  O .

								  psubb      mm7,mm0             ; D0 = T0 - P0

								   pand      mm2,mm6             ;  .N.

								  movq       mm0,[edi+ebp*1]     ; T1

								   psrlq     mm2,1               ;  .O.

								  movq       PelDiffsLine0,mm7   ; Store D0.

								   psubb     mm0,mm1             ; D1 = T1 - P1

								  movq       mm7,[edi+ebp*2]     ; T2

								   pand      mm3,mm6             ;  . N

								  movq       PelDiffsLine1,mm0   ; Store D1.

								   psrlq     mm3,1               ;  . O

								  movq       mm1,[edi+PITCH*3]   ; T3

								   psubb     mm7,mm2             ; D2 = T2 - P2

								  psubb      mm1,mm3             ; D3 = T3 - P3

								   movq      mm0,mm4             ; Carry mm4 over to mm0 for the next 4 lines.

								  movq       PelDiffsLine2,mm7   ; Store D2.

								   paddb     mm5,mm5             ;  . .  Prepare for use for next 4 lines.

								  movq       PelDiffsLine3,mm1   ; Store D3.

								   pcmpeqb   mm7,mm7             ; Restore mm7 = 8 bytes of -1.

								  jmp        Get4MoreLinesOfPred_InterpBoth


								;===========================================================================

								; Internal function to get 4 lines of prediction, interpolating in both

								; directions.  The first line of the function is scheduled into the

								; caller's space, and so are commented out here.  For 8 lines of prediction,

								; a second call, to the second entry point, is called after consuming the

								; outputs of the first function call.  Certain registers must remain intact

								; to convey information from the first call to the second.

								;

								; ebp -- PITCH

								; edi -- Points to target block.

								; esi -- Points to Upper left corner of 9*9 block that will be interpolated

								;        horizontally and vertically to generate prediction.

								; edx -- Reserved (MBlockActionStream)

								; ecx -- Not in use

								; ebx -- Will be used.

								; eax -- Reserved.

								; mm6 -- 8 bytes of 0xFE.

								; mm7 -- 8 bytes of -1.

								; mm0-mm5 -- Scratch


								StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>

								; First entry:  mm5 holds the eight bytes at [esi+1].  The horizontal sum
								; for line 0 is formed in mm5, its low ("frac") bit per byte is split off
								; into mm0, and mm5 is masked with mm6.
								; Second entry (Get4MoreLinesOfPred_InterpBoth):  mm0 and mm5 must already
								; be in that state, with mm5 still unshifted, because the second
								; instruction below performs the psrlq.
								; Each line's horizontal sum is halved and added to the halved sum of the
								; line below;  the two frac bits are ANDed and added in before the final
								; halving (steps K through O in the column comments).
								; On return:
								;   mm0 -- Finished prediction for line 0.
								;   mm1 -- Line 1 value, already masked with mm6;  still needs psrlq 1.
								;   mm2 -- Line 2 value;  still needs pand mm6 and psrlq 1.
								;   mm3 -- Line 3 value;  still needs pand mm6 and psrlq 1.
								;   mm4 -- Frac bits of the line 4 horizontal sum.
								;   mm5 -- Line 4 horizontal sum, masked and already shifted right by 1.
								;   esi -- Advanced by four lines.
								;   ebx -- Overwritten.
								; mm7 is not referenced anywhere in this function.
								Get4LinesOfPred_InterpBoth:      ; 01234 Details for line 0


								; movq       mm5,[esi+1]         ; A. .  <R08 R07 R06 R05 R04 R03 R02 R01>

								  movq       mm1,mm5             ; B. .  <R08 R07 R06 R05 R04 R03 R02 R01>

								   xor       ebx,ebx             ;  . .

								  mov        bl,[esi]            ; C. .  R00

								   psllq     mm5,8               ; D. .  <R07 R06 R05 R04 R03 R02 R01   0>

								  paddb      mm5,mm1             ; E. .  <R08+R07 ... R02+R01 R01>

								  paddb      mm5,Pel_Rnd[ebx*8]  ; F. .  <R08+R07+1 ... R02+R01+1 R01+R00+1>

								   movq      mm0,mm6             ; G. .  Mask to extract each pel's frac bit.

								  pandn      mm0,mm5             ; H. .  <(R08+R07+1)&1 ...>

								   pand      mm5,mm6             ; I. .  Pre-clean

								Get4MoreLinesOfPred_InterpBoth:  ;  . .

								  movq       mm2,[esi+ebp*1+1]   ;  A .

								   psrlq     mm5,1               ; J. .  <(R08+R07+1)/2 ... (R01+R00+1)/2)>

								  xor        ebx,ebx             ;  . .

								   movq      mm1,mm2             ;  B .

								  mov        bl,[esi+ebp*1]      ;  C .

								   psllq     mm2,8               ;  D .

								  movq       mm3,[esi+ebp*2+1]   ;  .A.

								   paddb     mm2,mm1             ;  E .

								  paddb      mm2,Pel_Rnd[ebx*8]  ;  F .

								   movq      mm1,mm3             ;  .B.

								  mov        bl,[esi+ebp*2]      ;  .C.

								   psllq     mm3,8               ;  .D.

								  movq       mm4,[esi+PITCH*3+1] ;  . A

								   paddb     mm3,mm1             ;  .E.

								  paddb      mm3,Pel_Rnd[ebx*8]  ;  .F.

								   movq      mm1,mm4             ;  . B

								  mov        bl,[esi+PITCH*3]    ;  . C

								   pand      mm0,mm2             ; K. .  <(R08+R07+1)&(R18+R17+1)&1 ...>

								  paddb      mm0,mm5             ; L. .  <(R08+R07+1+((R18+R17+1)&1))/2 ...>

								   psllq     mm4,8               ;  . D

								  movq       mm5,[esi+ebp*4+1]   ;  . .A

								   paddb     mm4,mm1             ;  . E

								  paddb      mm4,Pel_Rnd[ebx*8]  ;  . F

								   movq      mm1,mm5             ;  . .B

								  mov        bl,[esi+ebp*4]      ;  . .C

								   psllq     mm5,8               ;  . .D

								  paddb      mm5,mm1             ;  . .E

								   movq      mm1,mm6             ;  G .

								  pandn      mm1,mm2             ;  H .

								   pand      mm2,mm6             ;  I .

								  paddb      mm5,Pel_Rnd[ebx*8]  ;  . .F

								   psrlq     mm2,1               ;  J .

								  paddb      mm0,mm2             ; M. .  <(R08+R07+R18+R17+2)/2 ...>

								   pand      mm1,mm3             ;  K .

								  paddb      mm1,mm2             ;  L .

								   movq      mm2,mm6             ;  .G.

								  pandn      mm2,mm3             ;  .H.

								   pand      mm3,mm6             ;  .I.

								  pand       mm0,mm6             ; N. .  Pre-clean

								   psrlq     mm3,1               ;  .J.

								  paddb      mm1,mm3             ;  M .

								   pand      mm2,mm4             ;  .K.

								  paddb      mm2,mm3             ;  .L.

								   movq      mm3,mm6             ;  . G

								  pandn      mm3,mm4             ;  . H

								   pand      mm4,mm6             ;  . I

								  pand       mm3,mm5             ;  . K

								   psrlq     mm4,1               ;  . J

								  paddb      mm2,mm4             ;  .M.

								   paddb     mm3,mm4             ;  . L

								  movq       mm4,mm6             ;  . .G

								   psrlq     mm0,1               ; O. .  P0 = <(R08+R07+R18+R17+2)/4 ...>

								  pandn      mm4,mm5             ;  . .H

								   pand      mm5,mm6             ;  . .I

								  pand       mm1,mm6             ;  N .

								   psrlq     mm5,1               ;  . .J

								  paddb      mm3,mm5             ;  . M

								   lea       esi,[esi+ebp*4]     ;       Advance to next four lines.

								; pand       mm2,mm6             ;  .N.

								;  psrlq     mm1,1               ;  O .

								; pand       mm3,mm6             ;  . N

								;  psrlq     mm2,1               ;  .O.

								; paddb      mm5,mm5             ;  . .  Prepare for use for next 4 lines.

								;  psrlq     mm3,1               ;  . O

								  ret

								StackOffset TEXTEQU <4>


								; The steps commented out above are scheduled into the mem-ops the caller has

								; to do at the point of return.  As though these ops were done, the registers

								; look as follows:

								;  mm0 -- Prediction for line 0.

								;  mm1 -- Prediction for line 1.

								;  mm2 -- Prediction for line 2.

								;  mm3 -- Prediction for line 3.

								;  mm4 -- Must be moved to mm0 before computing prediction for next 4 lines.

								;  mm5 -- Must be doubled before computing prediction for next 4 lines.

								;  mm6 -- 8 bytes of 0xFE.  Must be this when computing pred for next 4 lines.

								;  mm7 -- Not referenced by this function (listed above as 8 bytes of -1).

								;=============================================================================

								ENDIF


								StackOffset TEXTEQU <0>


								IFDEF H261

								ELSE ;H263

								; Marks the current macroblock as having OBMC pending (PendingOBMC = 1).
								; If PendingOBMC was already non-zero on entry, DoPendingOBMCDiff is called
								; first, with StashBlockType set to INTER1MV.  After that, when
								; IsPlainPFrame is zero, MMxDoBFrameLumaBlocks is called with edx
								; temporarily moved back by one T_MacroBlockActionDescr.  Every path ends
								; at NextMacroBlock.
								OBMCDifferencing:


								  mov        al,PendingOBMC           ; Do OBMC for previous block, if needed..

								   mov       bl,1

								  test       al,al                    ; Flags stay valid across the two MOVs below.

								   mov       PendingOBMC,bl           ; Flag this macroblock as pending.

								  mov        cl,INTER1MV

								   je        NextMacroBlock           ; Nothing was pending on entry.


								  mov        StashBlockType,cl


								  call       DoPendingOBMCDiff


								  mov        al,IsPlainPFrame

								  test       al,al

								   jne       NextMacroBlock           ; IsPlainPFrame set:  skip B-frame work.


								  add        edx,-SIZEOF T_MacroBlockActionDescr  ; Step back one descriptor.

								  movq       mm6,C0101010101010101

								   pxor      mm7,mm7                      ; Initialize SWD accumulator


								  call       MMxDoBFrameLumaBlocks


								  sub        edx,-SIZEOF T_MacroBlockActionDescr  ; Restore edx.

								   jmp       NextMacroBlock


								ENDIF


								;============================================================================

								; Calculate the IntraSWD

								;

								;  ebp -- PITCH

								;  esi -- Accumulation for IntraSWD

								;  edi -- Address of target macroblock.

								;  edx -- MBlockActionStream

								;  ecx -- Scratch

								;  ebx -- Amount IntraSWD has to be less than to be the winner.

								;  eax -- Reserved.  Holds coded blk pattern, (except undef when IntraByDecree).

								;  mm7 -- SWD total for macroblock.

								;  mm6 -- Average pel value for block 1.

								;  mm5 -- Average pel value for block 2.

								;  mm4 -- Average pel value for block 3.

								;  mm3 -- Average pel value for block 4.

								;  mm0-mm2 Scratch

								;


								; IntraByDecree enters with ebx forced to 80000H;  CalculateIntraSWD enters
								; with ebx supplied by the caller.  INTRACODINGDIFFERENTIAL is subtracted
								; from ebx either way.
								;
								; For each 8x8 luma block of the macroblock at edi the loop below:
								;   1. Sums 32 pels (even columns of even lines, odd columns of odd lines)
								;      and scales the sum by 8, i.e. 256 * (sum / 32):  the block average
								;      with 8 fraction bits, replicated into four words.
								;   2. Subtracts that average from the other 32 pels (odd columns of even
								;      lines, even columns of odd lines), each placed in the high byte of
								;      a word, squares the differences with pmaddwd, and accumulates them
								;      with saturating word adds.  The block result is the top word of mm0.
								;   3. Adds the block result to esi and subtracts it from ebx.  As soon as
								;      ebx is <= 0 control leaves for InterBestX.
								; Blocks are visited in the order 1, 4, 2, 3;  cl identifies the block
								; being processed.  If ebx is still positive after block 3, execution
								; falls out of the bottom of this code.
								;
								; The "cmp cl,2" flags are consumed by "jg" and later "je";  only MMX
								; instructions, LEA and MOV sit in between, and none of them alter flags.
								; NOTE(review): movdf is a macro defined elsewhere;  it is used here as a
								; move of the low 32 bits of an MMX register into an integer register.
								; NOTE(review): the low-byte sums in step 1 are masked to 8 bits, so this
								; appears to assume two pels sum to less than 256 -- TODO confirm.
								IntraByDecree:


								  mov        ebx,000080000H           ; Set Inter SWD artificially high.


								CalculateIntraSWD:


								  sub        ebx,INTRACODINGDIFFERENTIAL

								   mov       cl,1                   ; Start with block 1.

								  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>

								   pcmpeqb   mm5,mm5                ; All ones;  becomes the 00FF word mask below.


								ComputeIntraSWDForNextBlock:


								  movq       mm2,[edi+ebp*2]        ; <P27 P26 P25 P24 P23 P22 P21 P20>

								   psrlw     mm5,8                  ; 4 words of 0x00FF

								  movq       mm4,[edi+ebp*4]        ; Line 4

								   paddw     mm0,mm2                ; <junk P06+P26 junk P04+P24 ...>

								  movq       mm6,[edi+PITCH*6]      ; Line 6

								   pand      mm0,mm5                ; <P06+P26 P04+P24 P02+P22 P00+P20>

								  movq       mm1,[edi+ebp*1]        ; <P17 P16 P15 P14 P13 P12 P11 P10>

								   paddw     mm4,mm6                ; Line 4 + line 6

								  movq       mm3,[edi+PITCH*3]      ; <P37 P36 P35 P34 P33 P32 P31 P30>

								   pand      mm4,mm5                ; Keep the even-column sums of lines 4 and 6.

								  movq       mm5,[edi+PITCH*5]      ; Line 5

								   paddw     mm1,mm3                ; <P17+P37 junk P15+P35 junk ...>

								  movq       mm7,[edi+PITCH*7]      ; Line 7

								   psrlw     mm1,8                  ; <P17+P37 P15+P35 P13+P33 P11+P31>

								  paddw      mm0,mm1

								   paddw     mm5,mm7                ; Line 5 + line 7

								  paddw      mm0,mm4

								   psrlw     mm5,8                  ; Keep the odd-column sums of lines 5 and 7.

								  paddw      mm0,mm5                ; Four words, each the sum of 8 pels.

								   pcmpeqw   mm5,mm5                ; Get words of -1

								  movq       mm4,[edi+ebp*4]        ; Reload line 4.

								   pmaddwd   mm0,mm5                ; <SumHi = Sum3+Sum2 | SumLo = Sum1+Sum0>

								  pcmpeqw    mm1,mm1

								   psllw     mm3,8                  ; <P36   0 P34   0 P32  0 P30  0>

								  movq       mm5,[edi+PITCH*5]      ; Reload line 5.

								   psllw     mm1,3                  ; 4 words of 0xFFF8

								  packssdw   mm0,mm0                ; <SumHi | SumLo | SumHi | SumLo>

								   mov       al,[edx].CodedBlocks   ; Fetch coded block pattern.

								  pmaddwd    mm0,mm1                ; <Sum = SumHi+SumLo | Sum = SumHi+SumLo>

								   psllw     mm5,8                  ; Even pels of line 5 to the high bytes.

								  movq       mm1,[edi+ebp*1]        ; Reload line 1.

								   psllw     mm7,8                  ; Even pels of line 7 to the high bytes.

								  ;

								   psllw     mm1,8                  ; Even pels of line 1 to the high bytes.

								  ;

								   packssdw  mm0,mm0                ; <Sum | Sum | Sum | Sum>

								  psubw      mm1,mm0                ; <P16-Avg frac P14-Avg frac ...>

								   psubw     mm2,mm0                ; <P27-Avg frac P25-Avg frac ...>

								  pmaddwd    mm1,mm1                ; Square of diff

								   psubw     mm3,mm0

								  pmaddwd    mm2,mm2

								   psubw     mm4,mm0

								  pmaddwd    mm3,mm3

								   psubw     mm5,mm0

								  pmaddwd    mm4,mm4

								   psubw     mm6,mm0

								  psubw      mm7,mm0

								   paddusw   mm1,mm2                ; Accumulate squares in mm1 (saturating).

								  psubw      mm0,[edi]              ; Avg - line 0;  the sign is lost when squared.

								   pmaddwd   mm5,mm5

								  pmaddwd    mm6,mm6

								   paddusw   mm1,mm3

								  pmaddwd    mm7,mm7

								   paddusw   mm1,mm4

								  pmaddwd    mm0,mm0

								   paddusw   mm1,mm5

								  paddusw    mm1,mm6

								   cmp       cl,2                   ; Flags consumed by the jg and je below.

								  paddusw    mm1,mm7

								   ;

								  paddusw    mm0,mm1                ; Totals for both dword halves.

								   ;

								  punpckldq  mm1,mm0                ; Low dword of mm0 moved to the high dword of mm1.

								   ;

								  paddusw    mm0,mm1                ; High dword of mm0 = sum of both halves.

								   jg        LowerBlkIntraDone      ; cl was 3 or 4.


								  psrlq      mm0,48                 ; Block result = top word of mm0.

								   lea       edi,[edi+ebp*8+8]   ; Speculate going from blk 1 to blk 4

								  mov        cl,4

								   je        Blk2IntraDone          ; cl was 2.


								Blk1IntraDone:


								  movdf      esi,mm0                ; Start the IntraSWD accumulation.

								  sub        ebx,esi

								   jle       InterBestX


								  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>

								   pcmpeqb   mm5,mm5

								  jmp        ComputeIntraSWDForNextBlock


								LowerBlkIntraDone:


								  psrlq      mm0,48                 ; Block result = top word of mm0.

								   sub       edi,PITCH*8         ; Speculate going from blk 4 to blk 2

								  cmp        cl,3

								   je        Blk3IntraDone


								Blk4IntraDone:


								  movdf      ecx,mm0

								  add        esi,ecx             ; Accumulate IntraSWD

								   sub       ebx,ecx

								  jle        InterBestX


								  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>

								   pcmpeqb   mm5,mm5

								  mov        cl,2

								   jmp       ComputeIntraSWDForNextBlock


								Blk2IntraDone:


								  movdf      ecx,mm0

								  add        esi,ecx             ; Accumulate IntraSWD

								   sub       edi,16              ; Get to blk 3.

								  sub        ebx,ecx             ; Sets the flags tested by jle.

								   jle       InterBestX


								  movq       mm0,[edi]              ; <P07 P06 P05 P04 P03 P02 P01 P00>

								   pcmpeqb   mm5,mm5

								  mov        cl,3

								   jmp       ComputeIntraSWDForNextBlock


								Blk3IntraDone:


								  movdf      ecx,mm0

								  add        esi,ecx             ; Accumulate IntraSWD

								   sub       ebx,ecx

								  jle        InterBestX


								; Intra coding was chosen for this macroblock.  On entry esi holds the
								; accumulated IntraSWD, al the coded block pattern, edx the macroblock
								; action descriptor.  The code:
								;   * records esi in [edx].SWD and adds it to SWDTotal;
								;   * sets BlockType = INTRA, zeroes the four luma MVs, and stores the
								;     coded block pattern with bit 7 cleared;
								;   * (H263 build) zeroes BestFullPelMBHMV / BestFullPelMBVMV and, when
								;     PendingOBMC is 1, clears it and finishes the previous macroblock
								;     via DoPendingOBMCDiff (plus MMxDoBFrameLumaBlocks when
								;     IsPlainPFrame is zero);
								;   * calls MMxDoForwardDCT once per luma block, with esi at offsets
								;     0, 8, PITCH*8 and PITCH*8+8 from TargetMacroBlockBaseAddr;  after
								;     call k the value bl shifted left by k-1 is subtracted from
								;     CodedBlocks;
								;   * (H263 build) calls MMxDoBFrameLumaBlocks when IsPlainPFrame is zero;
								;   * continues at NextMacroBlock.
								; NOTE(review): bl appears to be a 0/1 result returned by MMxDoForwardDCT
								; that removes a block's bit from the pattern -- confirm in that routine.
								IntraBest:


								  mov        ecx,SWDTotal

								   and       al,07FH                   ; Turn off FORCE-INTRA bit.

								  mov        [edx].SWD,esi

								   add       ecx,esi                   ; Add to total.

								  mov        SWDTotal,ecx

								   mov       cl,INTRA

								  mov        [edx].BlockType,cl        ; Indicate macroblock handling decision.

								   xor       ecx,ecx                   ; ecx = 0 (so cl = 0 for the stores below).

								  mov        [edx].BlkY1.MVs,ecx

								   mov       [edx].BlkY2.MVs,ecx

								  mov        [edx].BlkY3.MVs,ecx

								   mov       [edx].BlkY4.MVs,ecx

								  mov        [edx].CodedBlocks,al


								IFDEF H261

								ELSE ;H263

								   mov       al,PendingOBMC            ; Do Prev MB if it needs to be OBMC'ed.

								  mov        [edx].BestFullPelMBHMV,cl ; Kill MVs so extended EMV of other

								  ;                                    ; blocks will work right.

								   dec       al                        ; ZF set only if PendingOBMC was 1.

								  mov        [edx].BestFullPelMBVMV,cl ; MOV leaves the flags from dec intact.

								   jne       @f


								  mov        PendingOBMC,al            ; Go on to next MB, unless the prev MB

								  ;                                    ; needs to be finished (OBMC).

								   mov       cl,INTER1MV

								  mov        StashBlockType,cl


								  call       DoPendingOBMCDiff


								  mov        al,IsPlainPFrame

								  test       al,al

								   jne       @f


								  add        edx,-SIZEOF T_MacroBlockActionDescr  ; Step back one descriptor.

								  movq       mm6,C0101010101010101

								   pxor      mm7,mm7                      ; Initialize SWD accumulator


								  call       MMxDoBFrameLumaBlocks


								  sub        edx,-SIZEOF T_MacroBlockActionDescr  ; Restore edx.


								@@:


								ENDIF


								  mov        cl,INTRA

								   mov       esi,TargetMacroBlockBaseAddr ; Block 1.

								  mov        StashBlockType,cl

								   push      eax                   ; Adjust stack pointer

								StackOffset TEXTEQU <4>

								  call       MMxDoForwardDCT

								  mov        al,[edx].CodedBlocks

								   mov       esi,TargetMacroBlockBaseAddr

								  sub        al,bl                 ; Subtract bl from the coded block pattern.

								   add       esi,8                 ; Block 2.

								  mov        [edx].CodedBlocks,al

								  call       MMxDoForwardDCT

								  shl        bl,1                  ; Move bl up to bit 1.

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   mov       esi,TargetMacroBlockBaseAddr

								  mov        [edx].CodedBlocks,al

								   add       esi,PITCH*8           ; Block 3.

								  call       MMxDoForwardDCT

								  shl        bl,2                  ; Move bl up to bit 2.

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   mov       esi,TargetMacroBlockBaseAddr

								  mov        [edx].CodedBlocks,al

								   add       esi,PITCH*8+8         ; Block 4.

								  call       MMxDoForwardDCT

								  shl        bl,3                  ; Move bl up to bit 3.

								   mov       al,[edx].CodedBlocks

								  sub        al,bl

								   pop       edi                   ; Adjust stack pointer

								StackOffset TEXTEQU <0>

								  mov        [edx].CodedBlocks,al

								IFDEF H261

								ELSE

								   mov       al,IsPlainPFrame

								  test       al,al

								   jne       NextMacroBlock


								  movq       mm6,C0101010101010101

								   pxor      mm7,mm7                      ; Initialize SWD accumulator


								  call       MMxDoBFrameLumaBlocks

								ENDIF


								  jmp        NextMacroBlock


								IFDEF H261

								ELSE; H263

								StackOffset TEXTEQU <4>

								DoPendingOBMCDiff: ; Internal function


								;============================================================================

								; Perform differencing for the non-empty luma blocks of an Inter-coded

								; macroblock.  This is the OBMC case;  i.e. Advanced Prediction is selected.


								PrevMBAD EQU [edx-SIZEOF T_MacroBlockActionDescr]


								  pcmpeqb    mm6,mm6

								   pcmpeqb   mm7,mm7                    ; 8 bytes of -1

								  paddb      mm6,mm6                    ; 8 bytes of 0xFE

								   mov       al,PrevMBAD.CodedBlocks    ; Bits  0- 3  set for non-empty Y blks.

								  test       al,1                       ; Check if block 1 empty.

								   je        OBMCDoneForBlock1


								  xor        ebx,ebx

								   mov       eax,SIZEOF T_Blk           ; Blk to right is blk 2 of this MB.

								  mov        bl,PrevMBAD.MBEdgeType

								   mov       ecx,1                      ; Mask to extract left edge indicator.

								  and        ecx,ebx                    ; Extract left edge indicator.

								   and       ebx,4                      ; Extract top edge indicator.

								  mov        esi,PrevMBAD.BlkY1.MVs

								   lea       edi,[eax*2]                ; Blk below is blk 3 of this MB.

								  mov        DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.

								   mov       edi,BlockAbove[ebx]        ; Blk above is blk 3 of mb above, or off

								   ;                                    ; upper edge.

								  mov        ecx,BlockToLeft[ecx*4]     ; Blk to left is blk 2 of mb to the

								  ;                                     ; left, or off left edge.

								   mov       DistToBADforBlockAbove,edi

								  call       DoOBMCForBlock

								  mov        al,PrevMBAD.CodedBlocks    ; Bits  0- 3  set for non-empty Y blks.

								  sub        al,bl

								  mov        PrevMBAD.CodedBlocks,al


								OBMCDoneForBlock1:


								   add       edx,SIZEOF T_Blk

								  test       al,2                       ; Check if block 2 empty.

								   je        OBMCDoneForBlock2


								  xor        ebx,ebx

								   mov       eax,2                      ; Mask to extract right edge indicator.

								  mov        bl,PrevMBAD[-SIZEOF T_Blk].MBEdgeType

								   mov       edi,2*SIZEOF T_Blk         ; Blk below is blk 4 of this MB.

								  and        eax,ebx                    ; Extract right edge indicator.

								   and       ebx,4                      ; Extract top edge indicator.

								  mov        DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.

								   lea       ecx,[edi-3*SIZEOF T_Blk]   ; Blk to left is blk 1 of this MB.

								  mov        eax,BlockToRight[eax*2]    ; Blk to right is blk 1 of mb to the

								  ;                                     ; right, or off right edge.

								   mov       edi,BlockAbove[ebx]        ; Blk above is blk 4 of mb above, or off

								   ;                                    ; upper edge.

								  mov        esi,PrevMBAD.BlkY1.MVs

								   mov       DistToBADforBlockAbove,edi

								  call       DoOBMCForBlock

								  shl        bl,1

								   mov       al,PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks

								  sub        al,bl

								  mov        PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks,al


								OBMCDoneForBlock2:

								;  Luma block 3 of this macroblock.
								;  Advance edx to block 3's descriptor and skip the block if bit 2 of al is
								;  clear.  Block 3 is in the bottom row, so the "below" neighbour is the
								;  block itself (offset 0) and the "above" neighbour is block 1 of the same
								;  macroblock (-2*SIZEOF T_Blk).  The right neighbour is block 4 of this
								;  macroblock; the left neighbour comes from the BlockToLeft table indexed
								;  by the left-edge bit of MBEdgeType.
								;  NOTE(review): bl after the call is presumably a 0/1 flag from the DCT
								;  path; it is shifted to bit 2 and subtracted from CodedBlocks -- confirm.

								   add       edx,SIZEOF T_Blk

								  test       al,4                       ; Check if block 3 empty.

								   je        OBMCDoneForBlock3


								  xor        ecx,ecx

								   xor       ebx,ebx                    ; Blk below is this block.

								  mov        cl,PrevMBAD[-2*SIZEOF T_Blk].MBEdgeType

								   mov       eax,SIZEOF T_Blk           ; Blk to right is blk 4 of this MB.

								  and        ecx,1                      ; Extract left edge indicator.

								   mov       DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.

								  lea        edi,[eax-3*SIZEOF T_Blk]   ; Blk above is blk 1 of this MB.

								   mov       esi,PrevMBAD.BlkY1.MVs

								  mov        DistToBADforBlockAbove,edi

								   mov       ecx,BlockToLeft[ecx*4]     ; Blk to left is blk 4 of mb to the

								  ;                                     ; left (same table offset as used for
								  ;                                     ; blk 1, applied from blk 3), or off
								  ;                                     ; left edge.

								  call       DoOBMCForBlock

								  shl        bl,2                       ; Move result flag to block 3's bit.

								   mov       al,PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks

								  sub        al,bl

								  mov        PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks,al


								OBMCDoneForBlock3:

								;  Luma block 4 of this macroblock.
								;  Advance edx to block 4's descriptor and skip the block if bit 3 of al is
								;  clear.  Block 4 is in the bottom row, so the "below" neighbour is the
								;  block itself (offset 0); the "above" neighbour is block 2 of the same
								;  macroblock (-2*SIZEOF T_Blk) and the left neighbour is block 3
								;  (-SIZEOF T_Blk).  The right neighbour comes from the BlockToRight table
								;  indexed by the right-edge bit of MBEdgeType.
								;  NOTE(review): bl after the call is presumably a 0/1 flag from the DCT
								;  path; it is shifted to bit 3 and subtracted from CodedBlocks -- confirm.

								   add       edx,SIZEOF T_Blk

								  test       al,8                       ; Check if block 4 empty.

								   je        OBMCDoneForBlock4


								  xor        eax,eax

								   xor       ebx,ebx                    ; Blk below is this block.

								  mov        al,PrevMBAD[-3*SIZEOF T_Blk].MBEdgeType

								   mov       ecx,-SIZEOF T_Blk          ; Blk to left is blk 3 of this MB.

								  and        eax,2                      ; Extract right edge indicator.

								   mov       DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.

								  lea        edi,[ecx*2]                ; Blk above is blk 2 of this MB.

								   mov       esi,PrevMBAD.BlkY1.MVs

								  mov        DistToBADforBlockAbove,edi

								   mov       eax,BlockToRight[eax*2]    ; Blk to right is blk 3 of mb to the

								  ;                                     ; right (same table offset as used for
								  ;                                     ; blk 2, applied from blk 4), or off
								  ;                                     ; right edge.

								  call       DoOBMCForBlock

								  shl        bl,3                       ; Move result flag to block 4's bit.

								   mov       al,PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks

								  sub        al,bl

								  mov        PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks,al


								OBMCDoneForBlock4:

								;  All four luma blocks handled.  edx was advanced by SIZEOF T_Blk once for
								;  each of blocks 2, 3 and 4; undo that so the caller gets edx back
								;  pointing at the macroblock's descriptor.

								   sub       edx,3*SIZEOF T_Blk    ; Get back to MacroBlock Action Descriptor

								  ret


								;  NOTE(review): StackOffset is presumably the number of bytes of return
								;  addresses on the stack while inside this internal function, used by the
								;  stack-relative local-variable names -- confirm against their definitions.

								StackOffset TEXTEQU <8>

								DoOBMCForBlock: ; Internal Function

								;  Builds the overlapped (OBMC) prediction for one 8x8 luma block from the
								;  central prediction and the predictions obtained with the left, right,
								;  above and below neighbours' motion vectors, subtracts it from the target
								;  block into PelDiffs, and finishes by jumping to MMxDoForwardDCTy.
								;  A neighbour whose macroblock is INTRA, or whose MVs equal the central
								;  MVs, contributes the central prediction instead.


								;  Present register contents.

								;  ebp -- PITCH

								;  esi -- Motion vectors for current block.

								;  ecx -- Distance from BAD of blk we're doing to BAD for block that provides

								;         remote MV from left.

								;  eax -- Distance from BAD of blk we're doing to BAD for block that provides

								;         remote MV from right.

								;  edx -- MBlockActionStream, adjusted to reach BAD of blk we are doing OBMC to.

								;         (BAD: presumably "Block Action Descriptor".)

								;  mm7 -- 8 bytes of -1.

								;  mm6 -- 8 bytes of 0xFE.

								;  DistToBADforBlockAbove, DistToBADforBlockBelow -- memory inputs set by the
								;         caller; read further down as the distances to the BADs that provide
								;         the remote MVs from above and below.

								;

								; In the body of this code:

								;

								;  edx -- Unchanged.

								;  edi -- Saved to memory.  Then used for address of destination for storing

								;         remote prediction blocks.

								;  ebp -- PITCH.

								;  esi -- Pointer to 8*8, 8*9, 9*8, or 9*9 remote reference areas, which are

								;         then interpolated and stored at edi.

								;  ecx, eax -- Inputs are used, then these are scratch.

								;  ebx -- Scratch

								;  mm7 -- 8 bytes of -1

								;  mm6 -- 8 bytes of 0xFE

								;  mm0-mm5 -- Scratch


								;  Compute left remote prediction block.


								  lea        edi,PrevMBAD[ecx]

								  and        edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to left.

								   lea       ebx,CentralPred

								  mov        AddrOfLeftPred,ebx  ; Speculate that left remote MV == center MV.

								   mov       AddrOfRightPred,ebx ; Speculate that right remote MV == center MV.

								  mov        bl,[edi].BlockType

								  cmp        bl,INTRA

								   je        LeftEqCtr           ; Jump if INTRA.  (Use central)


								  mov        ebx,PrevMBAD[ecx].BlkY1.MVs

								  and        ebx,00000FFFFH     ; Blk to left may have B MVs set.  Clear them.

								;  NOTE(review): only the left and above neighbours are masked this way; the
								;  right and below neighbours are compared unmasked, presumably because they
								;  have not yet had B MVs written -- confirm.

								  cmp        esi,ebx

								   je        LeftEqCtr


								;  esi = left neighbour's PastRef - its BlkOffset + this block's BlkOffset,
								;  i.e. the reference address for this block's position under the left
								;  neighbour's motion vector.

								  mov        edi,PrevMBAD[ecx].BlkY1.BlkOffset

								   mov       esi,PrevMBAD[ecx].BlkY1.PastRef   ; Get ref addr using left remote.

								  sub        esi,edi

								   mov       edi,PrevMBAD.BlkY1.BlkOffset

								  add        esi,edi

								   lea       edi,LeftPred


								  call       GetPredForCenterLeftOrRight


								;  The helper leaves four rows in mm0-mm3, which are stored at edi+32..edi+56
								;  after mm1-mm3 are halved (mm2, mm3 masked with 0xFE first so the shift
								;  cannot leak bits between bytes).
								;  NOTE(review): presumably these are rows 4-7 with rows 0-3 already stored
								;  by the helper, and mm1-mm3 arrive as doubled values -- confirm in the
								;  helper.

								  pand       mm2,mm6

								   psrlq     mm1,1

								  movq       [edi+32],mm0

								   psrlq     mm2,1

								  movq       [edi+40],mm1

								   pand      mm3,mm6

								  movq       [edi+48],mm2

								   psrlq     mm3,1

								  lea        ecx,PrevMBAD[eax]

								  and        ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.

								   mov       esi,PrevMBAD.BlkY1.MVs

								  movq       [edi+56],mm3

								   pcmpeqb   mm7,mm7             ;  . .  Restore 8 bytes of -1


								;  Compute right remote prediction block.


								  mov        AddrOfLeftPred,edi  ; Left pred differs from central: use LeftPred.

								   mov       bl,[ecx].BlockType

								  cmp        bl,INTRA

								   je        RightEqCtrButLeftNeCtr ; Jump if INTRA.(Use central)


								  mov        ebx,PrevMBAD[eax].BlkY1.MVs

								  cmp        esi,ebx

								   je        RightEqCtrButLeftNeCtr


								;  Falls through into RightNeCtr with esi / edi = right neighbour's PastRef /
								;  BlkOffset.

								  mov        esi,PrevMBAD[eax].BlkY1.PastRef  ; Get ref addr using right remote.

								   mov       edi,PrevMBAD[eax].BlkY1.BlkOffset


								RightNeCtr:

								;  Right remote MV differs from the central MV.
								;  On entry: esi = right neighbour's PastRef, edi = right neighbour's
								;  BlkOffset, ebx = right neighbour's MVs.  Reached by fall-through from the
								;  code above and by a jump from LeftEqCtr.
								;  Rebase the reference address to this block's position, fetch the
								;  prediction into RightPred, store the four rows returned in mm0-mm3 at
								;  +32..+56 and record RightPred in AddrOfRightPred.

								  sub        esi,edi

								   mov       edi,PrevMBAD.BlkY1.BlkOffset

								  add        esi,edi

								   lea       edi,RightPred


								  call       GetPredForCenterLeftOrRight


								  pand       mm2,mm6

								   psrlq     mm1,1

								  movq       [edi+32],mm0

								   psrlq     mm2,1

								  movq       [edi+40],mm1

								   pand      mm3,mm6

								  movq       [edi+48],mm2

								   psrlq     mm3,1

								  mov        AddrOfRightPred,edi

								   ;

								  movq       [edi+56],mm3

								   pcmpeqb   mm7,mm7             ;  . .  Restore 8 bytes of -1


								RightEqCtrButLeftNeCtr:

								;  At least one of the left / right predictions differs from the central
								;  one (AddrOfLeftPred / AddrOfRightPred already say which buffers to use).
								;  Fetch the central prediction into CentralPred, then decide whether the
								;  below neighbour needs its own prediction: it does not if its macroblock
								;  is INTRA or its MVs equal the central MVs.
								;  Leaves mm0-mm3 = the four rows stored at CentralPred+32..+56 (mm1-mm3
								;  halved), ecx = this block's BlkOffset, edi = AddrOfLeftPred.


								;  Compute central prediction block.


								  mov        ebx,PrevMBAD.BlkY1.MVs

								   mov       esi,PrevMBAD.BlkY1.PastRef

								  lea        edi,CentralPred

								   mov       eax,DistToBADforBlockBelow


								  call       GetPredForCenterLeftOrRight


								  pand       mm2,mm6

								   psrlq     mm1,1

								  movq       [edi+32],mm0

								   psrlq     mm2,1

								  movq       [edi+40],mm1

								   pand      mm3,mm6

								  movq       [edi+48],mm2

								   psrlq     mm3,1

								  lea        ecx,PrevMBAD[eax]

								  and        ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.

								   mov       esi,PrevMBAD.BlkY1.MVs

								  movq       [edi+56],mm3

								   pcmpeqb   mm7,mm7             ;  . .  Restore 8 bytes of -1

								  mov        bl,[ecx].BlockType

								   mov       ecx,PrevMBAD.BlkY1.BlkOffset

								  cmp        bl,INTRA

								   je        BelowEqCtrButSidesDiffer ; Jump if INTRA.  (Use central)


								; Compute bottom remote prediction block.


								  mov        ebx,PrevMBAD[eax].BlkY1.MVs

								   mov       edi,AddrOfLeftPred

								  cmp        esi,ebx

								   jne       BelowNeCtr


								BelowEqCtrButSidesDiffer:

								;  The below neighbour contributes the central prediction.  mm0-mm3 still
								;  hold the central rows just stored, but mm1-mm3 were halved for storing;
								;  double them again because BelowEqCtr halves mm1-mm3 itself before use.

								  paddb      mm1,mm1             ; Prep mm0-3, which have ctr, for reuse below.

								   paddb     mm2,mm2

								  paddb      mm3,mm3

								   mov       edi,AddrOfLeftPred

								  jmp        BelowEqCtr


								BelowNeCtr:

								;  The below neighbour's MVs differ from the central MVs.
								;  On entry: eax = DistToBADforBlockBelow, ebx = below neighbour's MVs,
								;  ecx = this block's BlkOffset, ebp = PITCH.
								;  esi becomes the neighbour's PastRef minus its BlkOffset, and eax becomes
								;  this block's offset advanced by four lines (ecx + 4*PITCH).
								;  NOTE(review): GetPredForAboveOrBelow presumably adds eax to esi and
								;  returns four prediction rows in mm0-mm3 -- confirm in the helper.

								  mov        esi,PrevMBAD[eax].BlkY1.PastRef  ; Get ref addr using below remote.

								   mov       eax,PrevMBAD[eax].BlkY1.BlkOffset

								  sub        esi,eax

								   lea       eax,[ecx+ebp*4]


								  call       GetPredForAboveOrBelow


								BelowEqCtr:

								;  Bottom half of the block when at least one side prediction differs from
								;  the central one.
								;  On entry: mm0-mm3 = below-prediction rows 4-7 (mm1-mm3 are halved here
								;  before use), ecx = this block's BlkOffset, mm6 = 0xFE bytes, mm7 = -1
								;  bytes.  edi is expected to hold AddrOfLeftPred; esi is loaded with
								;  AddrOfRightPred.
								;  For each row the low dword is taken from the buffer at edi and the high
								;  dword from the buffer at esi (bytes 4-7).  The row is then combined with
								;  the central and below predictions using the weights given in the lane
								;  comments, with "pand mm6 / psrlq 1" as a per-byte divide by two and
								;  "psubb mm7" as +1.  Target minus prediction is stored to rows 4-7 of
								;  PelDiffs (16 bytes per row).
								;  In the lane comments: C = central, B = below, T = target, P = prediction,
								;  D = difference.  The R/L letters are the original author's notation for
								;  the two side predictions.
								;  Finally the above neighbour is examined: INTRA or equal MVs goes to
								;  AboveEqCtrButSidesDiffer, otherwise to AboveNeCtr.


								; Compute difference for lines 4 thru 7.

								; Lines 4 and 5: Cols 0,1,6, and 7 treated same.  Cols 2-5 treated same.


								  mov        esi,AddrOfRightPred

								   mov       ebx,TargetFrameBaseAddress

								  movdt      mm5,[edi+48]          ; 6B: <  0   0   0   0 R63 R62 R61 R60>

								   pand      mm2,mm6

								  punpckldq  mm5,[esi+48+4]        ; 6C: <L67 L66 L65 L64 R63 R62 R61 R60>

								   pand      mm3,mm6

								  movq       mm4,CFFFF00000000FFFF ; 6D: < FF  FF  00  00  00  00  FF  FF>

								   psrlq     mm2,1                 ; 6A: <B67 B66 B65 B64 B63 B62 B61 B60>

								  pand       mm4,mm5               ; 6E: <L67 L66  00  00  00  00 R61 R60>

								   paddb     mm5,mm2               ; 6F: <B67+L67 ... B65+L65 ...>


								  pand       mm2,C0000FFFFFFFF0000 ; 6G: < 00  00 B65 B64 B63 B62  00  00>

								   psrlq     mm1,1                 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>

								  paddb      mm2,mm4               ; 6H: <L67 L66 B65 B64 B63 B62 R61 R60>

								   add       ecx,ebx               ;     Address of target block.

								  movdt      mm4,[edi+56]          ; 7B: <  0   0   0   0 R73 R72 R71 R70>

								   psubb     mm5,mm2               ; 6I: <B67 B66 L65 L64 R63 R62 B61 B60>

								  paddb      mm5,CentralPred+48    ; 6J: <C67+B67 ... C65+L65 ...>

								   psrlq     mm3,1                 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>

								  punpckldq  mm4,[esi+56+4]        ; 7C: <L77 L76 L75 L74 R73 R72 R71 R70>

								   pand      mm5,mm6               ; 6K: <C67+B67 ... C65+L65 ...> pre-cleaned

								  mov        eax,DistToBADforBlockAbove

								   psrlq     mm5,1                 ; 6L: <(C67+B67)/2 ... (C65+L65)/2 ...>

								  paddb      mm2,mm5               ; 6M: <(C67+B67+2L67)/2 ...

								  ;                                ;      (C65+2B65+L65)/2 ...>

								   lea       ebx,PelDiffs

								  movq       mm5,CFF000000000000FF ; 7D: < FF  00  00  00  00  00  00  FF>

								   pand      mm2,mm6               ; 6N: pre-cleaned

								  pandn      mm5,CentralPred+56    ; 7E: < 00 C76 C75 C74 C73 C72 C71  00>

								   psrlq     mm2,1                 ; 6O: <(C67+B67+2L67)/4 ...

								   ;                               ;      (C65+2B65+L65)/4 ...>

								  paddb      mm2,CentralPred+48    ; 6P: <(5C67+B67+2L67)/4 ...

								  ;                                ;      (5C65+2B65+L65)/4 ...>

								   paddb     mm5,mm4               ; 7F: <L77 C76+L76 ...>

								  pand       mm4,CFF000000000000FF ; 7G: <L77  00  00  00  00  00  00  L70>

								   psubb     mm2,mm7               ; 6Q: <(5C67+B67+2L67+4)/4 ...

								   ;                               ;      (5C65+2B65+L65+4)/4 ...>

								  paddb      mm4,mm5               ; 7H: <2L77 C76+L76 ...>

								   pand      mm2,mm6               ; 6R: pre-cleaned

								  movq       mm5,[ecx+PITCH*6]     ; 6T: T6

								   psrlq     mm2,1                 ; 6S: P6 = <(5C67+B67+2L67+4)/8 ...

								   ;                               ;           (5C65+2B65+L65+4)/8 ...>

								  psubb      mm5,mm2               ; 6U: D6 = T6 - P6

								   ;

								                                   ; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0

								  movdt      mm2,[edi+32]          ; 4B: <  0   0   0   0 R43 R42 R41 R40>

								   pand      mm4,mm6               ; 7I: <2L77 C76+L76 ...> pre-cleaned

								  movq       [ebx+6*16],mm5        ; 6V: Store D6.

								   psrlq     mm4,1                 ; 7J: <2L77/2 (C76+L76)/2 ...>

								  punpckldq  mm2,[esi+32+4]        ; 4C: <L47 L46 L45 L44 R43 R42 R41 R40>

								   paddb     mm3,mm4               ; 7K: <(2B77+2L77)/2 (C76+2B76+L76)/2 ...>

								  movq       mm5,CFFFF00000000FFFF ; 4D: < FF  FF  00  00  00  00  FF  FF>

								   pand      mm3,mm6               ; 7L: pre-cleaned

								  movq       mm4,CentralPred+32    ; 4E: <C47 C46 C45 C44 C43 C42 C41 C40>

								   psrlq     mm3,1                 ; 7M: <(2B77+2L77)/4 (C76+2B76+L76)/4 ...>

								  paddb      mm3,CentralPred+56    ; 7N: <(4C77+2B77+2L77)/4

								  ;                                ;      (5C76+2B76+L76)/4 ...>

								   pand      mm5,mm4               ; 4F: <C47 C46  00  00  00  00 C41 C40>

								  psubb      mm3,mm7               ; 7O: <(4C77+2B77+2L77+4)/4

								   ;                               ;      (5C76+2B76+L76+4)/4 ...>

								   paddb     mm4,mm2               ; 4G: <C47+L47 ... C45+L45 ...>

								  pand       mm2,C0000FFFFFFFF0000 ; 4H: < 00  00 L45 L44 R43 R42  00  00>

								   pand      mm3,mm6               ; 7P: <(4C77+2B77+2L77+4)/4

								   ;                               ;      (5C76+2B76+L76+4)/4 ...> pre-cleaned

								  paddb      mm2,mm5               ; 4I: <C47 C46 L45 L44 R43 R42 C41 C40>

								   psrlq     mm3,1                 ; 7Q: P7 = <(4C77+2B77+2L77+4)/8

								   ;                               ;           (5C76+2B76+L76+4)/8 ...>

								  movdt      mm5,[edi+40]          ; 5B: <  0   0   0   0 R53 R52 R51 R50>

								   psubb     mm4,mm2               ; 4J: <L47 L46 C45 C44 C43 C42 R41 R40>

								  punpckldq  mm5,[esi+40+4]        ; 5C: <L57 L56 L55 L54 R53 R52 R51 R50>

								   paddb     mm0,mm2               ; 4K: <C47+B47 ... B45+L45 ...>

								  movq       mm2,[ecx+PITCH*7]     ; 7R: T7

								   pand      mm0,mm6               ; 4L: <C47+B47 ... B45+L45 ...> pre-cleaned

								  psubb      mm2,mm3               ; 7S: D7 = T7 - P7

								   psrlq     mm0,1                 ; 4M: <(C47+B47)/2 ... (B45+L45)/2 ...>

								  movq       mm3,CFFFF00000000FFFF ; 5D: < FF  FF  00  00  00  00  FF  FF>

								   paddb     mm0,mm4               ; 4N: <(C47+B47+2L47)/2 ...

								   ;                               ;      (2C45+B45+L45)/2 ...>

								  movq       mm4,CentralPred+40    ; 5E: <C57 C56 C55 C54 C53 C52 C51 C50>

								   pand      mm0,mm6               ; 4O: pre-cleaned

								  pand       mm3,mm4               ; 5F: <C57 C56  00  00  00  00 C51 C50>

								   paddb     mm4,mm5               ; 5G: <C57+L57 ... C55+L55 ...>

								  pand       mm5,C0000FFFFFFFF0000 ; 5H: < 00  00 L55 L54 R53 R52  00  00>

								   psrlq     mm0,1                 ; 4P: <(C47+B47+2L47)/4 ...

								   ;                               ;      (2C45+B45+L45)/4 ...>

								  paddb      mm0,CentralPred+32    ; 4Q: <(5C47+B47+2L47)/4 ...

								  ;                                ;      (6C45+B45+L45)/4 ...>

								   paddb     mm5,mm3               ; 5I: <C57 C56 L55 L54 R53 R52 C51 C50>

								  psubb      mm4,mm5               ; 5J: <L57 L56 C55 C54 C53 C52 R51 R50>

								   paddb     mm1,mm5               ; 5K: <C57+B57 ... B55+L55 ...>

								  pand       mm1,mm6               ; 5L: <C57+B57 ... B55+L55 ...> pre-cleaned

								   psubb     mm0,mm7               ; 4R: <(5C47+B47+2L47+4)/4 ...

								   ;                               ;      (6C45+B45+L45+4)/4 ...>

								  pand       mm0,mm6               ; 4S: pre-cleaned

								   psrlq     mm1,1                 ; 5M: <(C57+B57)/2 ... (B55+L55)/2 ...>

								  paddb      mm1,mm4               ; 5N: <(C57+B57+2L57)/2 ...

								  ;                                ;      (2C55+B55+L55)/2 ...>

								   psrlq     mm0,1                 ; 4T: P4 = <(5C47+B47+2L47+4)/8 ...

								   ;                               ;           (6C45+B45+L45+4)/8 ...>

								  movq       mm3,[ecx+PITCH*5]     ; 5U: T5

								   pand      mm1,mm6               ; 5O: pre-cleaned

								  movq       mm4,[ecx+ebp*4]       ; 4U: T4

								   psrlq     mm1,1                 ; 5P: <(C57+B57+2L57)/4 ...

								   ;                               ;      (2C55+B55+L55)/4 ...>

								  paddb      mm1,CentralPred+40    ; 5Q: <(5C57+B57+2L57)/4 ...

								  ;                                ;      (6C55+B55+L55)/4 ...>

								   psubb     mm4,mm0               ; 4V: D4 = T4 - P4

								  lea        esi,PrevMBAD[eax]

								   psubb     mm1,mm7               ; 5R: <(5C57+B57+2L57+4)/4 ...

								   ;                               ;      (6C55+B55+L55+4)/4 ...>

								  and        esi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.

								   pand      mm1,mm6               ; 5S: pre-cleaned

								  movq       [ebx+7*16],mm2        ; 7T

								   psrlq     mm1,1                 ; 5T: P5 = <(5C57+B57+2L57+4)/8 ...

								   ;                               ;           (6C55+B55+L55+4)/8 ...>

								  movq       [ebx+4*16],mm4        ; 4W: Store D4.

								   psubb     mm3,mm1               ; 5V: D5 = T5 - P5

								  mov        cl,[esi].BlockType    ; Bottom bit set if above neighbor is INTRA.

								   mov       esi,PrevMBAD.BlkY1.MVs

								  movq       [ebx+5*16],mm3        ; 5W: Store D5.

								  cmp        cl,INTRA

								   je        AboveEqCtrButSidesDiffer ; Jump if INTRA.  (Use central)


								; Compute top remote prediction block.


								  mov        ebx,PrevMBAD[eax].BlkY1.MVs

								  and        ebx,00000FFFFH     ; Blk above may have B MVs set.  Clear them.

								   mov       ecx,PrevMBAD.BlkY1.BlkOffset

								  cmp        esi,ebx

								   jne       AboveNeCtr


								AboveEqCtrButSidesDiffer:

								;  The above neighbour contributes the central prediction.  Load rows 0-3 of
								;  CentralPred into mm0-mm3 and double mm1-mm3, because AboveEqCtr halves
								;  mm1-mm3 itself before use.  ecx is reloaded with this block's BlkOffset
								;  since the INTRA path arrives here with cl overwritten.

								  movq       mm3,CentralPred+24   ; Prep mm0-3, which have ctr, for reuse below.

								  movq       mm2,CentralPred+16

								   paddb     mm3,mm3

								  movq       mm1,CentralPred+8

								   paddb     mm2,mm2

								  movq       mm0,CentralPred

								   paddb     mm1,mm1

								  mov        ecx,PrevMBAD.BlkY1.BlkOffset

								   jmp       AboveEqCtr


								AboveNeCtr:

								;  The above neighbour's MVs differ from the central MVs.
								;  On entry: eax = DistToBADforBlockAbove, ebx = above neighbour's MVs with
								;  the upper 16 bits cleared, ecx = this block's BlkOffset.
								;  esi becomes the neighbour's PastRef minus its BlkOffset, and eax becomes
								;  this block's own offset (rows 0-3 start at the top of the block).
								;  NOTE(review): GetPredForAboveOrBelow presumably returns the four rows in
								;  mm0-mm3 and preserves ecx and edi, which AboveEqCtr relies on -- confirm.

								  mov        esi,PrevMBAD[eax].BlkY1.PastRef  ; Get ref addr using above remote.

								   mov       eax,PrevMBAD[eax].BlkY1.BlkOffset

								  sub        esi,eax

								   mov       eax,ecx


								  call       GetPredForAboveOrBelow


								AboveEqCtr:

								;  Top half of the block when at least one side prediction differs from the
								;  central one.
								;  On entry: mm0-mm3 = above-prediction rows 0-3 (mm1-mm3 are halved here
								;  before use), ecx = this block's BlkOffset, mm6 = 0xFE bytes, mm7 = -1
								;  bytes.  edi is expected to still hold AddrOfLeftPred; esi is loaded with
								;  AddrOfRightPred.
								;  Mirrors the rows 4-7 code: for each row the low dword comes from the
								;  buffer at edi and the high dword from the buffer at esi (bytes 4-7).
								;  The row is combined with the central and above predictions using the
								;  weights in the lane comments (A = above), and target minus prediction is
								;  stored to PelDiffsLine0..PelDiffsLine3.
								;  Ends with a jump (not a call) to MMxDoForwardDCTy, so that routine's
								;  return presumably goes straight back to DoOBMCForBlock's caller.


								; Compute difference for lines 0 thru 3.


								  mov        esi,AddrOfRightPred

								   mov       ebx,TargetFrameBaseAddress

								  movdt      mm5,[edi+8]           ; 1B: <  0   0   0   0 R13 R12 R11 R10>

								   psrlq     mm1,1                 ; 1A: <A17 A16 A15 A14 A13 A12 A11 A10>

								  punpckldq  mm5,[esi+8+4]         ; 1C: <L17 L16 L15 L14 R13 R12 R11 R10>

								   pand      mm3,mm6

								  movq       mm4,CFFFF00000000FFFF ; 1D: < FF  FF  00  00  00  00  FF  FF>

								   psrlq     mm3,1                 ; 3A: <A37 A36 A35 A34 A33 A32 A31 A30>

								  pand       mm4,mm5               ; 1E: <L17 L16  00  00  00  00 R11 R10>

								   paddb     mm5,mm1               ; 1F: <A17+L17 ... A15+L15 ...>

								  pand       mm1,C0000FFFFFFFF0000 ; 1G: < 00  00 A15 A14 A13 A12  00  00>

								   pand      mm2,mm6

								  paddb      mm5,CentralPred+8     ; 1H: <C17+A17+L17 ... C15+A15+L15 ...>

								   paddb     mm1,mm4               ; 1I: <L17 L16 A15 A14 A13 A12 R11 R10>

								                                   ; 0A: <A07 A06 A05 A04 A03 A02 A01 A00>:mm0

								  movdt      mm4,[edi]             ; 0B: <  0   0   0   0 R03 R02 R01 R00>

								   psubb     mm5,mm1               ; 1J: <C17+A17 ... C15+L15 ...>

								  punpckldq  mm4,[esi+4]           ; 0C: <L07 L06 L05 L04 R03 R02 R01 R00>

								   pand      mm5,mm6               ; 1K: <C17+A17 ... C15+L15 ...> pre-cleaned

								  add        ecx,ebx               ;     Address of target block.

								   psrlq     mm5,1                 ; 1L: <(C17+A17)/2 ... (C15+L15)/2 ...>

								  paddb      mm1,mm5               ; 1M: <(C17+A17+2L17)/2 ...

								   ;                               ;      (C15+2A15+L15)/2 ...>

								   psrlq     mm2,1                 ; 2A: <A27 A26 A25 A24 A23 A22 A21 A20>

								  movq       mm5,CFF000000000000FF ; 0D: < FF  00  00  00  00  00  00  FF>

								   pand      mm1,mm6               ; 1N: pre-cleaned

								  pandn      mm5,CentralPred       ; 0E: < 00 C06 C05 C04 C03 C02 C01  00>

								   psrlq     mm1,1                 ; 1O: <(C17+A17+2L17)/4 ...

								   ;                               ;      (C15+2A15+L15)/4 ...>

								  paddb      mm1,CentralPred+8     ; 1P: <(5C17+A17+2L17)/4 ...

								  ;                                ;      (5C15+2A15+L15)/4 ...>

								   paddb     mm5,mm4               ; 0F: <L07 C06+L06 ...>

								  pand       mm4,CFF000000000000FF ; 0G: <L07  00  00  00  00  00  00  L00>

								   psubb     mm1,mm7               ; 1Q: <(5C17+A17+2L17+4)/4 ...

								   ;                               ;      (5C15+2A15+L15+4)/4 ...>

								  paddb      mm4,mm5               ; 0H: <2L07 C06+L06 ...>

								   pand      mm1,mm6               ; 1R: pre-cleaned

								  movq       mm5,[ecx+ebp*1]       ; 1T: T1

								   psrlq     mm1,1                 ; 1S: P1 = <(5C17+A17+2L17+4)/8 ...

								   ;                               ;           (5C15+2A15+L15+4)/8 ...>

								  psubb      mm5,mm1               ; 1U: D1 = T1 - P1

								   ;

								  movdt      mm1,[edi+24]          ; 3B: <  0   0   0   0 R33 R32 R31 R30>

								   pand      mm4,mm6               ; 0I: <2L07 C06+L06 ...> pre-cleaned

								  movq       PelDiffsLine1,mm5       ; 1V: Store D1.

								   psrlq     mm4,1                 ; 0J: <2L07/2 (C06+L06)/2 ...>

								  punpckldq  mm1,[esi+24+4]        ; 3C: <L37 L36 L35 L34 R33 R32 R31 R30>

								   paddb     mm0,mm4               ; 0K: <(2A07+2L07)/2 (C06+2A06+L06)/2 ...>

								  movq       mm5,CFFFF00000000FFFF ; 3D: < FF  FF  00  00  00  00  FF  FF>

								   pand      mm0,mm6               ; 0L: pre-cleaned

								  movq       mm4,CentralPred+24    ; 3E: <C37 C36 C35 C34 C33 C32 C31 C30>

								   psrlq     mm0,1                 ; 0M: <(2A07+2L07)/4 (C06+2A06+L06)/4 ...>

								  paddb      mm0,CentralPred       ; 0N: <(4C07+2A07+2L07)/4

								  ;                                ;      (5C06+2A06+L06)/4 ...>

								   pand      mm5,mm4               ; 3F: <C37 C36  00  00  00  00 C31 C30>

								  psubb      mm0,mm7               ; 0O: <(4C07+2A07+2L07+4)/4

								  ;                                ;      (5C06+2A06+L06+4)/4 ...>

								   paddb     mm4,mm1               ; 3G: <C37+L37 ... C35+L35 ...>

								  pand       mm1,C0000FFFFFFFF0000 ; 3H: < 00  00 L35 L34 R33 R32  00  00>

								   pand      mm0,mm6               ; 0P: <(4C07+2A07+2L07+4)/4

								   ;                               ;      (5C06+2A06+L06+4)/4 ...> pre-cleaned

								  paddb      mm1,mm5               ; 3I: <C37 C36 L35 L34 R33 R32 C31 C30>

								   psrlq     mm0,1                 ; 0Q: P0 = <(4C07+2A07+2L07+4)/8

								   ;                               ;           (5C06+2A06+L06+4)/8 ...>

								  movdt      mm5,[edi+16]          ; 2B: <  0   0   0   0 R23 R22 R21 R20>

								   psubb     mm4,mm1               ; 3J: <L37 L36 C35 C34 C33 C32 R31 R30>

								  punpckldq  mm5,[esi+16+4]        ; 2C: <L27 L26 L25 L24 R23 R22 R21 R20>

								   paddb     mm3,mm1               ; 3K: <C37+A37 ... A35+L35 ...>

								  movq       mm1,[ecx]             ; 0R: T0

								   pand      mm3,mm6               ; 3L: <C37+A37 ... A35+L35 ...> pre-cleaned

								  psubb      mm1,mm0               ; 0S: D0 = T0 - P0

								   psrlq     mm3,1                 ; 3M: <(C37+A37)/2 ... (A35+L35)/2 ...>

								  movq       mm0,CFFFF00000000FFFF ; 2D: < FF  FF  00  00  00  00  FF  FF>

								   paddb     mm3,mm4               ; 3N: <(C37+A37+2L37)/2 ...

								   ;                               ;      (2C35+A35+L35)/2 ...>

								  movq       mm4,CentralPred+16    ; 2E: <C27 C26 C25 C24 C23 C22 C21 C20>

								   pand      mm3,mm6               ; 3O: pre-cleaned

								  pand       mm0,mm4               ; 2F: <C27 C26  00  00  00  00 C21 C20>

								   paddb     mm4,mm5               ; 2G: <C27+L27 ... C25+L25 ...>

								  pand       mm5,C0000FFFFFFFF0000 ; 2H: < 00  00 L25 L24 R23 R22  00  00>

								   psrlq     mm3,1                 ; 3P: <(C37+A37+2L37)/4 ...

								   ;                               ;      (2C35+A35+L35)/4 ...>

								  paddb      mm3,CentralPred+24    ; 3Q: <(5C37+A37+2L37)/4 ...

								  ;                                ;      (6C35+A35+L35)/4 ...>

								   paddb     mm5,mm0               ; 2I: <C27 C26 L25 L24 R23 R22 C21 C20>

								  psubb      mm4,mm5               ; 2J: <L27 L26 C25 C24 C23 C22 R21 R20>

								   paddb     mm2,mm5               ; 2K: <C27+A27 ... A25+L25 ...>

								  pand       mm2,mm6               ; 2L: <C27+A27 ... A25+L25 ...> pre-cleaned

								   psubb     mm3,mm7               ; 3R: <(5C37+A37+2L37+4)/4 ...

								   ;                               ;      (6C35+A35+L35+4)/4 ...>

								  pand       mm3,mm6               ; 3S: pre-cleaned

								   psrlq     mm2,1                 ; 2M: <(C27+A27)/2 ... (A25+L25)/2 ...>

								  paddb      mm2,mm4               ; 2N: <(C27+A27+2L27)/2 ...

								  ;                                ;      (2C25+A25+L25)/2 ...>

								   psrlq     mm3,1                 ; 3T: P3 = <(5C37+A37+2L37+4)/8 ...

								   ;                               ;           (6C35+A35+L35+4)/8 ...>

								  movq       mm0,[ecx+ebp*2]       ; 2U: T2

								   pand      mm2,mm6               ; 2O: pre-cleaned

								  movq       mm4,[ecx+PITCH*3]     ; 3U: T3

								   psrlq     mm2,1                 ; 2P: <(C27+A27+2L27)/4 ...

								   ;                               ;      (2C25+A25+L25)/4 ...>

								  paddb      mm2,CentralPred+16    ; 2Q: <(5C27+A27+2L27)/4 ...

								  ;                                ;      (6C25+A25+L25)/4 ...>

								   psubb     mm4,mm3               ; 3V: D3 = T3 - P3

								  movq       PelDiffsLine0,mm1     ; 0T

								   psubb     mm2,mm7               ; 2R: <(5C27+A27+2L27+4)/4 ...

								   ;                               ;      (6C25+A25+L25+4)/4 ...>

								  movq       PelDiffsLine3,mm4     ; 3W: Store D3.

								   pand      mm2,mm6               ; 2S: pre-cleaned

								  psrlq      mm2,1                 ; 2T: P2 = <(5C27+A27+2L27+4)/8 ...

								  ;                                ;           (6C25+A25+L25+4)/8 ...>

								   ;

								  psubb      mm0,mm2               ; 2V: D2 = T2 - P2

								   ;

								  ;

								   ;

								  movq       PelDiffsLine2,mm0     ; 2W: Store D2.

								   ;

								  jmp        MMxDoForwardDCTy      ; Block is in PelDiffs block;  Pitch is 16


								LeftEqCtr:

								;  Reached when the left neighbour is INTRA or has the same MVs as the
								;  centre, so AddrOfLeftPred keeps pointing at CentralPred.
								;  On entry eax is still the distance to the BAD supplying the right remote
								;  MV.  If the right neighbour is neither INTRA nor equal to the centre,
								;  join RightNeCtr with esi / edi / ebx = its PastRef / BlkOffset / MVs;
								;  otherwise fall into LeftEqCtrAndRightEqCtr.


								;  Left remote motion vector was same as center.

								;  Compute right remote prediction block.


								  lea        edi,PrevMBAD[eax]

								  and        edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.

								   mov       esi,PrevMBAD.BlkY1.MVs

								  ;

								   ;

								  mov        cl,[edi].BlockType

								   mov       ebx,PrevMBAD[eax].BlkY1.MVs

								  cmp        cl,INTRA

								   je        LeftEqCtrAndRightEqCtr ; Jump if INTRA.  (Use central)


								  cmp        esi,ebx            ; Flags survive the two loads below (mov
								  ;                             ; does not alter flags) until the jne.

								   mov       esi,PrevMBAD[eax].BlkY1.PastRef  ; Get ref addr using right remote.

								  mov        edi,PrevMBAD[eax].BlkY1.BlkOffset

								   jne       RightNeCtr


								;  Left and right remote motion vectors were same as center.

								;  Compute central prediction block.


								LeftEqCtrAndRightEqCtr:

								;  Both side predictions equal the central prediction.
								;  Fetch the central prediction into CentralPred and store the four rows
								;  returned in mm0-mm3 at +32..+56 (mm1-mm3 halved).  Then examine the
								;  below neighbour: if it is INTRA or has the same MVs, jump to
								;  BottomHalfAllSame with mm0-mm3 = central rows 4-7, so the prediction for
								;  rows 4-7 is the central prediction itself.  Otherwise fetch the below
								;  prediction and blend it with the central rows using the weights in the
								;  lane comments, leaving P4-P7 in mm0-mm3 before falling through into
								;  BottomHalfAllSame.
								;  ecx = this block's BlkOffset on both exits.


								  mov        ebx,PrevMBAD.BlkY1.MVs

								   mov       esi,PrevMBAD.BlkY1.PastRef

								  lea        edi,CentralPred

								   mov       eax,DistToBADforBlockBelow


								  call       GetPredForCenterLeftOrRight


								  pand       mm2,mm6

								   psrlq     mm1,1

								  movq       [edi+32],mm0

								   psrlq     mm2,1

								  movq       [edi+40],mm1

								   pand      mm3,mm6

								  movq       [edi+48],mm2

								   psrlq     mm3,1

								  lea        ecx,PrevMBAD[eax]

								  and        ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.

								   mov       esi,PrevMBAD.BlkY1.MVs

								  movq       [edi+56],mm3

								   pcmpeqb   mm7,mm7             ;  . .  Restore 8 bytes of -1

								  mov        bl,[ecx].BlockType

								   mov       ecx,PrevMBAD.BlkY1.BlkOffset

								  cmp        bl,INTRA            ; Flags survive the two loads below until
								  ;                              ; the je (mov does not alter flags).

								   mov       edi,AddrOfLeftPred

								  mov        ebx,PrevMBAD[eax].BlkY1.MVs

								   je        BottomHalfAllSame   ; Jump if INTRA.  (Use central)


								; Compute bottom remote prediction block.


								  cmp        esi,ebx

								   mov       esi,PrevMBAD[eax].BlkY1.PastRef  ; Get ref addr using below remote.

								  mov        eax,PrevMBAD[eax].BlkY1.BlkOffset

								   je        BottomHalfAllSame


								  sub        esi,eax

								   lea       eax,[ecx+ebp*4]


								  call       GetPredForAboveOrBelow


								; Compute difference for lines 4 thru 7.  Only the remote motion vector below

								; was different than the central motion vector.


								                                   ; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0

								  movq       mm5,CentralPred+48    ; 6b: <C67 C66 C65 C64 C63 C62 C61 C60>

								   pand      mm2,mm6

								  movq       mm4,CentralPred+32    ; 4B: <C47 C46 C45 C44 C43 C42 C41 C40>

								   psrlq     mm2,1                 ; 6a: <B67 B66 B65 B64 B63 B62 B61 B60>

								  paddb      mm2,mm5               ; 6c: <C67+B67 ... C65+B65 ...>

								   paddb     mm0,mm4               ; 4C: <C47+B47>

								  pand       mm0,mm6               ; 4D: <C47+B47> pre-cleaned

								   psrlq     mm1,1                 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>

								  pand       mm2,mm6               ; 6d: <C67+B67 ... C65+B65 ...> pre-cleaned

								   psrlq     mm0,1                 ; 4E: <(C47+B47)/2 ...>

								  paddb      mm0,mm4               ; 4F: <(3C47+B47)/2 ...>

								   psrlq     mm2,1                 ; 6e: <(C67+B67)/2 ... (C65+B65)/2 ...>

								  pmullw     mm2,C0001000200020001 ; 6f: <(C67+B67)/2 ... (2C65+2B65)/2 ...>

								   pand      mm0,mm6               ; 4G: <(3C47+B47)/2 ...> pre-cleaned

								  pand       mm3,mm6

								   psrlq     mm0,1                 ; 4H: <(3C47+B47)/4 ...>

								  paddb      mm0,mm4               ; 4I: <(7C47+B47)/4 ...>

								   psrlq     mm3,1                 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>

								  movq       mm4,C0000FFFFFFFF0000 ; 6g: < 00  00  FF  FF  FF  FF  00  00>

								   psubb     mm0,mm7               ; 4J: <(7C47+B47+4)/4 ...>

								  pandn      mm4,mm5               ; 6h: <C67 C66  00  00  00  00 C61 C60>

								   psubb     mm5,mm7               ; 6i: <C67+1 ... C65+1 ...>

								  paddb      mm2,mm4               ; 6j: <(3C67+B67)/2 ... (2C65+2B65)/2 ...>

								   pand      mm0,mm6               ; 4K: <(7C47+B47+4)/4 ...> pre-cleaned

								  movq       mm4,CentralPred+40    ; 5B

								   pand      mm2,mm6               ; 6k: pre-cleaned

								  paddb      mm1,mm4               ; 5C

								   psrlq     mm0,1                 ; 4L: <(7C47+B47+4)/8 ...>

								  pand       mm1,mm6               ; 5D

								   psrlq     mm2,1                 ; 6l: <(3C67+B67)/4 ... (2C65+2B65)/4 ...>

								  paddb      mm2,mm5               ; 6m: <(7C67+B67+4)/4 ... (6C65+2B65+4)/4...>

								   psrlq     mm1,1                 ; 5E

								  movq       mm5,CentralPred+56    ; 7B: <C77 C76 C75 C74 C73 C72 C71 C70>

								   paddb     mm1,mm4               ; 5F

								  paddb      mm3,mm5               ; 7C: <C77+B77>

								   pand      mm1,mm6               ; 5G

								  pand       mm3,mm6               ; 7D: <C77+B77> pre-cleaned

								   psrlq     mm1,1                 ; 5H

								  paddb      mm1,mm4               ; 5I

								   psrlq     mm3,1                 ; 7E: <(C77+B77)/2 ...>

								  psubb      mm1,mm7               ; 5J

								   paddb     mm3,mm5               ; 7F: <(3C77+B77)/2 ...>

								  pand       mm1,mm6               ; 5K

								   psubb     mm3,mm7               ; 7G: <(3C77+B77+2)/2 ...>

								  pand       mm2,mm6               ; 6n: pre-cleaned

								   psrlq     mm1,1                 ; 5L

								  pand       mm3,mm6               ; 7H: <(3C77+B77+2)/2 ...> pre-cleaned

								   psrlq     mm2,1                 ; 6o: <(7C67+B67+4)/8 ... (6C65+2B65+4)/8...>

								  psrlq      mm3,1                 ; 7I: <(3C77+B77+2)/4 ...>


								BottomHalfAllSame:

								; Finish lines 4..7 of the OBMC difference block, then decide how the
								; prediction for lines 0..3 must be formed.
								;
								; As used below, on entry:
								;   mm0, mm1, mm2, mm3 -- predictions for lines 4, 5, 6, 7 (P4..P7).
								;   ecx -- offset of the target block; TargetFrameBaseAddress is added to
								;          it here to get the target block address.
								;   ebp -- used as the line stride (assumed equal to PITCH, since the
								;          [ecx+ebp*4] load is mixed with [ecx+PITCH*n] loads -- confirm).
								;
								; Stores PelDiffsLine4..7 = target line - prediction line (byte-wise).
								;
								; Exits one of two ways:
								;   - Block above is INTRA, or its (masked) MVs equal the value loaded
								;     from PrevMBAD.BlkY1.MVs: reach SidesEqCtrAndAboveEqCtr, with
								;     ecx = PrevMBAD.BlkY1.BlkOffset.
								;   - Otherwise: jump to SidesEqCtrButAboveNeCtr with
								;       ebx = MVs of the block above (upper 16 bits cleared),
								;       esi = PastRef of the block above,
								;       eax = BlkOffset of the block above,
								;       ecx = PrevMBAD.BlkY1.BlkOffset.
								; NOTE(review): esi (PrevMBAD.BlkY1.MVs) is compared unmasked against the
								; masked MVs of the block above; presumably its upper 16 bits are already
								; zero at this point -- confirm.

								   mov       ebx,TargetFrameBaseAddress

								  mov        eax,DistToBADforBlockAbove

								   mov       esi,PrevMBAD.BlkY1.MVs

								  movq       mm5,[ecx+ebx+PITCH*5] ; 5M

								  add        ecx,ebx               ;     Address of target block.

								   lea       ebx,PrevMBAD[eax]


								; Rounding the address down like this presumably relies on
								; SIZEOF T_MacroBlockActionDescr being a power of two and on the
								; descriptors being aligned to that size -- confirm.
								  and        ebx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.

								   psubb     mm5,mm1               ; 5N

								  movq       mm4,[ecx+ebp*4]       ; 4M: T4

								  movq       mm1,[ecx+PITCH*7]     ; 7J: T7

								   psubb     mm4,mm0               ; 4N: D4 = T4 - P4

								  movq       mm0,[ecx+PITCH*6]     ; 6p: T6

								   psubb     mm1,mm3               ; 7K: D7 = T7 - P7

								  movq       PelDiffsLine4,mm4     ; 4O: Store D4.

								   psubb     mm0,mm2               ; 6q: D6 = T6 - P6

								  movq       PelDiffsLine5,mm5     ; 5O

								  movq       PelDiffsLine6,mm0     ; 6r

								  movq       PelDiffsLine7,mm1     ; 7L

								; ecx is free from here on (all target loads for lines 4..7 are done), so
								; cl is reused to fetch the block type of the block above.
								  mov        cl,[ebx].BlockType

								  cmp        cl,INTRA

								; The two MOVs below do not alter flags, so the JE still tests the
								; CMP against INTRA above.
								   mov       ecx,PrevMBAD.BlkY1.BlkOffset

								  mov        ebx,PrevMBAD[eax].BlkY1.MVs

								   je        SidesEqCtrAndAboveEqCtr  ; Jump if INTRA.  (Use central)


								; Compute top remote prediction block.


								  and        ebx,00000FFFFH     ; Blk above may have B MVs set.  Clear them.

								  cmp        esi,ebx

								; Again only MOVs between the CMP and the JNE, so the flags survive.
								   mov       esi,PrevMBAD[eax].BlkY1.PastRef  ; Get ref addr using above remote.

								  mov        eax,PrevMBAD[eax].BlkY1.BlkOffset

								   jne       SidesEqCtrButAboveNeCtr


								SidesEqCtrAndAboveEqCtr:

								; The block above contributes nothing different from the central
								; prediction (it is INTRA or has the same MVs), so lines 0..3 of the
								; prediction are simply lines 0..3 of CentralPred.
								;
								; TopHalfAllSame still performs the final "shift right by 1" on mm0, mm1
								; and mm2 (steps 0I, 1o, 2L), so those three are doubled here to cancel
								; that shift.  mm3 is used unshifted at TopHalfAllSame and is therefore
								; loaded as is.
								; NOTE(review): the byte-wise double-then-halve only reproduces the
								; original value when each pel fits in 7 bits -- presumably guaranteed
								; by the frame format; confirm.

								  movq       mm0,CentralPred

								  movq       mm1,CentralPred+8

								   paddb     mm0,mm0

								  movq       mm2,CentralPred+16

								   paddb     mm1,mm1

								  movq       mm3,CentralPred+24

								   paddb     mm2,mm2

								  jmp        TopHalfAllSame


								SidesEqCtrButAboveNeCtr:

								; Only the block above has MVs that differ from the central ones.  Fetch
								; the prediction obtained with the above block's MVs and blend it with
								; CentralPred for lines 0..3.
								;
								; On entry (set up at the end of BottomHalfAllSame):
								;   esi = PastRef of block above, eax = BlkOffset of block above,
								;   ecx = BlkOffset of the current block, ebx = MVs of block above.
								; esi - eax rebases the reference address; GetPredForAboveOrBelow adds
								; eax (loaded from ecx here) back in to address the reference block at
								; the current block's position.

								  sub        esi,eax

								   mov       eax,ecx


								  call       GetPredForAboveOrBelow


								; Compute difference for lines 0 thru 3.  Only the remote motion vector above
								; was different than the central motion vector.
								;
								; As consumed below, the call leaves the remote prediction (A) as:
								;   mm0 -- line 0, used as is.
								;   mm1 -- line 1, needing only a shift right by 1.
								;   mm2, mm3 -- lines 2 and 3, needing a mask with mm6 and a shift right.
								;
								; mm6 holds bytes of 0xFE (clears the bit that a quadword shift would
								; otherwise carry into the neighbouring byte) and mm7 holds bytes of -1
								; (so "psubb x,mm7" adds 1 to each byte).
								;
								; Results, with C = CentralPred and truncation at every halving:
								;   line 0: (3C + A + 2) / 4                          (steps 0B..0I)
								;   line 1: (7C + A + 4) / 8 in the outer two byte pairs,
								;           (6C + 2A + 4) / 8 in the middle four bytes (steps 1A..1o)
								;   line 2: (7C + A + 4) / 8                          (steps 2a..2L)
								;   line 3: (7C + A + 4) / 8                          (steps 3A..3L)
								; The last steps of lines 0, 1 and 2 (0H/0I, 1n/1o, 2L) are carried out at
								; TopHalfAllSame.


								  movq       mm5,CentralPred+8     ; 1b

								   pand      mm3,mm6

								  movq       mm4,CentralPred+24    ; 3B

								   psrlq     mm3,1                 ; 3A

								  paddb      mm3,mm4               ; 3C

								   psrlq     mm1,1                 ; 1A

								  paddb      mm1,mm5               ; 1c

								   pand      mm3,mm6               ; 3D

								  pand       mm1,mm6               ; 1d

								   psrlq     mm3,1                 ; 3E

								  paddb      mm3,mm4               ; 3F

								   psrlq     mm1,1                 ; 1e

								; Word multipliers 1,2,2,1: doubles the middle four bytes of line 1 so
								; they end up with the heavier remote weight.
								  pmullw     mm1,C0001000200020001 ; 1f

								   pand      mm3,mm6               ; 3G

								  pand       mm2,mm6

								   psrlq     mm3,1                 ; 3H

								  paddb      mm3,mm4               ; 3I

								   psrlq     mm2,1                 ; 2a

								  movq       mm4,C0000FFFFFFFF0000 ; 1g

								   psubb     mm3,mm7               ; 3J

								; pandn keeps the central pels only in the outer byte pairs (middle four
								; bytes become zero).
								  pandn      mm4,mm5               ; 1h

								   psubb     mm5,mm7               ; 1i

								  paddb      mm1,mm4               ; 1j

								   pand      mm3,mm6               ; 3K

								  movq       mm4,CentralPred+16    ; 2B

								   pand      mm1,mm6               ; 1k

								  paddb      mm2,mm4               ; 2C

								   psrlq     mm3,1                 ; 3L

								  pand       mm2,mm6               ; 2D

								   psrlq     mm1,1                 ; 1l

								  paddb      mm1,mm5               ; 1m

								   psrlq     mm2,1                 ; 2E

								  movq       mm5,CentralPred       ; 0B

								   paddb     mm2,mm4               ; 2F

								  paddb      mm0,mm5               ; 0C

								   pand      mm2,mm6               ; 2G

								  pand       mm0,mm6               ; 0D

								   psrlq     mm2,1                 ; 2H

								  paddb      mm2,mm4               ; 2I

								   psrlq     mm0,1                 ; 0E

								  psubb      mm2,mm7               ; 2J

								   paddb     mm0,mm5               ; 0F

								  pand       mm2,mm6               ; 2K

								   psubb     mm0,mm7               ; 0G


								TopHalfAllSame:

								; Common tail for lines 0..3: finish the predictions, subtract them from
								; the target pels, store the differences and go on to the forward DCT.
								;
								; As used below, on entry:
								;   ecx -- offset of the target block (TargetFrameBaseAddress is added).
								;   mm0 -- twice the line 0 prediction (still needs mask + shift: 0H, 0I).
								;   mm1 -- twice the line 1 prediction (still needs mask + shift: 1n, 1o).
								;   mm2 -- twice the line 2 prediction, low bit of each byte already
								;          clear (needs only the shift: 2L).
								;   mm3 -- line 3 prediction, final.
								;   mm6 -- bytes of 0xFE.
								;   ebp -- line stride (assumed equal to PITCH -- confirm).
								;
								; mm7 is reused for the line 0 target pels here, so the "bytes of -1"
								; constant is no longer available after this point.
								; Stores PelDiffsLine0..3.


								  mov        ebx,TargetFrameBaseAddress

								  lea        edi,[ecx+ebx]

								   pand      mm1,mm6               ; 1n

								  movq       mm7,[ecx+ebx]         ; 0J

								   pand      mm0,mm6               ; 0H

								  movq       mm5,[edi+PITCH*3]     ; 3M

								   psrlq     mm2,1                 ; 2L

								  movq       mm4,[edi+ebp*2]       ; 2M

								   psubb     mm5,mm3               ; 3N

								  psubb      mm4,mm2               ; 2N

								   psrlq     mm1,1                 ; 1o

								  movq       mm3,[edi+ebp*1]       ; 1p

								   psubb     mm3,mm1               ; 1q

								  movq       PelDiffsLine3,mm5     ; 3O

								   psrlq     mm0,1                 ; 0I

								  movq       PelDiffsLine2,mm4     ; 2O

								   psubb     mm7,mm0               ; 0K

								  movq       PelDiffsLine1,mm3     ; 1r

								  movq       PelDiffsLine0,mm7     ; 0L

								  jmp        MMxDoForwardDCTy      ; Block is in PelDiffs block;  Pitch is 16


								;=============================================================================

								; This internal function computes the OBMC contribution for the reference

								; block that uses the left, central, or right remote motion vector.

								;

								; Inputs:

								;  ebp -- PITCH

								;  edi -- Address of where to put the contribution.

								;  esi -- Address of reference block.

								;  edx -- Reserved.  MBlockActionStream

								;  ecx -- Unavailable.

								;  ebx -- Scratch.  Initially the horizontal and vertical motion vectors.

								;         Bit 0 selects horizontal interpolation and bit 8 selects

								;         vertical interpolation (these are the only bits tested here).

								;  eax -- Unavailable.

								;  mm7 -- 8 bytes of -1

								;  mm6 -- 8 bytes of 0xFE

								;  mm0-mm5 -- Scratch

								;

								; Outputs:

								;  [edi+0], [edi+8], [edi+16], [edi+24] -- lines 0 thru 3 of the contribution.

								;  mm0-mm3 -- lines 4 thru 7, NOT stored.  In the no-interpolation path mm0

								;         holds line 4 as is and mm1-mm3 hold twice lines 5-7.  The

								;         interpolating paths tail-jump into Get4MoreLinesOfPred_*, which

								;         presumably return lines 4-7 in the same form as the

								;         Get4LinesOfPred_* results consumed below (mm0 final, mm1 needing a

								;         shift, mm2/mm3 needing mask and shift) -- confirm against them.

								;

								; The Get4LinesOfPred_* helpers are presumably expected to advance esi by

								; four lines, since the same [esi+ebp*1] / [esi+1] loads are repeated

								; before the second half -- confirm.


								; While inside these internal functions StackOffset is set to a text that

								; is not a number, presumably so that any accidental use of a stack-based

								; local variable fails to assemble.

								StackOffset TEXTEQU <12_ButAccessToLocalVariablesShouldNotBeNeeded>


								GetPredForCenterLeftOrRight:


								; Carry <- bit 0 of the motion vectors (horizontal interpolation flag).

								  shr        ebx,1

								   jc        HorzInterpInCLRPred


								  movq       mm1,[esi+ebp*1]

								; After the shift, bit 7 of bl is the original bit 8 (vertical

								; interpolation flag).

								  and        bl,080H

								   je        NoInterpInCLRPred


								VertInterpInCLRPred:


								  movq       mm0,[esi]

								; Subtracting -1 adds 1 to every byte of the second line before the helper

								; is called (presumably the rounding term for the vertical average).

								   psubb     mm1,mm7


								  call       Get4LinesOfPred_InterpVert


								; Finish lines 1-3 (mask / shift right by 1) while storing lines 0-3, and

								; set up mm0/mm1 for the next four lines the same way as before the call.

								  pand       mm2,mm6

								   psrlq     mm1,1

								  movq       [edi+0],mm0

								   pand      mm3,mm6

								  movq       [edi+8],mm1

								   psrlq     mm2,1

								  movq       mm1,[esi+ebp*1]

								   psrlq     mm3,1

								  movq       [edi+16],mm2

								   movq      mm0,mm4

								  movq       [edi+24],mm3

								   psubb     mm1,mm7

								; Tail jump: the helper's return goes straight back to our caller.

								  jmp        Get4MoreLinesOfPred_InterpVert


								HorzInterpInCLRPred:


								  movq       mm5,[esi+1]         ; A. .  <R08 R07 R06 R05 R04 R03 R02 R01>

								  and        bl,080H

								   jne       BothInterpInCLRPred


								  call       Get4LinesOfPred_InterpHorz


								  pand       mm2,mm6

								   psrlq     mm1,1

								  movq       [edi+0],mm0

								   pand      mm3,mm6

								  movq       [edi+8],mm1

								   psrlq     mm2,1

								  movq       mm5,[esi+1]         ; <R48 R47 R46 R45 R44 R43 R42 R41>

								   psrlq     mm3,1

								  movq       [edi+16],mm2

								   ;

								  movq       [edi+24],mm3

								   ;

								; Tail jump: the helper's return goes straight back to our caller.

								  jmp        Get4MoreLinesOfPred_InterpHorz


								BothInterpInCLRPred:


								  call       Get4LinesOfPred_InterpBoth


								  pand       mm2,mm6

								   psrlq     mm1,1

								  movq       [edi+0],mm0

								   pand      mm3,mm6

								  movq       [edi+8],mm1

								   psrlq     mm2,1

								  movq       mm1,[esi+ebp*1]

								   psrlq     mm3,1

								  movq       [edi+16],mm2

								   movq      mm0,mm4

								  movq       [edi+24],mm3

								   psubb     mm1,mm7

								; NOTE(review): mm5 is doubled before continuing; this is presumably the

								; state Get4MoreLinesOfPred_InterpBoth expects on entry -- confirm there.

								  paddb      mm5,mm5

								   jmp       Get4MoreLinesOfPred_InterpBoth


								NoInterpInCLRPred:


								; Full-pel motion vector: lines 0-3 are copied straight from the reference

								; (mm1 already holds line 1, loaded before the branch).  Lines 5-7 are

								; returned doubled and line 4 as is.

								  movq       mm0,[esi]

								  movq       mm2,[esi+ebp*2]

								  movq       mm3,[esi+PITCH*3]

								  movq       [edi+0],mm0

								  movq       [edi+8],mm1

								  movq       [edi+16],mm2

								  movq       [edi+24],mm3

								  movq       mm3,[esi+PITCH*7]

								  movq       mm2,[esi+PITCH*6]

								   paddb     mm3,mm3

								  movq       mm1,[esi+PITCH*5]

								   paddb     mm2,mm2

								  movq       mm0,[esi+ebp*4]

								   paddb     mm1,mm1

								  ret


								;=============================================================================

								; This internal function computes the OBMC contribution for the reference

								; block that uses the remote motion vector from block above or below.

								; Only four lines are produced, and they are returned in registers.

								;

								; Inputs:

								;  ebp -- PITCH

								;  edi -- Not used.

								;  esi -- Reference address less the block offset; eax is added to it on

								;         entry to form the address of the reference block.

								;  edx -- Reserved.  MBlockActionStream

								;  ecx -- Unavailable.  Must not be changed.

								;  ebx -- Scratch.  Initially the horizontal and vertical motion vectors.

								;         Bit 0 selects horizontal interpolation and bit 8 selects

								;         vertical interpolation (these are the only bits tested here).

								;  eax -- Offset within frame for block being worked on.

								;  mm7 -- 8 bytes of -1

								;  mm6 -- 8 bytes of 0xFE

								;  mm0-mm5 -- Scratch

								;

								; Outputs (no-interpolation path):

								;  mm0 -- first line, as is.

								;  mm1, mm2, mm3 -- twice the second, third and fourth lines.

								; The interpolating paths tail-jump into Get4LinesOfPred_*, whose results

								; the callers treat the same way (mm0 final, mm1 needing a shift right,

								; mm2/mm3 needing a mask with mm6 and a shift right).


								GetPredForAboveOrBelow:


								; Carry <- bit 0 of the motion vectors (horizontal interpolation flag).

								; LEA does not alter flags, so the JC below still sees that carry.

								  shr        ebx,1

								   lea       esi,[esi+eax]

								  jc         HorzInterpInABPred


								; Set up mm0/mm1 as for vertical interpolation (second line plus 1 in

								; every byte) before knowing whether it is needed.

								  movq       mm1,[esi+ebp*1]

								  movq       mm0,[esi]

								   psubb     mm1,mm7

								; After the shift, bit 7 of bl is the original bit 8 (vertical

								; interpolation flag).

								  and        bl,080H

								; Tail jump: the helper's return goes straight back to our caller.

								   jne       Get4LinesOfPred_InterpVert


								; Full-pel motion vector: undo the +1 applied to the second line above,

								; then double the second, third and fourth lines.

								  movq       mm2,[esi+ebp*2]

								   paddb     mm1,mm7

								  movq       mm3,[esi+PITCH*3]

								   paddb     mm1,mm1

								  paddb      mm2,mm2

								   paddb     mm3,mm3

								  ret


								HorzInterpInABPred:


								  movq       mm5,[esi+1]         ; A. .  <R08 R07 R06 R05 R04 R03 R02 R01>

								  and        bl,080H

								; Both of these are tail jumps; the helpers return directly to our caller.

								   jne       Get4LinesOfPred_InterpBoth


								  jmp        Get4LinesOfPred_InterpHorz


								StackOffset TEXTEQU <0>

								;=============================================================================

								ENDIF


								Done:

								; End-of-work handling before the common epilogue at TrulyDone.
								; For H261 builds nothing is assembled here.  For H263 builds:
								;   - If PendingOBMC is zero, go straight to TrulyDone.
								;   - Otherwise set StashBlockType to INTER1MV and call DoPendingOBMCDiff.
								;   - edx is then moved back by one T_MacroBlockActionDescr.
								;   - If IsPlainPFrame is non-zero, go to TrulyDone; otherwise set up mm6
								;     and mm7 and call MMxDoBFrameLumaBlocks before falling into TrulyDone.


								IFDEF H261

								ELSE; H263

								  mov        bl,PendingOBMC

								   mov       cl,INTER1MV

								  test       bl,bl

								   je        TrulyDone


								  mov        StashBlockType,cl


								  call       DoPendingOBMCDiff


								  mov        al,IsPlainPFrame

								; The ADD changes the flags, but the TEST that follows sets them afresh
								; for the JNE.
								   add       edx,-SIZEOF T_MacroBlockActionDescr

								  test       al,al

								   jne       TrulyDone


								  movq       mm6,C0101010101010101

								   pxor      mm7,mm7                      ; Initialize SWD accumulator


								  call       MMxDoBFrameLumaBlocks


								ENDIF

								TrulyDone:

								; Common epilogue of MMxEDTQ.
								;   - emms is executed before returning, as is required after MMX use
								;     before any floating-point code runs.
								;   - SWDTotal (and, for H263 builds, BSWDTotal) are copied to the
								;     locations addressed by the PSWDTotal / PBSWDTotal arguments.
								;   - The totals are read BEFORE esp is reloaded from StashESP, and the
								;     argument pointers AFTER; presumably the totals are locals addressed
								;     off the working esp while the arguments are addressed off the
								;     restored one -- keep this order.
								;   - ebx, ebp, edi and esi are popped, then "rturn" (a macro defined
								;     outside this part of the file) returns to the caller.


								  emms

								IFDEF H261

								  mov        eax,SWDTotal

								  mov        esp,StashESP

								  mov        edi,[esp+PSWDTotal]

								  mov        [edi],eax

								ELSE

								  mov        eax,SWDTotal

								   mov       ebx,BSWDTotal

								  mov        esp,StashESP

								  mov        edi,[esp+PSWDTotal]

								   mov       esi,[esp+PBSWDTotal]

								  mov        [edi],eax

								   mov       [esi],ebx

								ENDIF

								  pop        ebx

								   pop       ebp

								  pop        edi

								   pop       esi

								  rturn

								
MMxEDTQ endp

END