windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/i386/ex5me.asm


								;/* *************************************************************************

								;**    INTEL Corporation Proprietary Information

								;**

								;**    This listing is supplied under the terms of a license

								;**    agreement with INTEL Corporation and may not be copied

								;**    nor disclosed except in accordance with the terms of

								;**    that agreement.

								;**

								;**    Copyright (c) 1995 Intel Corporation.

								;**    All Rights Reserved.

								;**

								;** *************************************************************************

								;*/


								;////////////////////////////////////////////////////////////////////////////

								;//

								;// $Header:   R:\h26x\h26x\src\enc\ex5me.asv   1.17   24 Sep 1996 11:27:00   BNICKERS  $

								;//

								;// $Log:   R:\h26x\h26x\src\enc\ex5me.asv  $

								;//

								;//    Rev 1.17   24 Sep 1996 11:27:00   BNICKERS

								;//

								;// Fix register colision.

								;//

								;//    Rev 1.16   24 Sep 1996 10:40:32   BNICKERS

								;// For H261, zero out motion vectors when classifying MB as Intra.

								;//

								;//    Rev 1.13   19 Aug 1996 13:48:26   BNICKERS

								;// Provide threshold and differential variables for spatial filtering.

								;//

								;//    Rev 1.12   17 Jun 1996 15:19:34   BNICKERS

								;// Fix recording of block and MB SWDs for Spatial Loop Filtering case in H261.

								;//

								;//    Rev 1.11   30 May 1996 16:40:14   BNICKERS

								;// Fix order of arguments.

								;//

								;//    Rev 1.10   30 May 1996 15:08:36   BNICKERS

								;// Fixed minor error in recent IA ME speed improvements.

								;//

								;//    Rev 1.9   29 May 1996 15:37:58   BNICKERS

								;// Acceleration of IA version of ME.

								;//

								;//    Rev 1.8   15 Apr 1996 10:48:48   AKASAI

								;// Fixed bug in Spatial loop filter code.  Code had been unrolled and

								;// the second case had not been updated in the fix put in place of

								;// (for) the first case.  Basically an ebx instead of bl that cased

								;// and overflow from 7F to 3F.

								;//

								;//    Rev 1.7   15 Feb 1996 15:39:26   BNICKERS

								;// No change.

								;//

								;//    Rev 1.6   15 Feb 1996 14:39:00   BNICKERS

								;// Fix bug wherein access to area outside stack frame was occurring.

								;//

								;//    Rev 1.5   15 Jan 1996 14:31:40   BNICKERS

								;// Fix decrement of ref area addr when half pel upward is best in block ME.

								;// Broadcast macroblock level MV when block gets classified as Intra.

								;//

								;//    Rev 1.4   12 Jan 1996 13:16:08   BNICKERS

								;// Fix SLF so that 3 7F pels doesn't overflow, and result in 3F instead of 7F.

								;//

								;//    Rev 1.3   27 Dec 1995 15:32:46   RMCKENZX

								;// Added copyright notice

								;//

								;//    Rev 1.2   19 Dec 1995 17:11:16   RMCKENZX

								;// fixed 2 bugs:

								;//   1.  do +-15 pel search if central and NOT 4 mv / macroblock

								;//      (was doing when central AND 4 mv / macroblock)

								;//   2.  correctly compute motion vectors when doing 4 motion

								;//      vectors per block.

								;//

								;//    Rev 1.1   28 Nov 1995 15:25:48   AKASAI

								;// Added white space so that will complie with the long lines.

								;//

								;//    Rev 1.0   28 Nov 1995 14:37:00   BECHOLS

								;// Initial revision.

								;//

								;//

								;//    Rev 1.13   22 Nov 1995 15:32:42   DBRUCKS

								;// Brian made this change on my system.

								;// Increased a value to simplify debugging

								;//

								;//

								;//

								;//    Rev 1.12   17 Nov 1995 10:43:58   BNICKERS

								;// Fix problems with B-Frame ME.

								;//

								;//

								;//

								;//    Rev 1.11   31 Oct 1995 11:44:26   BNICKERS

								;// Save/restore ebx.

								;//

								;////////////////////////////////////////////////////////////////////////////

								;

								; MotionEstimation -- This function performs motion estimation for the macroblocks identified

								;                     in the input list.

								;                     Conditional assembly selects either the H263 or H261 version.

								;

								; Input Arguments:

								;

								;   MBlockActionStream

								;

								;     The list of macroblocks for which we need to perform motion estimation.

								;

								;     Upon input, the following fields must be defined:

								;

								;       CodedBlocks -- Bit 6 must be set for the last macroblock to be processed.

								;

								;       FirstMEState -- must be 0 for macroblocks that are forced to be Intracoded.  An

								;                       IntraSWD will be calculated.

								;                       Other macroblocks must have the following values:

								;                        1:  upper left, without advanced prediction.  (Advanced prediction

								;                            only applies to H263.)

								;                        2:  upper edge, without advanced prediction.

								;                        3:  upper right, without advanced prediction.

								;                        4:  left edge, without advanced prediction.

								;                        5:  central block, or any block if advanced prediction is being done.

								;                        6:  right edge, without advanced prediction.

								;                        7:  lower left, without advanced prediction.

								;                        8:  lower edge, without advanced prediction.

								;                        9:  lower right, without advanced prediction.

								;                       If vertical motion is NOT allowed:

								;                       10:  left edge, without advanced prediction.

								;                       11:  central block, or any block if advanced prediction is being done.

								;                       12:  right edge, without advanced prediction.

								;                       *** Note that with advanced prediction, only initial states 0, 4, or

								;                           11 can be specified.  Doing block level motion vectors mandates

								;                           advanced prediction, but in that case, only initial

								;                           states 0 and 4 are allowed.

								;

								;       BlkOffset -- must be defined for each of the blocks in the macroblocks.

								;

								;   TargetFrameBaseAddress -- Address of upper left viewable pel in the target Y plane.

								;

								;   PreviousFrameBaseAddress -- Address of upper left viewable pel in the previous Y plane.  Whether this is the

								;                               reconstructed previous frame, or the original, is up to the caller to decide.

								;

								;   FilteredFrameBaseAddress -- Address of upper left viewable pel in the scratch area that this function can record

								;                               the spatially filtered prediction for each block, so that frame differencing can

								;                               utilize it rather than have to recompute it.  (H261 only)

								;

								;   DoRadius15Search -- TRUE if central macroblocks should search a distance of 15 from center.  Else searches 7 out.

								;

								;   DoHalfPelEstimation -- TRUE if we should do ME to half pel resolution.  This is only applicable for H263 and must

								;                          be FALSE for H261.  (Note:  TRUE must be 1;  FALSE must be 0).

								;

								;   DoBlockLevelVectors -- TRUE if we should do ME at block level.  This is only applicable for H263 and must be FALSE

								;                          for H261.  (Note:  TRUE must be 1; FALSE must be 0).

								;   DoSpatialFiltering -- TRUE if we should determine if spatially filtering the prediction reduces the SWD.  Only

								;                         applicable for H261 and must be FALSE for H263.  (Note:  TRUE must be 1;  FALSE must be 0).

								;

								;   ZeroVectorThreshold -- If the SWD for a macroblock is less than this threshold, we do not bother searching for a

								;                          better motion vector.  Compute as follows, where D is the average tolerable pel difference

								;                          to satisfy this threshold.  (Initial recommendation:  D=2  ==> ZVT=384)

								;                             ZVT = (128 * ((int)((D**1.6)+.5)))

								;

								;   NonZeroDifferential -- After searching for the best motion vector (or individual block motion vectors, if enabled),

								;                          if the macroblock's SWD is not better than it was for the zero vector -- not better by at

								;                          least this amount -- then we revert to the zero vector.  We are comparing two macroblock

								;                          SWDs, both calculated as follows:   (Initial recommendation:	 NZD=128)

								;                            For each of 128 match points, where D is its Abs Diff, accumulate ((int)(M**1.6)+.5)))

								;

								;   BlockMVDifferential -- The amount by which the sum of four block level SWDs must be better than a single macroblock

								;                          level SWD to cause us to choose block level motion vectors.  See NonZeroDifferential for

								;                          how the SWDs are calculated.  Only applicable for H261.  (Initial recommendation:  BMVD=128)

								;

								;   EmptyThreshold -- If the SWD for a block is less than this, the block is forced empty.  Compute as follows, where D

								;                     is the average tolerable pel diff to satisfy threshold.  (Initial recommendation:  D=3 ==> ET=96)

								;                        ET = (32 * ((int)((D**1.6)+.5)))

								;

								;   InterCodingThreshold -- If any of the blocks are forced empty, we can simply skip calculating the INTRASWD for the

								;                           macroblock.  If none of the blocks are forced empty, we will compare the macroblock's SWD

								;                           against this threshold.  If below the threshold, we will likewise skip calculating the

								;                           INTRASWD.  Otherwise, we will calculate the INTRASWD, and if it is less than the [Inter]SWD,

								;                           we will classify the block as INTRA-coded.  Compute as follows, where D is the average

								;                           tolerable pel difference to satisfy threshold.  (Initial recommendation:  D=4 ==> ICT=1152)

								;                             ICT = (128 * ((int)((D**1.6)+.5)))

								;

								;   IntraCodingDifferential -- For INTRA coding to occur, the INTRASWD must be better than the INTERSWD by at least

								;                              this amount.

								;

								; Output Arguments

								;

								;   MBlockActionStream

								;

								;     These fields are defined as follows upon return:

								;

								;       BlockType -- Set to INTRA, INTER1MV, or (H263 only) INTER4MV.

								;

								;       PHMV and PVMV -- The horizontal and vertical motion vectors,  in units of a half pel.

								;

								;       BHMV and BVMV -- These fields get clobbered.

								;

								;       PastRef -- If BlockType != INTRA, set to the address of the reference block.

								;

								;                  If Horizontal MV indicates a half pel position, the prediction for the upper left pel of the block

								;                  is the average of the pel at PastRef and the one at PastRef+1.

								;

								;                  If Vertical MV indicates a half pel position, the prediction for the upper left pel of the block

								;                  is the average of the pel at PastRef and the one at PastRef+PITCH.

								;

								;                  If both MVs indicate half pel positions, the prediction for the upper left pel of the block is the

								;                  average of the pels at PastRef, PastRef+1, PastRef+PITCH, and PastRef+PITCH+1.

								;

								;                  Indications of a half pel position can only happen for H263.

								;

								;                  In H261, when spatial filtering is done, the address will be in the SpatiallyFilteredFrame, where

								;                  this function stashes the spatially filtered prediction for subsequent reuse by frame differencing.

								;

								;       CodedBlocks -- Bits 4 and 5 are turned on, indicating that the U and V blocks should be processed.  (If the

								;                      FDCT function finds them to quantize to empty, it will mark them as empty.)

								;

								;                      Bits 0 thru 3 are cleared for each of blocks 1 thru 4 that MotionEstimation forces empty;

								;                      they are set otherwise.

								;

								;                      Bits 6 and 7 are left unchanged.

								;

								;       SWD -- Set to the sum of the SWDs for the four luma blocks in the macroblock.  The SWD for any block that is

								;              forced empty, is NOT included in the sum.

								;

								;

								;

								;   IntraSWDTotal  -- The sum of the block SWDs for all Intracoded macroblocks.

								;

								;   IntraSWDBlocks -- The number of blocks that make up the IntraSWDTotal.

								;

								;   InterSWDTotal  -- The sum of the block SWDs for all Intercoded macroblocks.

								;                     None of the blocks forced empty are included in this.

								;

								;   InterSWDBlocks -- The number of blocks that make up the InterSWDTotal.

								;

								;

								; Other assumptions:

								;

								;   For performance reasons, it is assumed that the layout of current and previous frames (and spatially filtered

								;   frame for H261) rigourously conforms to the following guide.

								;

								;   The spatially filtered frame (only present and applicable for H261) is an output frame into which MotionEstimation

								;   places spatially filtered macroblocks as it determines if filtering is good for a macroblock.  If it determines

								;   such, frame differencing will be able to re-use the spatially filtered macroblock, rather than recomputing it.

								;

								;   Cache

								;   Alignment

								;   Points:  v       v       v       v       v       v       v       v       v       v       v       v       v

								;             16 | 352 (narrower pictures are left justified)                                            | 16

								;            +---+---------------------------------------------------------------------------------------+---+

								;            | D |  Current Frame Y Plane                                                                | D |

								;            | u |                                                                                       | u |

								;   Frame    | m |                                                                                       | m |

								;   Height   | m |                                                                                       | m |

								;   Lines    | y |                                                                                       | y |

								;            |   |                                                                                       |   |

								;            +---+---------------------------------------------------------------------------------------+---+

								;            |                                                                                               |

								;            |                                                                                               |

								;            |                                                                                               |

								;   24 lines |      Dummy Space (24 lines plus 8 bytes.  Can be reduced to 8 bytes if unrestricted motion    |

								;            |      vectors is NOT selected.)                                                                |

								;            |                                                                                               |

								;            |  8  176                                        16   176                                       |8

								;            | +-+-------------------------------------------------------------------------------------------+-+

								;            +-+D|  Current Frame U Plane                    | D |  Current Frame V Plane                    |D|

								;   Frame      |u|                                           | u |                                           |u|

								;   Height     |m|                                           | m |                                           |m|

								;   Div By 2   |m|                                           | m |                                           |m|

								;   Lines      |y|                                           | y |                                           |y|

								;              +-+-------------------------------------------+---+-------------------------------------------+-+

								;            72 dummy bytes.  I.e. enough dummy space to assure that MOD ((Previous_Frame - Current_Frame), 128) == 80

								;            +-----------------------------------------------------------------------------------------------+

								;            |                                                                                               |

								;   16 lines | If Unrestricted Motion Vectors selected, 16 lines must appear above and below previous frame, |

								;            | and these lines plus the 16 columns to the left and 16 columns to the right of the previous   |

								;            | frame must be initialized to the values at the edges and corners, propagated outward.  If     |

								;            | Unrestricted Motion Vectors is off, these lines don't have to be allocated.                   |

								;            |                                                                                               |

								;            |   +---------------------------------------------------------------------------------------+   +

								;   Frame    |   |  Previous Frame Y Plane                                                               |   |

								;   Height   |   |                                                                                       |   |

								;   Lines    |   |                                                                                       |   |

								;            |   |                                                                                       |   |

								;            |   |                                                                                       |   |

								;            |   +---------------------------------------------------------------------------------------+   +

								;            |                                                                                               |

								;   16 lines | See comment above Previous Y Plane                                                            |

								;            |                                                                                               |

								;            |+--- 8 bytes of dummy space.  Must be there, whether unrestricted MV or not.                   |

								;            ||                                                                                              |

								;            |v+-----------------------------------------------+---------------------------------------------+-+

								;            +-+                                               |                                               |

								;              | See comment above Previous Y Plane.           | See comment above Previous Y Plane.           |

								;   8 lines    | Same idea here, but 8 lines are needed above  | Same idea here, but 8 lines are needed        |

								;              | and below U plane, and 8 columns on each side.| and below V plane, and 8 columns on each side.|

								;              |                                               |                                               |

								;              |8  176                                        8|8  176                                        8|

								;              | +-------------------------------------------+ | +-------------------------------------------+ |

								;              | |  Previous Frame U Plane                   | | |  Previous Frame V Plane                   | |

								;   Frame      | |                                           | | |                                           | |

								;   Height     | |                                           | | |                                           | |

								;   Div By 2   | |                                           | | |                                           | |

								;   Lines      | |                                           | | |                                           | |

								;              | +-------------------------------------------+ | +-------------------------------------------+ |

								;              |                                               |                                               |

								;   8 lines    | See comment above Previous U Plane            | See comment above Previous V Plane            |

								;              |                                               |                                               |

								;              |                                               |                                               |

								;              |                                               |                                               |

								;              +-----------------------------------------------+---------------------------------------------+-+

								;            Enough dummy space to assure that MOD ((Spatial_Frame - Previous_Frame), 4096) == 2032

								;            +---+---------------------------------------------------------------------------------------+---+

								;            | D |  Spatially Filtered Y Plane (present only for H261)                                   | D |

								;            | u |                                                                                       | u |

								;   Frame    | m |                                                                                       | m |

								;   Height   | m |                                                                                       | m |

								;   Lines    | y |                                                                                       | y |

								;            |   |                                                                                       |   |

								;            +---+---------------------------------------------------------------------------------------+---+

								;            |                                                                                               |

								;            |                                                                                               |

								;            |                                                                                               |

								;   24 lines |      Dummy Space (24 lines plus 8 bytes.  Can be reduced to 8 bytes if unrestricted motion    |

								;            |      vectors is NOT selected, which is certainly the case for H261.)                          |

								;            |                                                                                               |

								;            |  8  176                                        16   176                                       |8

								;            | +-+-------------------------------------------------------------------------------------------+-+

								;            +-+D|  Spatially Filtered U plane (H261 only)   | D |  Spatially Filtered V plane (H261 only)   |D|

								;   Frame      |u|                                           | u |                                           |u|

								;   Height     |m|                                           | m |                                           |m|

								;   Div By 2   |m|                                           | m |                                           |m|

								;   Lines      |y|                                           | y |                                           |y|

								;              +-+-------------------------------------------+---+-------------------------------------------+-+

								;

								; Cache layout of the target block and the full range for the reference area (as restricted to +/- 7 in vertical,

								; and +/- 7 (expandable to +/- 15) in horizontal, is as shown here.  Each box represents a cache line (32 bytes),

								; increasing incrementally from left to right, and then to the next row (like reading a book).  The 128 boxes taken

								; as a whole represent 4Kbytes.  The boxes are populated as follows:

								;

								;   R -- Data from the reference area.  Each box contains 23 of the pels belonging to a line of the reference area.

								;        The remaining 7 pels of the line is either in the box to the left (for reference areas used to provide

								;        predictions for target macroblocks that begin at an address 0-mod-32), or to the right (for target MBs that

								;        begin at an address 16-mod-32).  There are 30 R's corresponding to the 30-line limit on the vertical distance

								;        we might search.

								;

								;   T -- Data from the target macroblock.  Each box contains a full line (16 pels) for each of two adjacent

								;        macroblocks.  There are 16 T's corresponding to the 16 lines of the macroblocks.

								;

								;   S -- Space for the spatially filtered macroblock (H261 only).

								;

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | T |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | T |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | T |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | T |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | T |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | S |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | S |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | S |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | S |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | S |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+---+---+---+---+

								;      | T |   | R |   | S |   | R |   |

								;      +---+---+---+---+---+---+---+---+

								;

								; Thus, in a logical sense, the above data fits into one of the 4K data cache pages, leaving the other for all other

								; data.  Care has been taken to assure that the tables and the stack space needed by this function fit nicely into

								; the other data cache page.    Only the MBlockActionStream remains to conflict with the above data structures.  That

								; is both unavoidable, and of minimal consequence.

								; An algorithm has been selected that calculates fewer SWDs (Sum of Weighted Differences) than the typical log search.

								; In the typical log search, a three level search is done, in which the SWDs are compared for the center point and a

								; point at each 45 degrees, initially 4 pels away, then 2, then 1.  This requires a total of 25 SWDs for each

								; macroblock (except those near edges or corners).

								;

								; In this algorithm, six levels are performed, with each odd level being a horizontal search, and each even level being

								; a vertical search.  Each search compares the SWD for the center point with that of a point in each direction on the

								; applicable axis.  This requires 13 SWDs, and a lot simpler control structure.  Here is an example picture of a

								; search, in which "0" represents the initial center point (the 0,0 motion vector), "A", and "a" represent the first

								; search points, etc.  In this example, the "winner" of each level of the search proceeds as follows:  a, B, C, C, E, F,

								; arriving at a motion vector of -1 horizontal, 5 vertical.

								;

								;                ...............

								;                ...............

								;                ...............

								;                ...b...........

								;                ...............

								;                ...............

								;                ...............

								;                ...a...0...A...

								;                ...............

								;                .....d.........

								;                ......f........

								;                .c.BeCE........

								;                ......F........

								;                .....D.........

								;                ...............

								;

								;

								; A word about data cache performance.  Conceptually, the tables and local variables used by this function are placed

								; in memory such that they will fit in one 4K page of the on-chip data cache.  For the Pentium (tm) microprocessor,

								; this leaves the other 4K page for other purposes.  The other data structures consist of:

								;

								;   The current frame, from which we need to access the lines of the 16*16 macroblock.  Since cache lines are 32 bytes

								;   wide, the cache fill operations that fetch one target macroblock will serve to fetch the macroblock to the right,

								;   so an average of 8 cache lines are fetched for each macroblock.

								;

								;   The previous frame, from which we need to access a reference area of 30*30 pels.  For each macroblock for which we

								;   need to search for a motion vector, we will typically need to access no more than about 25 of these, but in general

								;   these lines span the 30 lines of the search area.  Since cache lines are 32 bytes wide, the cache fill operations

								;   that fetch reference data for one macroblock, will tend to fetch data that is useful as reference data for the

								;   macroblock to the right, so an average of about 15 (rounded up to be safe) cache lines are fetched for each

								;   macroblock.

								;

								;   The MBlockActionStream, which controls the searching (since we don't need to motion estimate blocks that are

								;   legislated to be intra) will disrupt cache behaviour of the other data structures, but not to a significant degree.

								;

								; By setting the pitch to a constant of 384, and by allocating the frames as described above, the one available 4K page

								; of data cache will be able to contain the 30 lines of the reference area, the 16 lines of the target area, and the

								; 16 lines of the spatially filtered area (H261 only) without any collisions.

								;

								;

								; Here is a flowchart of the major sections of this function:

								;

								; +-- Execute once for Y part of each macroblock that is NOT Intra By Decree --+

								; |                                                                            |

								; |   +---------------------------------------------------------------+        |

								; |   |  1) Compute average value for target match points.            |        |

								; |   |  2) Prepare match points in target MB for easier matching.    |        |

								; |   |  3) Compute the SWD for (0,0) motion vector.                  |        |

								; |   +---------------------------------------------------------------+        |

								; |                 |                                                          |

								; |                 v                                                          |

								; |           /---------------------------------\  Yes                         |

								; |          < 4) Is 0-motion SWD good enough?   >-------------------------+   |

								; |           \---------------------------------/                          |   |

								; |                                          |                             |   |

								; |                                          |No                           |   |

								; |                                          v                             |   |

								; |     +--- 5) While state engine has more motion vectors to check ---+   |   |

								; |     |                                                              |   |   |

								; |     |                                                              |   |   |

								; |     |   +---------------------------------------------------+      |   |   |

								; |     |   | 5) Compute SWDs for 2 ref MBs and pick best of 3. |----->|   |   |

								; |     |   +---------------------------------------------------+      |   |   |

								; |     |                                                              |   |   |

								; |     +--------------------------------------------------------------+   |   |

								; |                             |                                          |   |

								; |                             v                                          |   |

								; |                /-----------------------------------------\             |   |

								; |               <  6) Is best motion vector the 0-vector?   >            |   |

								; |                \-----------------------------------------/             |   |

								; |                   |              |                                     |   |

								; |                   |No            |Yes                                  |   |

								; |                   v              v                                     |   |

								; |     +-----------------+ +-------------------------------------------+  |   |

								; |     | Mark all blocks | |  6) Identify as empty block any in which: |<-+   |

								; |  +--| non-empty.      | |     -->  0-motion SWD < EmptyThresh, and  |      |

								; |  |  +-----------------+ +-------------------------------------------+      |

								; |  |                          |                                              |

								; |  |                          v                                              |

								; |  |   /--------------------------------\ Yes +--------------------------+   |

								; |  |  <  6) Are all blocks marked empty? >--->|  6) Classify FORCEDEMPTY |-->|

								; |  |   \--------------------------------/     +--------------------------+   |

								; |  |                |                                                        |

								; |  |                |No                                                      |

								; |  |                v                                                        |

								; |  |        /--------------------------------------------\                   |

								; |  |       <  7) Are any non-phantom blocks marked empty? >                  |

								; |  |        \--------------------------------------------/                   |

								; |  |                |            |                                           |

								; |  |                |No          |Yes                                        |

								; |  v                v            v                                           |

								; | +---------------------+   +--------------------------------+               |

								; | | 8) Compute IntraSWD |   | Set IntraSWD artificially high |               |

								; | +---------------------+   +--------------------------------+               |

								; |              |                 |                                           |

								; |              v                 v                                           |

								; |         +-------------------------------+                                  |

								; |         | 10) Classify block as one of: |                                  |

								; |         |       INTRA                   |--------------------------------->|

								; |         |       INTER                   |                                  |

								; |         +-------------------------------+                                  |

								; |                                                                            |

								; +----------------------------------------------------------------------------+

								;

								;


								OPTION PROLOGUE:None

								OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro

								OPTION M510


								include e3inst.inc

								include e3mbad.inc


								.xlist

								include memmodel.inc

								.list

								.DATA


								;  Storage for tables and temps used by Motion Estimation function.  Fit into

								;  4Kbytes contiguous memory so that it uses one cache page, leaving other

								;  for reference area of previous frame and target macroblock of current frame.


								PickPoint     DB  0,4,?,4,0,?,2,2 ; Map CF accum to new central pt selector.

								PickPoint_BLS DB  6,4,?,4,6,?,2,2 ; Same, for when doing block level search.


								OffsetToRef LABEL DWORD    ; Linearized adjustments to affect horz/vert motion.

								   DD  ?        ; This index used when zero-valued motion vector is good enough.

								   DD  0        ; Best fit of 3 SWDs is previous center.

								   DD  1        ; Best fit of 3 SWDs is the ref block 1 pel to the right.

								   DD -1        ; Best fit of 3 SWDs is the ref block 1 pel to the left.

								   DD  1*PITCH  ; Best fit of 3 SWDs is the ref block 1 pel above.

								   DD -1*PITCH  ; Best fit of 3 SWDs is the ref block 1 pel below.

								   DD  2        ; Best fit of 3 SWDs is the ref block 2 pels to the right.

								   DD -2        ; Best fit of 3 SWDs is the ref block 2 pels to the left.

								   DD  2*PITCH  ; Best fit of 3 SWDs is the ref block 2 pel above.

								   DD -2*PITCH  ; Best fit of 3 SWDs is the ref block 2 pel below.

								   DD  4        ; Best fit of 3 SWDs is the ref block 4 pels to the right.

								   DD -4        ; Best fit of 3 SWDs is the ref block 4 pels to the left.

								   DD  4*PITCH  ; Best fit of 3 SWDs is the ref block 4 pel above.

								   DD -4*PITCH  ; Best fit of 3 SWDs is the ref block 4 pel below.

								   DD  7        ; Best fit of 3 SWDs is the ref block 7 pels to the right.

								   DD -7        ; Best fit of 3 SWDs is the ref block 7 pels to the left.

								   DD  7*PITCH  ; Best fit of 3 SWDs is the ref block 7 pel above.

								   DD -7*PITCH  ; Best fit of 3 SWDs is the ref block 7 pel below.


								M0   =  4  ; Define symbolic indices into OffsetToRef lookup table.

								MHP1 =  8

								MHN1 = 12

								MVP1 = 16

								MVN1 = 20

								MHP2 = 24

								MHN2 = 28

								MVP2 = 32

								MVN2 = 36

								MHP4 = 40

								MHN4 = 44

								MVP4 = 48

								MVN4 = 52

								MHP7 = 56

								MHN7 = 60

								MVP7 = 64

								MVN7 = 68


								                ; Map linearized motion vector to vertical part.

								                ; (Mask bottom byte of linearized MV to zero, then use result

								                ; as index into this array to get vertical MV.)

								IF PITCH-384

								*** error:  The magic of this table assumes a pitch of 384.

								ENDIF

								   DB -32, -32

								   DB -30

								   DB -28, -28

								   DB -26

								   DB -24, -24

								   DB -22

								   DB -20, -20

								   DB -18

								   DB -16, -16

								   DB -14

								   DB -12, -12

								   DB -10

								   DB  -8,  -8

								   DB  -6

								   DB  -4,  -4

								   DB  -2

								   DB   0

								UnlinearizedVertMV  DB 0

								   DB   2

								   DB   4,   4

								   DB   6

								   DB   8,   8

								   DB  10

								   DB  12,  12

								   DB  14

								   DB  16,  16

								   DB  18

								   DB  20,  20

								   DB  22

								   DB  24,  24

								   DB  26

								   DB  28,  28

								   DB  30


								; Map initial states to initializers for half pel search.  Where search would

								; illegally take us off edge of picture, set initializer artificially high.


								InitHalfPelSearchHorz LABEL DWORD

								  DD 040000000H, 000000000H, 000004000H

								  DD 040000000H, 000000000H, 000004000H

								  DD 040000000H, 000000000H, 000004000H

								  DD 040000000H, 000000000H, 000004000H


								InitHalfPelSearchVert LABEL DWORD

								  DD 040000000H, 040000000H, 040000000H

								  DD 000000000H, 000000000H, 000000000H

								  DD 000004000H, 000004000H, 000004000H

								  DD 040004000H, 040004000H, 040004000H


								SWDState LABEL BYTE ; Rules that govern state engine of motion estimator.


								   DB   8 DUP (?)       ; 0:  not used.


								                        ; 1:  Upper Left Corner.  Explore 4 right and 4 down.

								   DB  21, M0           ; (0,0)

								   DB  22, MHP4         ; (0,4)

								   DB  23, MVP4, ?, ?   ; (4,0)


								                        ; 2:  Upper Edge.  Explore 4 left and 4 right.

								   DB  22, M0           ; (0, 0)

								   DB  22, MHN4         ; (0,-4)

								   DB  22, MHP4, ?, ?   ; (0, 4)


								                        ; 3:  Upper Right Corner.  Explore 4 right and 4 down.

								   DB  31, M0           ; (0, 0)

								   DB  22, MHN4         ; (0,-4)

								   DB  32, MVP4, ?, ?   ; (4, 0)


								                        ; 4:  Left Edge.  Explore 4 up and 4 down.

								   DB  23, M0           ; ( 0,0)

								   DB  23, MVN4         ; (-4,0)

								   DB  23, MVP4, ?, ?   ; ( 4,0)


								                        ; 5:  Interior Macroblock.  Explore 4 up and 4 down.

								   DB  37, M0           ; ( 0,0)

								   DB  37, MVN4         ; (-4,0)

								   DB  37, MVP4, ?, ?   ; ( 4,0)


								                        ; 6:  Right Edge.  Explore 4 up and 4 down.

								   DB  32, M0           ; ( 0,0)

								   DB  32, MVN4         ; (-4,0)

								   DB  32, MVP4, ?, ?   ; ( 4,0)


								                        ; 7:  Lower Left Corner.  Explore 4 up and 4 right.

								   DB  38, M0           ; ( 0,0)

								   DB  39, MHP4         ; ( 0,4)

								   DB  23, MVN4, ?, ?   ; (-4,0)


								                        ; 8:  Lower Edge.  Explore 4 left and 4 right.

								   DB  39, M0           ; (0, 0)

								   DB  39, MHN4         ; (0,-4)

								   DB  39, MHP4, ?, ?   ; (0, 4)


								                        ; 9:  Lower Right Corner.  Explore 4 up and 4 left.

								   DB  44, M0           ; ( 0, 0)

								   DB  39, MHN4         ; ( 0,-4)

								   DB  32, MVN4, ?, ?   ; (-4, 0)


								                        ; 10: Left Edge, No Vertical Motion Allowed.

								   DB  46, M0           ; (0,0)

								   DB  48, MHP2         ; (0,2)

								   DB  47, MHP4, ?, ?   ; (0,4)


								                        ; 11: Interior Macroblock, No Vertical Motion Allowed.

								   DB  47, M0           ; (0, 0)

								   DB  47, MHN4         ; (0,-4)

								   DB  47, MHP4, ?, ?   ; (0, 4)


								                        ; 12: Right Edge, No Vertical Motion Allowed.

								   DB  49, M0           ; (0, 0)

								   DB  48, MHN2         ; (0,-2)

								   DB  47, MHN4, ?, ?   ; (0,-4)


								                        ; 13: Horz by 2, Vert by 2, Horz by 1, Vert by 1.

								   DB  14, M0

								   DB  14, MHP2

								   DB  14, MHN2, ?, ?


								                        ; 14: Vert by 2, Horz by 1, Vert by 1.

								   DB  15, M0

								   DB  15, MVP2

								   DB  15, MVN2, ?, ?


								                        ; 15: Horz by 1, Vert by 1.

								   DB  16, M0

								   DB  16, MHP1

								   DB  16, MHN1, ?, ?


								                        ; 16: Vert by 1.

								   DB   0, M0

								   DB   0, MVP1

								   DB   0, MVN1, ?, ?


								                        ; 17: Vert by 2, Horz by 2, Vert by 1, Horz by 1.

								   DB  18, M0

								   DB  18, MVP2

								   DB  18, MVN2, ?, ?


								                        ; 18: Horz by 2, Vert by 1, Horz by 1.

								   DB  19, M0

								   DB  19, MHP2

								   DB  19, MHN2, ?, ?


								                        ; 19: Vert by 1, Horz by 1.

								   DB  20, M0

								   DB  20, MVP1

								   DB  20, MVN1, ?, ?


								                        ; 20: Horz by 1.

								   DB   0, M0

								   DB   0, MHP1

								   DB   0, MHN1, ?, ?


								                        ; 21: From 1A.  Upper Left.  Try 2 right and 2 down.

								   DB  24, M0           ; (0, 0)

								   DB  25, MHP2         ; (0, 2)

								   DB  26, MVP2, ?, ?   ; (2, 0)


								                        ; 22: From  1B.

								                        ;     From  2  center point would be (0,-4/0/4).

								                        ;     From  3B center point would be (0,-4).

								   DB  27, M0           ; (0, 4)

								   DB  18, MVP2         ; (2, 4) Next: Horz 2, Vert 1, Horz 1.         (1:3,1:7)

								   DB  13, MVP4, ?, ?   ; (4, 4) Next: Horz 2, Vert 2, Horz 1, Vert 1. (1:7,1:7)


								                        ; 23: From  1C.

								                        ;     From  4  center point would be (-4/0/4,0).

								                        ;     From  7C center point would be (-4,0).

								   DB  29, M0           ; (4, 0)

								   DB  14, MHP2         ; (4, 2) Next: Vert 2, Horz 1, Vert 1.         (1:7,1:3)

								   DB  17, MHP4, ?, ?   ; (4, 4) Next: Vert 2, Horz 2, Vert 1, Horz 1. (1:7,1:7)


								                        ; 24: From 21A.  Upper Left.  Try 1 right and 1 down.

								   DB   0, M0           ; (0, 0)

								   DB   0, MHP1         ; (1, 0)

								   DB   0, MVP1, ?, ?   ; (0, 1)


								                        ; 25: From 21B.

								                        ;     From 31B center point would be (0,-2).

								   DB  20, M0           ; (0, 2) Next: Horz 1                            (0,1:3)

								   DB  20, MVP1         ; (1, 2) Next: Horz 1                            (1,1:3)

								   DB  15, MVP2, ?, ?   ; (2, 2) Next: Horz 1, Vert 1                  (1:3,1:3)


								                        ; 26: From 21C.

								                        ;     From 38C center point would be (-2,0).

								   DB  16, M0           ; (2, 0) Next: Vert 1                            (1:3,0)

								   DB  16, MHP1         ; (2, 1) Next: Vert 1                            (1:3,1)

								   DB  19, MHP2, ?, ?   ; (2, 2) Next: Vert 1, Horz 1                  (1:3,1:3)


								                        ; 27: From 22A.

								   DB  28, M0           ; (0, 4)

								   DB  28, MHN2         ; (0, 2)

								   DB  28, MHP2, ?, ?   ; (0, 6)


								                        ; 28: From 27.

								   DB  20, M0           ; (0, 2/4/6) Next: Horz 1.                       (0,1:7)

								   DB  20, MVP1         ; (1, 2/4/6) Next: Horz 1.                       (1,1:7)

								   DB  20, MVP2, ?, ?   ; (2, 2/4/6) Next: Horz 1.                       (2,1:7)


								                        ; 29: From 23A.

								   DB  30, M0           ; (4, 0)

								   DB  30, MVN2         ; (2, 0)

								   DB  30, MVP2, ?, ?   ; (6, 0)


								                        ; 30: From 29.

								   DB  16, M0           ; (2/4/6, 0) Next: Vert 1.                       (1:7,0)

								   DB  16, MHP1         ; (2/4/6, 1) Next: Vert 1.                       (1:7,1)

								   DB  16, MHP2, ?, ?   ; (2/4/6, 2) Next: Vert 1.                       (1:7,2)


								                        ; 31: From  3A.  Upper Right.  Try 2 left and 2 down.

								   DB  33, M0           ; (0, 0)

								   DB  25, MHN2         ; (0,-2)

								   DB  34, MVP2, ?, ?   ; (2, 0)


								                        ; 32: From  3C.

								                        ;     From  6  center point would be (-4/0/4, 0)

								                        ;     From  9C center point would be (-4, 0)

								   DB  35, M0           ; (4, 0)

								   DB  14, MHN2         ; (4,-2) Next: Vert2,Horz1,Vert1.            (1:7,-1:-3)

								   DB  17, MHN4, ?, ?   ; (4,-4) Next: Vert2,Horz2,Vert1,Horz1.      (1:7,-1:-7)


								                        ; 33: From 31A.  Upper Right.  Try 1 left and 1 down.

								   DB   0, M0           ; (0, 0)

								   DB   0, MHN1         ; (0,-1)

								   DB   0, MVP1, ?, ?   ; (1, 0)


								                        ; 34: From 31C.

								                        ;     From 44C center point would be (-2, 0)

								   DB  16, M0           ; (2, 0) Next: Vert 1                           (1:3, 0)

								   DB  16, MHN1         ; (2,-1) Next: Vert 1                           (1:3,-1)

								   DB  19, MHN2, ?, ?   ; (2,-2) Next: Vert 1, Horz 1                (1:3,-1:-3)


								                        ; 35: From 32A.

								   DB  36, M0           ; (4, 0)

								   DB  36, MVN2         ; (2, 0)

								   DB  36, MVP2, ?, ?   ; (6, 0)


								                        ; 36: From 35.

								   DB  16, M0           ; (2/4/6, 0) Next: Vert 1.                      (1:7, 0)

								   DB  16, MHN1         ; (2/4/6,-1) Next: Vert 1.                      (1:7,-1)

								   DB  16, MHN2, ?, ?   ; (2/4/6,-2) Next: Vert 1.                      (1:7,-2)


								                        ; 37: From  5.

								   DB  17, M0           ; (-4/0/4, 0) Next: Vert2,Horz2,Vert1,Horz1 (-7:7,-3: 3)

								   DB  17, MHP4         ; (-4/0/4,-4) Next: Vert2,Horz2,Vert1,Horz1 (-7:7, 1: 7)

								   DB  17, MHN4, ?, ?   ; (-4/0/4, 4) Next: Vert2,Horz2,Vert1,Horz1 (-7:7,-7:-1)


								                        ; 38: From 7A.  Lower Left.  Try 2 right and 2 up.

								   DB  42, M0           ; ( 0,0)

								   DB  43, MHP2         ; ( 0,2)

								   DB  26, MVN2, ?, ?   ; (-2,0)


								                        ; 39: From 13B.

								                        ;     From 14  center point would be (0,-4/0/4)

								                        ;     From 16B center point would be (0,-4)

								   DB  40, M0           ; ( 0,4)

								   DB  18, MVN2         ; (-2,4) Next: Horz2,Vert1,Horz1.            (-3:-1,1:7)

								   DB  13, MVN4, ?, ?   ; (-4,4) Next: Horz2,Vert2,Horz1,Vert1.      (-7:-1,1:7)


								                        ; 40: From 39A.

								   DB  41, M0           ; (0, 4)

								   DB  41, MHN2         ; (0, 2)

								   DB  41, MHP2, ?, ?   ; (0, 6)


								                        ; 41: From 40.

								   DB  20, M0           ; ( 0,2/4/6) Next: Horz 1.                      ( 0,1:7)

								   DB  20, MVN1         ; (-1,2/4/6) Next: Horz 1.                      (-1,1:7)

								   DB  20, MVN2, ?, ?   ; (-2,2/4/6) Next: Horz 1.                      (-2,1:7)


								                        ; 42: From 38A.  Lower Left.  Try 1 right and 1 up.

								   DB   0, M0           ; ( 0,0)

								   DB   0, MHP1         ; ( 0,1)

								   DB   0, MVN1, ?, ?   ; (-1,0)


								                        ; 43: From 38B.

								                        ;     From 44B center point would be (0,-2)

								   DB  20, M0           ; ( 0,2) Next: Horz 1                           ( 0,1:3)

								   DB  20, MVN1         ; (-1,2) Next: Horz 1                           (-1,1:3)

								   DB  15, MVN2, ?, ?   ; (-2,2) Next: Horz 1, Vert 1                (-1:-3,1:3)


								                        ; 44: From 9A.  Lower Right.  Try 2 left and 2 up.

								   DB  45, M0           ; ( 0, 0)

								   DB  43, MHN2         ; ( 0,-2)

								   DB  34, MVN2, ?, ?   ; (-2, 0)


								                        ; 45: From 44A.  Lower Right.  Try 1 left and 1 up.

								   DB   0, M0           ; ( 0, 0)

								   DB   0, MHN1         ; ( 0,-1)

								   DB   0, MVN1, ?, ?   ; (-1, 0)


								                        ; 46: From 17A.

								   DB   0, M0           ; (0,0)

								   DB   0, MHP1         ; (0,1)

								   DB   0, MHP1, ?, ?   ; (0,1)


								                        ; 47: From 10C.

								                        ;     From 11 center point would be (0,4/0/-4)

								                        ;     From 12C center point would be (0,-4)

								   DB  48, M0           ; (0,4)

								   DB  48, MHN2         ; (0,2)

								   DB  48, MHP2, ?, ?   ; (0,6)


								                        ; 48 From 10B.

								                        ;    From 47  center point would be (0,2/4/6)

								                        ;    From 12B center point would be (0,-2)

								   DB   0, M0           ; (0,2)

								   DB   0, MHN1         ; (0,1)

								   DB   0, MHP1, ?, ?   ; (0,3)


								                        ; 49 From 12A.

								   DB   0, M0           ; (0, 0)

								   DB   0, MHN1         ; (0,-1)

								   DB   0, MHN1, ?, ?   ; (0,-1)


								                        ; 50:  Interior Macroblock.  Explore 7 up and 7 down.

								   DB  51, M0           ; ( 0,0)

								   DB  51, MVN7         ; (-7,0)

								   DB  51, MVP7, ?, ?   ; ( 7,0)


								                        ; 51:  Explore 7 left and 7 right.

								   DB   5, M0           ; (-7|0|7, 0)

								   DB   5, MHN7         ; (-7|0|7,-7)

								   DB   5, MHP7, ?, ?   ; (-7|0|7, 7)


								MulByNeg8 LABEL DWORD


								CNT = 0

								REPEAT 128

								  DD WeightedDiff+CNT

								  CNT = CNT - 8

								ENDM


								  ; The following treachery puts the numbers into byte 2 of each aligned DWORD.

								  DB   0,  0

								  DD 193 DUP (255)

								  DD 250,243,237,231,225,219,213,207,201,195,189,184,178,172,167,162,156

								  DD 151,146,141,135,130,126,121,116,111,107,102, 97, 93, 89, 84, 80, 76

								  DD  72, 68, 64, 61, 57, 53, 50, 46, 43, 40, 37, 34, 31, 28, 25, 22, 20

								  DD  18, 15, 13, 11,  9,  7,  6,  4,  3,  2,  1

								  DB   0,  0

								WeightedDiff LABEL DWORD

								  DB   0,  0

								  DD   0,  0,  1,  2,  3,  4,  6,  7,  9, 11, 13, 15, 18

								  DD  20, 22, 25, 28, 31, 34, 37, 40, 43, 46, 50, 53, 57, 61, 64, 68, 72

								  DD  76, 80, 84, 89, 93, 97,102,107,111,116,121,126,130,135,141,146,151

								  DD 156,162,167,172,178,184,189,195,201,207,213,219,225,231,237,243,250

								  DD 191 DUP (255)

								  DB 255,  0


								MotionOffsets DD 1*PITCH,0,?,?


								RemnantOfCacheLine DB 8 DUP (?)


								LocalStorage LABEL DWORD  ; Local storage goes on the stack at addresses

								                          ; whose lower 12 bits match this address.


								.CODE


								ASSUME cs : FLAT

								ASSUME ds : FLAT

								ASSUME es : FLAT

								ASSUME fs : FLAT

								ASSUME gs : FLAT

								ASSUME ss : FLAT


								MOTIONESTIMATION proc C AMBAS: DWORD,

								ATargFrmBase: DWORD,

								APrevFrmBase: DWORD,

								AFiltFrmBase: DWORD,

								ADo15Search: DWORD,

								ADoHalfPelEst: DWORD,

								ADoBlkLvlVec: DWORD,

								ADoSpatialFilt: DWORD,

								AZeroVectorThresh: DWORD,

								ANonZeroMVDiff: DWORD,

								ABlockMVDiff: DWORD,

								AEmptyThresh: DWORD,

								AInterCodThresh: DWORD,

								AIntraCodDiff: DWORD,

								ASpatialFiltThresh: DWORD,

								ASpatialFiltDiff: DWORD,

								AIntraSWDTot: DWORD,

								AIntraSWDBlks: DWORD,

								AInterSWDTot: DWORD,

								AInterSWDBlks: DWORD


								LocalFrameSize = 128 + 168*4 + 32   ; 128 for locals;  168*4 for blocks;  32 for dummy block.

								RegStoSize = 16


								; Arguments:


								MBlockActionStream_arg       = RegStoSize +  4

								TargetFrameBaseAddress_arg   = RegStoSize +  8

								PreviousFrameBaseAddress_arg = RegStoSize + 12

								FilteredFrameBaseAddress_arg = RegStoSize + 16

								DoRadius15Search_arg         = RegStoSize + 20

								DoHalfPelEstimation_arg      = RegStoSize + 24

								DoBlockLevelVectors_arg      = RegStoSize + 28

								DoSpatialFiltering_arg       = RegStoSize + 32

								ZeroVectorThreshold_arg      = RegStoSize + 36

								NonZeroMVDifferential_arg    = RegStoSize + 40

								BlockMVDifferential_arg      = RegStoSize + 44

								EmptyThreshold_arg           = RegStoSize + 48

								InterCodingThreshold_arg     = RegStoSize + 52

								IntraCodingDifferential_arg  = RegStoSize + 56

								SpatialFiltThreshold_arg     = RegStoSize + 60

								SpatialFiltDifferential_arg  = RegStoSize + 64

								IntraSWDTotal_arg            = RegStoSize + 68

								IntraSWDBlocks_arg           = RegStoSize + 72

								InterSWDTotal_arg            = RegStoSize + 76

								InterSWDBlocks_arg           = RegStoSize + 80

								EndOfArgList                 = RegStoSize + 84


								; Locals (on local stack frame)


								MBlockActionStream       EQU [esp+   0]

								CurrSWDState             EQU [esp+   4]

								MotionOffsetsCursor      EQU CurrSWDState

								HalfPelHorzSavings       EQU CurrSWDState

								VertFilterDoneAddr       EQU CurrSWDState

								IntraSWDTotal            EQU [esp+   8]

								IntraSWDBlocks           EQU [esp+  12]

								InterSWDTotal            EQU [esp+  16]

								InterSWDBlocks           EQU [esp+  20]


								MBCentralInterSWD        EQU [esp+  24]

								MBRef1InterSWD           EQU [esp+  28]

								MBRef2InterSWD           EQU [esp+  32]

								MBCentralInterSWD_BLS    EQU [esp+  36]

								MB0MVInterSWD            EQU [esp+  40]

								MBAddrCentralPoint       EQU [esp+  44]

								MBMotionVectors          EQU [esp+  48]


								DoHalfPelEstimation      EQU [esp+  52]

								DoBlockLevelVectors      EQU [esp+  56]

								DoSpatialFiltering       EQU [esp+  60]

								ZeroVectorThreshold      EQU [esp+  64]

								NonZeroMVDifferential    EQU [esp+  68]

								BlockMVDifferential      EQU [esp+  72]

								EmptyThreshold           EQU [esp+  76]

								InterCodingThreshold     EQU [esp+  80]

								IntraCodingDifferential  EQU [esp+  84]

								SpatialFiltThreshold     EQU [esp+  88]

								SpatialFiltDifferential  EQU [esp+  92]

								TargetMBAddr             EQU [esp+  96]

								TargetFrameBaseAddress   EQU [esp+ 100]

								PreviousFrameBaseAddress EQU [esp+ 104]

								TargToRef                EQU [esp+ 108]

								TargToSLF                EQU [esp+ 112]

								DoRadius15Search         EQU [esp+ 116]


								StashESP                 EQU [esp+ 120]


								BlockLen                 EQU 168

								Block1                   EQU [esp+  128+40]      ; "128" is for locals.  "40" is so offsets range from -40 to 124.

								Block2                   EQU Block1  + BlockLen

								Block3                   EQU Block2  + BlockLen

								Block4                   EQU Block3  + BlockLen

								BlockN                   EQU Block4  + BlockLen

								BlockNM1                 EQU Block4

								BlockNM2                 EQU Block3

								BlockNP1                 EQU Block4  + BlockLen + BlockLen

								DummyBlock               EQU Block4  + BlockLen


								Ref1Addr                 EQU  -40

								Ref2Addr                 EQU  -36

								AddrCentralPoint         EQU  -32

								CentralInterSWD          EQU  -28

								Ref1InterSWD             EQU  -24

								Ref2InterSWD             EQU  -20

								CentralInterSWD_BLS      EQU  -16  ; CentralInterSWD, when doing blk level search.

								CentralInterSWD_SLF      EQU  -16  ; CentralInterSWD, when doing spatial filter.

								HalfPelSavings           EQU  Ref2Addr

								ZeroMVInterSWD           EQU  -12

								BlkHMV                   EQU   -8

								BlkVMV                   EQU   -7

								BlkMVs                   EQU   -8

								AccumTargetPels          EQU   -4


								; Offsets for Negated Quadrupled Target Pels:

								N8T00                    EQU    0

								N8T04                    EQU    4

								N8T02                    EQU    8

								N8T06                    EQU   12

								N8T20                    EQU   16

								N8T24                    EQU   20

								N8T22                    EQU   24

								N8T26                    EQU   28

								N8T40                    EQU   32

								N8T44                    EQU   36

								N8T42                    EQU   40

								N8T46                    EQU   44

								N8T60                    EQU   48

								N8T64                    EQU   52

								N8T62                    EQU   56

								N8T66                    EQU   60

								N8T11                    EQU   64

								N8T15                    EQU   68

								N8T13                    EQU   72

								N8T17                    EQU   76

								N8T31                    EQU   80

								N8T35                    EQU   84

								N8T33                    EQU   88

								N8T37                    EQU   92

								N8T51                    EQU   96

								N8T55                    EQU  100

								N8T53                    EQU  104

								N8T57                    EQU  108

								N8T71                    EQU  112

								N8T75                    EQU  116

								N8T73                    EQU  120

								N8T77                    EQU  124


								  push  esi

								  push  edi

								  push  ebp

								  push  ebx


								; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.


								  mov   esi,esp

								   sub  esp,000001000H

								  mov   eax,[esp]                   ; Cause system to commit page.

								   sub  esp,000001000H

								  and   esp,0FFFFF000H

								   mov  ebx,OFFSET LocalStorage+31

								  and   ebx,000000FE0H

								   mov  edx,PD [esi+MBlockActionStream_arg]

								  or    esp,ebx

								   mov  eax,PD [esi+TargetFrameBaseAddress_arg]

								  mov   TargetFrameBaseAddress,eax

								   mov  ebx,PD [esi+PreviousFrameBaseAddress_arg]

								  mov   PreviousFrameBaseAddress,ebx

								   sub  ebx,eax

								  mov   ecx,PD [esi+FilteredFrameBaseAddress_arg]

								  sub   ecx,eax

								   mov  TargToRef,ebx

								  mov   TargToSLF,ecx

								   mov  eax,PD [esi+EmptyThreshold_arg]

								  mov   EmptyThreshold,eax

								   mov  eax,PD [esi+DoHalfPelEstimation_arg]

								  mov   DoHalfPelEstimation,eax

								   mov  eax,PD [esi+DoBlockLevelVectors_arg]

								  mov   DoBlockLevelVectors,eax

								   mov  eax,PD [esi+DoRadius15Search_arg]

								  mov   DoRadius15Search,eax

								   mov  eax,PD [esi+DoSpatialFiltering_arg]

								  mov   DoSpatialFiltering,eax

								   mov  eax,PD [esi+ZeroVectorThreshold_arg]

								  mov   ZeroVectorThreshold,eax

								   mov  eax,PD [esi+NonZeroMVDifferential_arg]

								  mov   NonZeroMVDifferential,eax

								   mov  eax,PD [esi+BlockMVDifferential_arg]

								  mov   BlockMVDifferential,eax

								   mov  eax,PD [esi+InterCodingThreshold_arg]

								  mov   InterCodingThreshold,eax

								   mov  eax,PD [esi+IntraCodingDifferential_arg]

								  mov   IntraCodingDifferential,eax

								   mov  eax,PD [esi+SpatialFiltThreshold_arg]

								  mov   SpatialFiltThreshold,eax

								   mov  eax,PD [esi+SpatialFiltDifferential_arg]

								  mov   SpatialFiltDifferential,eax

								   xor  ebx,ebx

								  mov   IntraSWDBlocks,ebx

								   mov  InterSWDBlocks,ebx

								  mov   IntraSWDTotal,ebx

								   mov  InterSWDTotal,ebx

								  mov   Block1.BlkMVs,ebx

								   mov  Block2.BlkMVs,ebx

								  mov   Block3.BlkMVs,ebx

								   mov  Block4.BlkMVs,ebx

								  mov   DummyBlock.Ref1Addr,esp

								   mov  DummyBlock.Ref2Addr,esp

								  mov   StashESP,esi

								   jmp  FirstMacroBlock


								;  Activity Details for this section of code  (refer to flow diagram above):

								;

								;     1)  To calculate an average value for the target match points of each

								;         block, we sum the 32 match points.  The totals for each of the 4

								;         blocks is output seperately.

								;

								;     2)  Define each prepared match point in the target macroblock as the

								;         real match point times negative 8, with the base address of the

								;         WeightedDiff lookup table added.  I.e.

								;

								;           for (i = 0; i < 16; i += 2)

								;             for (j = 0; j < 16; j += 2)

								;               N8T[i][j] = ( -8 * Target[i][j]) + ((U32) WeightedDiff);

								;

								;         Both the multiply and the add of the WeightedDiff array base are

								;         effected by a table lookup into the array MulByNeg8.

								;

								;         Then the SWD of a reference macroblock can be calculated as follows:

								;

								;           SWD = 0;

								;           for each match point (i,j)

								;               SWD += *((U32 *) (N8T[i][j] + 8 * Ref[i][j]));

								;

								;         In assembly, the fetch of WeightedDiff array element amounts to this:

								;

								;           mov edi,DWORD PTR N8T[i][j]   ; Fetch N8T[i][j]

								;           mov dl,BYTE PTR Ref[i][j]     ; Fetch Ref[i][j]

								;           mov edi,DWORD PTR[edi+edx*8]  ; Fetch WeithtedDiff of target & ref.

								;

								;     3)  We calculate the 0-motion SWD, as described just above.  We use 32

								;         match points per block, and write the result seperately for each

								;         block.  The result is accumulated into the high half of ebp.

								;

								;     4)  If the SWD for the 0-motion vector is below a threshold, we don't

								;         bother searching for other possibly better motion vectors.  Presently,

								;         this threshold is set such that an average difference of less than

								;         three per match point causes the 0-motion vector to be accepted.

								;

								; Register usage for this section:

								;

								;   Input of this section:

								;

								;     edx -- MBlockActionStream

								;

								;   Predominate usage for body of this section:

								;

								;     esi -- Target block address.

								;     edi -- 0-motion reference block address.

								;     ebp[ 0:12] -- Accumulator for target pels.

								;     ebp[13:15] -- Loop control

								;     ebp[16:31] -- Accumulator for weighted diff between target and 0-MV ref.

								;     edx -- Address at which to store -8 times pels.

								;     ecx -- A reference pel.

								;     ebx -- A target pel.

								;     eax -- A target pel times -8;  and a weighted difference.

								;

								; Expected Pentium (tm) microprocessor performance for section:

								;

								;   Executed once per macroblock.

								;

								;   520 clocks for instruction execution

								;     8 clocks for bank conflicts (64 dual mem ops with 1/8 chance of conflict)

								;    80 clocks generously estimated for an average of 8 cache line fills for

								;       the target macroblock and 8 cache line fills for the reference area.

								;  ----

								;   608 clocks total time for this section.

								;


								NextMacroBlock:


								  mov   bl,[edx].CodedBlocks

								   add  edx,SIZEOF T_MacroBlockActionDescr

								  and   ebx,000000040H                ; Check for end-of-stream

								   jne  Done


								FirstMacroBlock:


								  mov   cl,[edx].CodedBlocks          ; Init CBP for macroblock.

								   mov  ebp,TargetFrameBaseAddress

								  mov   bl,[edx].FirstMEState         ; First State

								   mov  eax,DoRadius15Search          ; Searching 15 full pels out, or just 7?

								  neg   al                            ; doing blk lvl => al=0, not => al=-1

								  or    cl,03FH                       ; Indicate all 6 blocks are coded.

								   and  al,bl

								  mov   esi,[edx].BlkY1.BlkOffset     ; Get address of next macroblock to do.

								   cmp  al,5

								  jne   @f

								  mov   bl,50                         ; Cause us to search +/- 15 if central

								  ;                                   ; block and willing to go that far.

								@@:

								   mov  edi,TargToRef

								  add   esi,ebp

								   mov  CurrSWDState,ebx              ; Stash First State Number as current.

								  add   edi,esi

								   xor  ebp,ebp

								  mov   TargetMBAddr,esi              ; Stash address of target macroblock.

								   mov  MBlockActionStream,edx        ; Stash list ptr.

								  mov   [edx].CodedBlocks,cl

								   mov  ecx,INTER1MV                  ; Speculate INTER-coding, 1 motion vector.

								  mov   [edx].BlockType,cl

								   lea  edx,Block1


								PrepMatchPointsNextBlock:


								  mov   bl,PB [esi+6]                 ; 06A -- Target Pel 00.

								  add   ebp,ebx                       ; 06B -- Accumulate target pels.

								   mov  cl,PB [edi+6]                 ; 06C -- Reference Pel 00.

								  mov   eax,MulByNeg8[ebx*4]          ; 06D -- Target Pel 00 * -8.

								   mov  bl,PB [esi+4]                 ; 04A

								  mov   [edx].N8T06,eax               ; 06E -- Store negated quadrupled Pel 00.

								   add  ebp,ebx                       ; 04B

								  mov   eax,PD [eax+ecx*8]            ; 06F -- Weighted difference for Pel 00.

								   mov  cl,PB [edi+4]                 ; 04C

								  add   ebp,eax                       ; 06G -- Accumulate weighted difference.

								   mov  eax,MulByNeg8[ebx*4]          ; 04D

								  mov   bl,PB [esi+2]                 ; 02A

								   mov  [edx].N8T04,eax               ; 04E

								  add   ebp,ebx                       ; 02B

								   mov  eax,PD [eax+ecx*8]            ; 04F

								  mov   cl,PB [edi+2]                 ; 02C

								   add  ebp,eax                       ; 04G

								  mov   eax,MulByNeg8[ebx*4]          ; 02D

								   mov  bl,PB [esi]                   ; 00A

								  mov   [edx].N8T02,eax               ; 02E

								   add  ebp,ebx                       ; 00B

								  mov   eax,PD [eax+ecx*8]            ; 02F

								   add  esi,PITCH+1

								  mov   cl,PB [edi]                   ; 00C

								   add  edi,PITCH+1

								  lea   ebp,[ebp+eax+000004000H]      ; 02G  (plus loop control)

								   mov  eax,MulByNeg8[ebx*4]          ; 00D

								  mov   bl,PB [esi+6]                 ; 17A

								   mov  [edx].N8T00,eax               ; 00E

								  add   ebp,ebx                       ; 17B

								   mov  eax,PD [eax+ecx*8]            ; 00F

								  mov   cl,PB [edi+6]                 ; 17C

								   add  ebp,eax                       ; 00G

								  mov   eax,MulByNeg8[ebx*4]          ; 17D

								   mov  bl,PB [esi+4]                 ; 15A

								  mov   [edx].N8T17,eax               ; 17E

								   add  ebp,ebx                       ; 15B

								  mov   eax,PD [eax+ecx*8]            ; 17F

								   mov  cl,PB [edi+4]                 ; 15C

								  add   ebp,eax                       ; 17G

								   mov  eax,MulByNeg8[ebx*4]          ; 15D

								  mov   bl,PB [esi+2]                 ; 13A

								   mov  [edx].N8T15,eax               ; 15E

								  add   ebp,ebx                       ; 13B

								   mov  eax,PD [eax+ecx*8]            ; 15F

								  mov   cl,PB [edi+2]                 ; 13C

								   add  ebp,eax                       ; 15G

								  mov   eax,MulByNeg8[ebx*4]          ; 13D

								   mov  bl,PB [esi]                   ; 11A

								  mov   [edx].N8T13,eax               ; 13E

								   add  ebp,ebx                       ; 11B

								  mov   eax,PD [eax+ecx*8]            ; 13F

								   add  esi,PITCH-1

								  mov   cl,PB [edi]                   ; 11C

								   add  edi,PITCH-1

								  add   ebp,eax                       ; 13G

								   mov  eax,MulByNeg8[ebx*4]          ; 11D

								  mov   bl,PB [esi+6]                 ; 26A

								   mov  [edx].N8T11,eax               ; 11E

								  add   ebp,ebx                       ; 26B

								   mov  eax,PD [eax+ecx*8]            ; 11F

								  mov   cl,PB [edi+6]                 ; 26C

								   add  ebp,eax                       ; 11G

								  mov   eax,MulByNeg8[ebx*4]          ; 26D

								   mov  bl,PB [esi+4]                 ; 24A

								  mov   [edx].N8T26,eax               ; 26E

								   add  ebp,ebx                       ; 24B

								  mov   eax,PD [eax+ecx*8]            ; 26F

								   mov  cl,PB [edi+4]                 ; 24C

								  add   ebp,eax                       ; 26G

								   mov  eax,MulByNeg8[ebx*4]          ; 24D

								  mov   bl,PB [esi+2]                 ; 22A

								   mov  [edx].N8T24,eax               ; 24E

								  add   ebp,ebx                       ; 22B

								   mov  eax,PD [eax+ecx*8]            ; 24F

								  mov   cl,PB [edi+2]                 ; 22C

								   add  ebp,eax                       ; 24G

								  mov   eax,MulByNeg8[ebx*4]          ; 22D

								   mov  bl,PB [esi]                   ; 20A

								  mov   [edx].N8T22,eax               ; 22E

								   add  ebp,ebx                       ; 20B

								  mov   eax,PD [eax+ecx*8]            ; 22F

								   add  esi,PITCH+1

								  mov   cl,PB [edi]                   ; 20C

								   add  edi,PITCH+1

								  add   ebp,eax                       ; 22G

								   mov  eax,MulByNeg8[ebx*4]          ; 20D

								  mov   bl,PB [esi+6]                 ; 37A

								   mov  [edx].N8T20,eax               ; 20E

								  add   ebp,ebx                       ; 37B

								   mov  eax,PD [eax+ecx*8]            ; 20F

								  mov   cl,PB [edi+6]                 ; 37C

								   add  ebp,eax                       ; 20G

								  mov   eax,MulByNeg8[ebx*4]          ; 37D

								   mov  bl,PB [esi+4]                 ; 35A

								  mov   [edx].N8T37,eax               ; 37E

								   add  ebp,ebx                       ; 35B

								  mov   eax,PD [eax+ecx*8]            ; 37F

								   mov  cl,PB [edi+4]                 ; 35C

								  add   ebp,eax                       ; 37G

								   mov  eax,MulByNeg8[ebx*4]          ; 35D

								  mov   bl,PB [esi+2]                 ; 33A

								   mov  [edx].N8T35,eax               ; 35E

								  add   ebp,ebx                       ; 33B

								   mov  eax,PD [eax+ecx*8]            ; 35F

								  mov   cl,PB [edi+2]                 ; 33C

								   add  ebp,eax                       ; 35G

								  mov   eax,MulByNeg8[ebx*4]          ; 33D

								   mov  bl,PB [esi]                   ; 31A

								  mov   [edx].N8T33,eax               ; 33E

								   add  ebp,ebx                       ; 31B

								  mov   eax,PD [eax+ecx*8]            ; 33F

								   add  esi,PITCH-1

								  mov   cl,PB [edi]                   ; 31C

								   add  edi,PITCH-1

								  add   ebp,eax                       ; 33G

								   mov  eax,MulByNeg8[ebx*4]          ; 31D

								  mov   bl,PB [esi+6]                 ; 46A

								   mov  [edx].N8T31,eax               ; 31E

								  add   ebp,ebx                       ; 46B

								   mov  eax,PD [eax+ecx*8]            ; 31F

								  mov   cl,PB [edi+6]                 ; 46C

								   add  ebp,eax                       ; 31G

								  mov   eax,MulByNeg8[ebx*4]          ; 46D

								   mov  bl,PB [esi+4]                 ; 44A

								  mov   [edx].N8T46,eax               ; 46E

								   add  ebp,ebx                       ; 44B

								  mov   eax,PD [eax+ecx*8]            ; 46F

								   mov  cl,PB [edi+4]                 ; 44C

								  add   ebp,eax                       ; 46G

								   mov  eax,MulByNeg8[ebx*4]          ; 44D

								  mov   bl,PB [esi+2]                 ; 42A

								   mov  [edx].N8T44,eax               ; 44E

								  add   ebp,ebx                       ; 42B

								   mov  eax,PD [eax+ecx*8]            ; 44F

								  mov   cl,PB [edi+2]                 ; 42C

								   add  ebp,eax                       ; 44G

								  mov   eax,MulByNeg8[ebx*4]          ; 42D

								   mov  bl,PB [esi]                   ; 40A

								  mov   [edx].N8T42,eax               ; 42E

								   add  ebp,ebx                       ; 40B

								  mov   eax,PD [eax+ecx*8]            ; 42F

								   add  esi,PITCH+1

								  mov   cl,PB [edi]                   ; 40C

								   add  edi,PITCH+1

								  add   ebp,eax                       ; 42G

								   mov  eax,MulByNeg8[ebx*4]          ; 40D

								  mov   bl,PB [esi+6]                 ; 57A

								   mov  [edx].N8T40,eax               ; 40E

								  add   ebp,ebx                       ; 57B

								   mov  eax,PD [eax+ecx*8]            ; 40F

								  mov   cl,PB [edi+6]                 ; 57C

								   add  ebp,eax                       ; 40G

								  mov   eax,MulByNeg8[ebx*4]          ; 57D

								   mov  bl,PB [esi+4]                 ; 55A

								  mov   [edx].N8T57,eax               ; 57E

								   add  ebp,ebx                       ; 55B

								  mov   eax,PD [eax+ecx*8]            ; 57F

								   mov  cl,PB [edi+4]                 ; 55C

								  add   ebp,eax                       ; 57G

								   mov  eax,MulByNeg8[ebx*4]          ; 55D

								  mov   bl,PB [esi+2]                 ; 53A

								   mov  [edx].N8T55,eax               ; 55E

								  add   ebp,ebx                       ; 53B

								   mov  eax,PD [eax+ecx*8]            ; 55F

								  mov   cl,PB [edi+2]                 ; 53C

								   add  ebp,eax                       ; 55G

								  mov   eax,MulByNeg8[ebx*4]          ; 53D

								   mov  bl,PB [esi]                   ; 51A

								  mov   [edx].N8T53,eax               ; 53E

								   add  ebp,ebx                       ; 51B

								  mov   eax,PD [eax+ecx*8]            ; 53F

								   add  esi,PITCH-1

								  mov   cl,PB [edi]                   ; 51C

								   add  edi,PITCH-1

								  add   ebp,eax                       ; 53G

								   mov  eax,MulByNeg8[ebx*4]          ; 51D

								  mov   bl,PB [esi+6]                 ; 66A

								   mov  [edx].N8T51,eax               ; 51E

								  add   ebp,ebx                       ; 66B

								   mov  eax,PD [eax+ecx*8]            ; 51F

								  mov   cl,PB [edi+6]                 ; 66C

								   add  ebp,eax                       ; 51G

								  mov   eax,MulByNeg8[ebx*4]          ; 66D

								   mov  bl,PB [esi+4]                 ; 64A

								  mov   [edx].N8T66,eax               ; 66E

								   add  ebp,ebx                       ; 64B

								  mov   eax,PD [eax+ecx*8]            ; 66F

								   mov  cl,PB [edi+4]                 ; 64C

								  add   ebp,eax                       ; 66G

								   mov  eax,MulByNeg8[ebx*4]          ; 64D

								  mov   bl,PB [esi+2]                 ; 62A

								   mov  [edx].N8T64,eax               ; 64E

								  add   ebp,ebx                       ; 62B

								   mov  eax,PD [eax+ecx*8]            ; 64F

								  mov   cl,PB [edi+2]                 ; 62C

								   add  ebp,eax                       ; 64G

								  mov   eax,MulByNeg8[ebx*4]          ; 62D

								   mov  bl,PB [esi]                   ; 60A

								  mov   [edx].N8T62,eax               ; 62E

								   add  ebp,ebx                       ; 60B

								  mov   eax,PD [eax+ecx*8]            ; 62F

								   add  esi,PITCH+1

								  mov   cl,PB [edi]                   ; 60C

								   add  edi,PITCH+1

								  add   ebp,eax                       ; 62G

								   mov  eax,MulByNeg8[ebx*4]          ; 60D

								  mov   bl,PB [esi+6]                 ; 77A

								   mov  [edx].N8T60,eax               ; 60E

								  add   ebp,ebx                       ; 77B

								   mov  eax,PD [eax+ecx*8]            ; 60F

								  mov   cl,PB [edi+6]                 ; 77C

								   add  ebp,eax                       ; 60G

								  mov   eax,MulByNeg8[ebx*4]          ; 77D

								   mov  bl,PB [esi+4]                 ; 75A

								  mov   [edx].N8T77,eax               ; 77E

								   add  ebp,ebx                       ; 75B

								  mov   eax,PD [eax+ecx*8]            ; 77F

								   mov  cl,PB [edi+4]                 ; 75C

								  add   ebp,eax                       ; 77G

								   mov  eax,MulByNeg8[ebx*4]          ; 75D

								  mov   bl,PB [esi+2]                 ; 73A

								   mov  [edx].N8T75,eax               ; 75E

								  add   ebp,ebx                       ; 73B

								   mov  eax,PD [eax+ecx*8]            ; 75F

								  mov   cl,PB [edi+2]                 ; 73C

								   add  ebp,eax                       ; 75G

								  mov   eax,MulByNeg8[ebx*4]          ; 73D

								   mov  bl,PB [esi]                   ; 71A

								  mov   [edx].N8T73,eax               ; 73E

								   add  ebp,ebx                       ; 71B

								  mov   eax,PD [eax+ecx*8]            ; 73F

								   mov  cl,PB [edi]                   ; 71C

								  add   esi,PITCH-1-PITCH*8+8

								   add  edi,PITCH-1-PITCH*8+8

								  add   ebp,eax                       ; 73G

								   mov  eax,MulByNeg8[ebx*4]          ; 71D

								  mov   ebx,ebp

								   mov  [edx].N8T71,eax               ; 71E

								  and   ebx,000001FFFH                ; Extract sum of target pels.

								   add  edx,BlockLen                  ; Move to next output block

								  mov   eax,PD [eax+ecx*8]            ; 71F

								   mov  [edx-BlockLen].AccumTargetPels,ebx ; Store acc of target pels for block.

								  add   eax,ebp                       ; 71G

								   and  ebp,000006000H                ; Extract loop control

								  shr   eax,16                        ; Extract SWD;  CF == 1 every second iter.

								   mov  ebx,ecx

								  mov   [edx-BlockLen].CentralInterSWD,eax ; Store SWD for 0-motion vector.

								   jnc  PrepMatchPointsNextBlock


								  add   esi,PITCH*8-16                ; Advance to block 3, or off end.

								   add  edi,PITCH*8-16                ; Advance to block 3, or off end.

								  xor   ebp,000002000H

								   jne  PrepMatchPointsNextBlock      ; Jump if advancing to block 3.


								  mov   ebx,CurrSWDState              ; Fetch First State Number for engine.

								   mov  edi,Block1.CentralInterSWD

								  test  bl,bl                         ; Test for INTRA-BY-DECREE.

								   je   IntraByDecree


								  add   eax,Block2.CentralInterSWD

								   add  edi,Block3.CentralInterSWD

								  add   eax,edi

								   mov  edx,ZeroVectorThreshold

								  cmp   eax,edx                       ; Compare 0-MV against ZeroVectorThresh

								   jle  BelowZeroThresh               ; Jump if 0-MV is good enough.


								  mov   cl,PB SWDState[ebx*8+3]       ; cl == Index of inc to apply to central

								  ;                                   ; point to get to ref1.

								   mov  bl,PB SWDState[ebx*8+5]       ; bl == Same as cl, but for ref2.

								  mov   edx,TargToRef

								   mov  MB0MVInterSWD,eax             ; Stash SWD for zero motion vector.

								  mov   edi,PD OffsetToRef[ebx]       ; Get inc to apply to ctr to get to ref2.

								   mov  ebp,PD OffsetToRef[ecx]       ; Get inc to apply to ctr to get to ref1.

								  lea   esi,[esi+edx-PITCH*16]        ; Calculate address of 0-MV ref block.

								   ;

								  mov   MBAddrCentralPoint,esi        ; Set central point to 0-MV.

								   mov  MBCentralInterSWD,eax

								  mov   eax,Block1.CentralInterSWD    ; Stash Zero MV SWD, in case we decide

								   mov  edx,Block2.CentralInterSWD    ; the best non-zero MV isn't enough

								  mov   Block1.ZeroMVInterSWD,eax     ; better than the zero MV.

								   mov  Block2.ZeroMVInterSWD,edx

								  mov   eax,Block3.CentralInterSWD

								   mov  edx,Block4.CentralInterSWD

								  mov   Block3.ZeroMVInterSWD,eax

								   mov  Block4.ZeroMVInterSWD,edx


								;  Activity Details for this section of code  (refer to flow diagram above):

								;

								;     5)  The SWD for two different reference macroblocks is calculated; ref1

								;         into the high order 16 bits of ebp, and ref2 into the low 16 bits.

								;         This is performed for each iteration of the state engine.  A normal,

								;         internal macroblock will perform 6 iterations, searching +/- 4

								;         horizontally, then +/- 4 vertically, then +/- 2 horizontally, then

								;         +/- 2 vertically, then +/- 1 horizontally, then +/- 1 vertically.

								;

								; Register usage for this section:

								;

								;   Input:

								;

								;     esi -- Addr of 0-motion macroblock in ref frame.

								;     ebp -- Increment to apply to get to first ref1 macroblock.

								;     edi -- Increment to apply to get to first ref2 macroblock.

								;     ebx, ecx -- High order 24 bits are zero.

								;

								;   Output:

								;

								;     ebp -- SWD for the best-fit reference macroblock.

								;     ebx -- Index of increment to apply to get to best-fit reference MB.

								;     MBAddrCentralPoint -- the best-fit of the previous iteration;  it is the

								;                         value to which OffsetToRef[ebx] must be added.

								;

								;

								; Expected performance for SWDLoop code:

								;

								;   Execution frequency:  Six times per block for which motion analysis is done

								;                         beyond the 0-motion vector.

								;

								; Pentium (tm) microprocessor times per six iterations:

								;   180 clocks for instruction execution setup to DoSWDLoop

								;  2520 clocks for DoSWDLoop procedure, instruction execution.

								;   192 clocks for bank conflicts in DoSWDLoop

								;    30 clocks generously estimated for an average of 6 cache line fills for

								;       the reference area.

								;  ----

								;  2922 clocks total time for this section.


								MBFullPelMotionSearchLoop:


								  lea   edi,[esi+edi+PITCH*8+8]

								   lea  esi,[esi+ebp+PITCH*8+8]

								  mov   Block4.Ref1Addr,esi

								   mov  Block4.Ref2Addr,edi

								  sub   esi,8

								   sub  edi,8

								  mov   Block3.Ref1Addr,esi

								   mov  Block3.Ref2Addr,edi

								  sub   esi,PITCH*8-8

								   sub  edi,PITCH*8-8

								  mov   Block2.Ref1Addr,esi

								   mov  Block2.Ref2Addr,edi

								  sub   esi,8

								   sub  edi,8

								  mov   Block1.Ref1Addr,esi

								   mov  Block1.Ref2Addr,edi


								;    esi -- Points to ref1

								;    edi -- Points to ref2

								;    ecx -- Upper 24 bits zero

								;    ebx -- Upper 24 bits zero


								  call  DoSWDLoop


								;    ebp -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4

								;    ecx -- Upper 24 bits zero

								;    ebx -- Upper 24 bits zero


								  mov   esi,MBCentralInterSWD     ; Get SWD for central point of these 3 refs

								   xor  eax,eax

								  add   ebp,Block1.Ref1InterSWD

								   add  edx,Block1.Ref2InterSWD

								  add   ebp,Block2.Ref1InterSWD

								   add  edx,Block2.Ref2InterSWD

								  add   ebp,Block3.Ref1InterSWD

								   add  edx,Block3.Ref2InterSWD


								  cmp   ebp,edx                   ; Carry flag == 1 iff ref1 SWD < ref2 SWD.

								   mov  edi,CurrSWDState          ; Restore current state number.

								  adc   eax,eax                   ; eax == 1 iff ref1 SWD < ref2 SWD.

								   cmp  ebp,esi                   ; Carry flag == 1 iff ref1 SWD < central SWD.

								  adc   eax,eax                   ;

								   cmp  edx,esi                   ; Carry flag == 1 iff ref2 SWD < central SWD.

								  adc   eax,eax                   ; 0 --> Pick central point.

								  ;                               ; 1 --> Pick ref2.

								  ;                               ; 2 --> Not possible.

								  ;                               ; 3 --> Pick ref2.

								  ;                               ; 4 --> Pick central point.

								  ;                               ; 5 --> Not possible.

								  ;                               ; 6 --> Pick ref1.

								  ;                               ; 7 --> Pick ref1.

								   mov  MBRef2InterSWD,edx

								  mov   MBRef1InterSWD,ebp

								   xor  edx,edx

								  mov   dl,PB PickPoint[eax]        ; dl == 0: central pt;  2: ref1;  4: ref2

								   mov  esi,MBAddrCentralPoint      ; Reload address of central ref block.

								  ;

								   ;

								  mov   ebp,Block1.CentralInterSWD[edx*2] ; Get SWD for each block, picked pt.

								   mov  al,PB SWDState[edx+edi*8+1] ; al == Index of inc to apply to old central

								   ;                                ;       point to get new central point.

								  mov   Block1.CentralInterSWD,ebp  ; Stash SWD for new central point.

								   mov  ebp,Block2.CentralInterSWD[edx*2]

								  mov   Block2.CentralInterSWD,ebp

								   mov  ebp,Block3.CentralInterSWD[edx*2]

								  mov   Block3.CentralInterSWD,ebp

								   mov  ebp,Block4.CentralInterSWD[edx*2]

								  mov   Block4.CentralInterSWD,ebp

								   mov  ebp,MBCentralInterSWD[edx*2]; Get the SWD for the point we picked.

								  mov   dl,PB SWDState[edx+edi*8]   ; dl == New state number.

								   mov  MBCentralInterSWD,ebp       ; Stash SWD for new central point.

								  mov   edi,PD OffsetToRef[eax]     ; Get inc to apply to get to new central pt.

								   mov  CurrSWDState,edx            ; Stash current state number.

								  mov   bl,PB SWDState[edx*8+3]     ; bl == Index of inc to apply to central

								  ;                                 ;       point to get to next ref1.

								   mov  cl,PB SWDState[edx*8+5]     ; cl == Same as bl, but for ref2.

								  add   esi,edi                     ; Move to new central point.

								   test dl,dl

								  mov   ebp,PD OffsetToRef[ebx]     ; Get inc to apply to ctr to get to ref1.

								   mov  edi,PD OffsetToRef[ecx]     ; Get inc to apply to ctr to get to ref2.

								  mov   MBAddrCentralPoint,esi      ; Stash address of new central ref block.

								   jne  MBFullPelMotionSearchLoop   ; Jump if not done searching.


								;Done searching for integer motion vector for full macroblock


								IF PITCH-384

								  *** Error:  The magic leaks out of the following code if PITCH isn't 384.

								ENDIF

								  mov   ecx,TargToRef            ; To Linearize MV for winning ref blk.

								   mov  eax,esi                  ; Copy of ref macroblock addr.

								  sub   eax,ecx                  ; To Linearize MV for winning ref blk.

								   mov  ecx,TargetMBAddr

								  sub   eax,ecx

								   mov  edx,MBlockActionStream   ; Fetch list ptr.

								  mov   ebx,eax

								   mov  ebp,DoHalfPelEstimation  ; Are we doing half pel motion estimation?

								  shl   eax,25                   ; Extract horz motion component.

								   mov  [edx].BlkY1.PastRef,esi  ; Save address of reference MB selected.

								  sar   ebx,8                    ; Hi 24 bits of linearized MV lookup vert MV.

								   mov  ecx,MBCentralInterSWD

								  sar   eax,24                   ; Finish extract horz motion component.

								   test ebp,ebp

								  mov   bl,PB UnlinearizedVertMV[ebx] ; Look up proper vert motion vector.

								   mov  [edx].BlkY1.PHMV,al      ; Save winning horz motion vector.

								  mov   [edx].BlkY1.PVMV,bl      ; Save winning vert motion vector.


								IFDEF H261

								ELSE

								   je   SkipHalfPelSearch_1MV


								;Search for half pel motion vector for full macroblock.


								  mov   Block1.AddrCentralPoint,esi

								   lea  ebp,[esi+8]

								  mov   Block2.AddrCentralPoint,ebp

								   add  ebp,PITCH*8-8

								  mov   Block3.AddrCentralPoint,ebp

								   xor  ecx,ecx

								  mov   cl,[edx].FirstMEState

								   add  ebp,8

								  mov   edi,esi

								   mov  Block4.AddrCentralPoint,ebp

								  mov   ebp,InitHalfPelSearchHorz[ecx*4-4]


								;    ebp -- Initialized to 0, except when can't search off left or right edge.

								;    edi -- Ref addr for block 1.  Ref1 is .5 pel to left.  Ref2 is .5 to right.


								  call  DoSWDHalfPelHorzLoop


								;    ebp, ebx -- Zero

								;    ecx -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4


								  mov   esi,MBlockActionStream

								   xor  eax,eax                   ; Keep pairing happy

								  add   ecx,Block1.Ref1InterSWD

								   add  edx,Block1.Ref2InterSWD

								  add   ecx,Block2.Ref1InterSWD

								   add  edx,Block2.Ref2InterSWD

								  add   ecx,Block3.Ref1InterSWD

								   add  edx,Block3.Ref2InterSWD

								  mov   bl,[esi].FirstMEState

								   mov  edi,Block1.AddrCentralPoint

								  cmp   ecx,edx

								   jl   MBHorz_Ref1LTRef2


								  mov   ebp,MBCentralInterSWD

								   mov  esi,MBlockActionStream

								  sub   ebp,edx

								   jle  MBHorz_CenterBest


								  mov   al,[esi].BlkY1.PHMV        ; Half pel to the right is best.

								   mov  ecx,Block1.Ref2InterSWD

								  mov   Block1.CentralInterSWD_BLS,ecx

								   mov  ecx,Block3.Ref2InterSWD

								  mov   Block3.CentralInterSWD_BLS,ecx

								   mov  ecx,Block2.Ref2InterSWD

								  mov   Block2.CentralInterSWD_BLS,ecx

								   mov  ecx,Block4.Ref2InterSWD

								  mov   Block4.CentralInterSWD_BLS,ecx

								   inc  al

								  mov   [esi].BlkY1.PHMV,al

								   jmp  MBHorz_Done


								MBHorz_CenterBest:


								  mov   ecx,Block1.CentralInterSWD

								   xor  ebp,ebp

								  mov   Block1.CentralInterSWD_BLS,ecx

								   mov  ecx,Block2.CentralInterSWD

								  mov   Block2.CentralInterSWD_BLS,ecx

								   mov  ecx,Block3.CentralInterSWD

								  mov   Block3.CentralInterSWD_BLS,ecx

								   mov  ecx,Block4.CentralInterSWD

								  mov   Block4.CentralInterSWD_BLS,ecx

								   jmp  MBHorz_Done


								MBHorz_Ref1LTRef2:


								  mov   ebp,MBCentralInterSWD

								   mov  esi,MBlockActionStream

								  sub   ebp,ecx

								   jle  MBHorz_CenterBest


								  mov   al,[esi].BlkY1.PHMV        ; Half pel to the left is best.

								   mov  edx,[esi].BlkY1.PastRef

								  dec   al

								   mov  ecx,Block1.Ref1InterSWD

								  mov   Block1.CentralInterSWD_BLS,ecx

								   mov  ecx,Block3.Ref1InterSWD

								  mov   Block3.CentralInterSWD_BLS,ecx

								   mov  ecx,Block2.Ref1InterSWD

								  mov   Block2.CentralInterSWD_BLS,ecx

								   mov  ecx,Block4.Ref1InterSWD

								  mov   Block4.CentralInterSWD_BLS,ecx

								   dec  edx

								  mov   [esi].BlkY1.PHMV,al

								   mov  [esi].BlkY1.PastRef,edx


								MBHorz_Done:


								  mov   HalfPelHorzSavings,ebp

								   mov  ebp,InitHalfPelSearchVert[ebx*4-4]


								;    ebp -- Initialized to 0, except when can't search off left or right edge.

								;    edi -- Ref addr for block 1.  Ref1 is .5 pel above.  Ref2 is .5 below.


								  call  DoSWDHalfPelVertLoop


								;    ebp, ebx -- Zero

								;    ecx -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4


								  add   ecx,Block1.Ref1InterSWD

								   add  edx,Block1.Ref2InterSWD

								  add   ecx,Block2.Ref1InterSWD

								   add  edx,Block2.Ref2InterSWD

								  add   ecx,Block3.Ref1InterSWD

								   add  edx,Block3.Ref2InterSWD

								  cmp   ecx,edx

								   jl   MBVert_Ref1LTRef2


								  mov   ebp,MBCentralInterSWD

								   mov  esi,MBlockActionStream

								  sub   ebp,edx

								   jle  MBVert_CenterBest


								  mov   ecx,Block1.CentralInterSWD

								   mov  edx,Block1.Ref2InterSWD

								  sub   ecx,edx

								   mov  edx,Block1.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  al,[esi].BlkY1.PVMV        ; Half pel below is best.

								  mov   Block1.CentralInterSWD,edx

								   inc  al

								  mov   ecx,Block3.CentralInterSWD

								   mov  edx,Block3.Ref2InterSWD

								  sub   ecx,edx

								   mov  edx,Block3.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  ecx,Block2.CentralInterSWD

								  mov   Block3.CentralInterSWD,edx

								   mov  edx,Block2.Ref2InterSWD

								  sub   ecx,edx

								   mov  edx,Block2.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  ecx,Block4.CentralInterSWD

								  mov   Block2.CentralInterSWD,edx

								   mov  edx,Block4.Ref2InterSWD

								  sub   ecx,edx

								   mov  edx,Block4.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  [esi].BlkY1.PVMV,al

								  mov   Block4.CentralInterSWD,edx

								   jmp  MBVert_Done


								MBVert_CenterBest:


								  mov   ecx,Block1.CentralInterSWD_BLS

								   xor  ebp,ebp

								  mov   Block1.CentralInterSWD,ecx

								   mov  ecx,Block2.CentralInterSWD_BLS

								  mov   Block2.CentralInterSWD,ecx

								   mov  ecx,Block3.CentralInterSWD_BLS

								  mov   Block3.CentralInterSWD,ecx

								   mov  ecx,Block4.CentralInterSWD_BLS

								  mov   Block4.CentralInterSWD,ecx

								   jmp  MBVert_Done


								MBVert_Ref1LTRef2:


								  mov   ebp,MBCentralInterSWD

								   mov  esi,MBlockActionStream

								  sub   ebp,ecx

								   jle  MBVert_CenterBest


								  mov   ecx,Block1.CentralInterSWD

								   mov  edx,Block1.Ref1InterSWD

								  sub   ecx,edx

								   mov  edx,Block1.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  al,[esi].BlkY1.PVMV        ; Half pel above is best.

								  mov   Block1.CentralInterSWD,edx

								   dec  al

								  mov   ecx,Block3.CentralInterSWD

								   mov  edx,Block3.Ref1InterSWD

								  sub   ecx,edx

								   mov  edx,Block3.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  ecx,Block2.CentralInterSWD

								  mov   Block3.CentralInterSWD,edx

								   mov  edx,Block2.Ref1InterSWD

								  sub   ecx,edx

								   mov  edx,Block2.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  ecx,Block4.CentralInterSWD

								  mov   Block2.CentralInterSWD,edx

								   mov  edx,Block4.Ref1InterSWD

								  sub   ecx,edx

								   mov  edx,Block4.CentralInterSWD_BLS

								  sub   edx,ecx

								   mov  ecx,[esi].BlkY1.PastRef

								  mov   Block4.CentralInterSWD,edx

								   sub  ecx,PITCH

								  mov   [esi].BlkY1.PVMV,al

								   mov  [esi].BlkY1.PastRef,ecx


								MBVert_Done:


								  mov   ecx,HalfPelHorzSavings

								   mov  edx,esi

								  add   ebp,ecx                    ; Savings for horz and vert half pel motion.

								   mov  ecx,MBCentralInterSWD      ; Reload SWD for new central point.

								  sub   ecx,ebp                    ; Approx SWD for prescribed half pel motion.

								   mov  esi,[edx].BlkY1.PastRef    ; Reload address of reference MB selected.

								  mov   MBCentralInterSWD,ecx


								SkipHalfPelSearch_1MV:


								ENDIF ; H263


								  mov   ebp,[edx].BlkY1.MVs    ; Load Motion Vectors

								   add  esi,8

								  mov   [edx].BlkY2.PastRef,esi

								   mov  [edx].BlkY2.MVs,ebp

								  lea   edi,[esi+PITCH*8]

								   add  esi,PITCH*8-8

								  mov   [edx].BlkY3.PastRef,esi

								   mov  [edx].BlkY3.MVs,ebp

								  mov   [edx].BlkY4.PastRef,edi

								   mov  [edx].BlkY4.MVs,ebp

								IFDEF H261

								ELSE ; H263

								  mov   MBMotionVectors,ebp        ; Stash macroblock level motion vectors.

								   mov  ebp,640 ; ??? BlockMVDifferential

								  cmp   ecx,ebp

								   jl   NoBlockMotionVectors


								  mov   ecx,DoBlockLevelVectors

								  test  ecx,ecx                    ; Are we doing block level motion vectors?

								   je   NoBlockMotionVectors


								;  Activity Details for this section of code  (refer to flow diagram above):

								;

								;  The following search is done similarly to the searches done above, except

								;  these are block searches, instead of macroblock searches.

								;

								; Expected performance:

								;

								;   Execution frequency:  Six times per block for which motion analysis is done

								;                         beyond the 0-motion vector.

								;

								; Pentium (tm) microprocessor times per six iterations:

								;   180 clocks for instruction execution setup to DoSWDLoop

								;  2520 clocks for DoSWDLoop procedure, instruction execution.

								;   192 clocks for bank conflicts in DoSWDLoop

								;    30 clocks generously estimated for an average of 6 cache line fills for

								;       the reference area.

								;  ----

								;  2922 clocks total time for this section.


								;

								;    Set up for the "BlkFullPelSWDLoop_4blks" loop to follow.

								;    -  Store the SWD values for blocks 4, 3, 2, 1.

								;    -  Compute and store the address of the central reference

								;       point for blocks 1, 2, 3, 4.

								;	 -  Compute and store the first address for ref 1 (minus 4

								;       pels horizontally) and ref 2 (plus 4 pels horizontally)

								;       for blocks 4, 3, 2, 1 (in that order).

								;    -  Initialize MotionOffsetsCursor

								;    -  On exit:

								;       esi = ref 1 address for block 1

								;       edi = ref 2 address for block 1

								;

								  mov   esi,Block4.CentralInterSWD

								   mov  edi,Block3.CentralInterSWD

								  mov   Block4.CentralInterSWD_BLS,esi

								   mov  Block3.CentralInterSWD_BLS,edi

								  mov   esi,Block2.CentralInterSWD

								   mov  edi,Block1.CentralInterSWD

								  mov   Block2.CentralInterSWD_BLS,esi

								   mov  eax,MBAddrCentralPoint   ; Reload addr of central, integer pel ref MB.

								  mov   Block1.CentralInterSWD_BLS,edi

								   mov  Block1.AddrCentralPoint,eax

								  lea   edi,[eax+PITCH*8+8+1]

								   lea  esi,[eax+PITCH*8+8-1]

								  mov   Block4.Ref1Addr,esi

								   mov  Block4.Ref2Addr,edi

								  sub   esi,8

								   add  eax,8

								  mov   Block2.AddrCentralPoint,eax

								   add  eax,PITCH*8-8

								  mov   Block3.AddrCentralPoint,eax

								   add  eax,8

								  mov   Block4.AddrCentralPoint,eax

								   sub  edi,8

								  mov   Block3.Ref1Addr,esi

								   mov  Block3.Ref2Addr,edi

								  sub   esi,PITCH*8-8

								   sub  edi,PITCH*8-8

								  mov   Block2.Ref1Addr,esi

								   mov  Block2.Ref2Addr,edi

								  sub   esi,8

								   mov  eax,OFFSET MotionOffsets

								  mov   MotionOffsetsCursor,eax

								   sub  edi,8

								  mov   Block1.Ref1Addr,esi

								   mov  Block1.Ref2Addr,edi


								;

								;  This loop will execute 6 times:

								;    +- 4 pels horizontally

								;    +- 4 pels vertically

								;    +- 2 pels horizontally

								;    +- 2 pels vertically

								;    +- 1 pel horizontally

								;    +- 1 pel vertically

								;  It terminates when ref1 = ref2.  This simple termination

								;  condition is what forces unrestricted motion vectors (UMV)

								;  to be ON when advanced prediction (4MV) is ON.  Otherwise

								;  we would need a state engine as above to distinguish edge

								;  pels.

								;

								BlkFullPelSWDLoop_4blks:


								;    esi -- Points to ref1

								;    edi -- Points to ref2

								;    ecx -- Upper 24 bits zero

								;    ebx -- Upper 24 bits zero


								  call  DoSWDLoop


								;    ebp -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4

								;    ecx -- Upper 24 bits zero

								;    ebx -- Upper 24 bits zero


								  mov   eax,MotionOffsetsCursor


								BlkFullPelSWDLoop_1blk:


								  xor   esi,esi

								   cmp  ebp,edx                         ; CF == 1 iff ref1 SWD < ref2 SWD.

								  mov   edi,BlockNM1.CentralInterSWD_BLS; Get SWD for central pt of these 3 refs

								   adc  esi,esi                         ; esi == 1 iff ref1 SWD < ref2 SWD.

								  cmp   ebp,edi                         ; CF == 1 iff ref1 SWD < central SWD.

								   mov  ebp,BlockNM2.Ref1InterSWD       ; Fetch next block's Ref1 SWD.

								  adc   esi,esi

								   cmp  edx,edi                         ; CF == 1 iff ref2 SWD < central SWD.

								  adc   esi,esi                         ; 0 --> Pick central point.

								  ;                                     ; 1 --> Pick ref2.

								  ;                                     ; 2 --> Not possible.

								  ;                                     ; 3 --> Pick ref2.

								  ;                                     ; 4 --> Pick central point.

								  ;                                     ; 5 --> Not possible.

								  ;                                     ; 6 --> Pick ref1.

								  ;                                     ; 7 --> Pick ref1.

								   mov  edx,BlockNM2.Ref2InterSWD       ; Fetch next block's Ref2 SWD.

								  sub   esp,BlockLen                    ; Move ahead to next block.

								   mov  edi,[eax]                       ; Next ref2 motion vector offset.

								  mov   cl,PickPoint_BLS[esi]           ; cl == 6: central pt; 2: ref1; 4: ref2

								   mov  ebx,esp                         ; For testing completion.

								  ;

								   ;

								  mov   esi,BlockN.AddrCentralPoint[ecx*2-12] ; Get the addr for pt we picked.

								   mov  ecx,BlockN.CentralInterSWD[ecx*2]    ; Get the SWD for point we picked.

								  mov   BlockN.AddrCentralPoint,esi          ; Stash addr for new central point.

								   sub  esi,edi                              ; Compute next ref1 addr.

								  mov   BlockN.Ref1Addr,esi                  ; Stash next ref1 addr.

								   mov  BlockN.CentralInterSWD_BLS,ecx       ; Stash the SWD for central point.

								  lea   edi,[esi+edi*2]                      ; Compute next ref2 addr.

								   xor  ecx,ecx

								  mov   BlockN.Ref2Addr,edi                  ; Stash next ref2 addr.

								   and  ebx,00000001FH                       ; Done when esp at 32-byte bound.

								  jne   BlkFullPelSWDLoop_1blk


								  add   esp,BlockLen*4

								   add  eax,4                       ; Advance MotionOffsets pointer.

								  mov   MotionOffsetsCursor,eax

								   cmp  esi,edi

								  jne   BlkFullPelSWDLoop_4blks


								IF PITCH-384

								  *** Error:  The magic leaks out of the following code if PITCH isn't 384.

								ENDIF


								;

								;  The following code has been modified to correctly decode the motion vectors

								;  The previous code was simply subtracting the target frame base address

								;  from the chosen (central) reference block address.

								;  What is now done is the begining reference macroblock address computed

								;  in ebp, then subtracted from the chosen (central) reference block address.

								;  Then, for blocks 2, 3, and 4, the distance from block 1 to that block

								;  is subtracted.  Care was taken to preserve the original pairing.

								;

								  mov   esi,Block1.AddrCentralPoint ; B1a  Reload address of central ref block.

								   mov  ebp,TargetMBAddr			; ****  CHANGE  ****  addr. of target MB


								  mov   edi,Block2.AddrCentralPoint ; B2a

								   add  ebp,TargToRef				; ****  CHANGE	****  add Reference - Target


								; mov   ebp,PreviousFrameBaseAddress  ****  CHANGE  ****  DELETED


								  mov   Block1.Ref1Addr,esi         ; B1b  Stash addr central ref block.

								   sub  esi,ebp                     ; B1c  Addr of ref blk, but in target frame.


								  mov   Block2.Ref1Addr,edi         ; B2b

								   sub  edi,ebp                     ; B2c


								  sub   edi,8                       ; ****  CHANGE  ****  Correct for block 2

								   mov  eax,esi                     ; B1e Copy linearized MV.


								  sar   esi,8                       ; B1f High 24 bits of lin MV lookup vert MV.

								   mov  ebx,edi                     ; B2e


								  sar   edi,8                       ; B2f

								   add  eax,eax                     ; B1g Sign extend HMV;  *2 (# of half pels).


								  mov   Block1.BlkHMV,al            ; B1h Save winning horz motion vector.

								   add  ebx,ebx                     ; B2g


								  mov   Block2.BlkHMV,bl            ; B2h

								   mov  al,UnlinearizedVertMV[esi]  ; B1i Look up proper vert motion vector.


								  mov   Block1.BlkVMV,al            ; B1j Save winning vert motion vector.

								   mov  al,UnlinearizedVertMV[edi]  ; B2i


								  mov   esi,Block3.AddrCentralPoint ; B3a

								   mov  edi,Block4.AddrCentralPoint ; B4a


								  mov   Block3.Ref1Addr,esi         ; B3b

								   mov  Block4.Ref1Addr,edi         ; B4b


								  mov   Block2.BlkVMV,al            ; B2j

								   sub  esi,ebp                     ; B3c


								  sub   esi,8*PITCH                 ; ****  CHANGE  ****  Correct for block 3

								   sub  edi,ebp                     ; B4c


								  sub   edi,8*PITCH+8               ; ****  CHANGE  ****  Correct for block 4

								   mov  eax,esi                     ; B3e


								  sar   esi,8                       ; B3f

								   mov  ebx,edi                     ; B4e


								  sar   edi,8                       ; B4f

								   add  eax,eax                     ; B3g


								  mov   Block3.BlkHMV,al            ; B3h

								   add  ebx,ebx                     ; B4g


								  mov   Block4.BlkHMV,bl            ; B4h

								   mov  al,UnlinearizedVertMV[esi]  ; B3i


								  mov   Block3.BlkVMV,al            ; B3j

								   mov  al,UnlinearizedVertMV[edi]  ; B4i


								  mov   ebp,Block1.CentralInterSWD_BLS

								   mov  ebx,Block2.CentralInterSWD_BLS


								  add   ebp,Block3.CentralInterSWD_BLS

								   add  ebx,Block4.CentralInterSWD_BLS


								  add   ebx,ebp

								   mov  Block4.BlkVMV,al            ; B4j


								  mov   ecx,DoHalfPelEstimation

								   mov  MBCentralInterSWD_BLS,ebx


								  test  ecx,ecx

								   je   NoHalfPelBlockLevelMVs


								HalfPelBlockLevelMotionSearch:


								  mov   edi,Block1.AddrCentralPoint

								   xor  ebp,ebp


								;    ebp -- Initialized to 0, implying can search both left and right.

								;    edi -- Ref addr for block 1.  Ref1 is .5 pel to left.  Ref2 is .5 to right.


								  call  DoSWDHalfPelHorzLoop


								;    ebp, ebx -- Zero

								;    ecx -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4


								NextBlkHorz:


								  mov   ebx,BlockNM1.CentralInterSWD_BLS

								   cmp  ecx,edx

								  mov   BlockNM1.HalfPelSavings,ebp

								   jl   BlkHorz_Ref1LTRef2


								  mov   al,BlockNM1.BlkHMV

								   sub  esp,BlockLen

								  sub   ebx,edx

								   jle  BlkHorz_CenterBest


								  inc   al

								   mov  BlockN.HalfPelSavings,ebx

								  mov   BlockN.BlkHMV,al

								   jmp  BlkHorz_Done


								BlkHorz_Ref1LTRef2:


								  mov   al,BlockNM1.BlkHMV

								   sub  esp,BlockLen

								  sub   ebx,ecx

								   jle  BlkHorz_CenterBest


								  mov   ecx,BlockN.Ref1Addr

								   dec  al

								  mov   BlockN.HalfPelSavings,ebx

								   dec  ecx

								  mov   BlockN.BlkHMV,al

								   mov  BlockN.Ref1Addr,ecx


								BlkHorz_CenterBest:

								BlkHorz_Done:


								  mov   ecx,BlockNM1.Ref1InterSWD

								   mov  edx,BlockNM1.Ref2InterSWD

								  test  esp,000000018H

								  jne   NextBlkHorz


								  mov   edi,BlockN.AddrCentralPoint

								   add  esp,BlockLen*4


								;    ebp -- Initialized to 0, implying search both up and down is okay.

								;    edi -- Ref addr for block 1.  Ref1 is .5 pel above.  Ref2 is .5 below.


								  call  DoSWDHalfPelVertLoop


								;    ebp, ebx -- Zero

								;    ecx -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4


								NextBlkVert:


								  mov   ebx,BlockNM1.CentralInterSWD_BLS

								   cmp  ecx,edx

								  mov   edi,BlockNM1.HalfPelSavings

								   jl   BlkVert_Ref1LTRef2


								  mov   al,BlockNM1.BlkVMV

								   sub  esp,BlockLen

								  sub   edx,ebx

								   jge  BlkVert_CenterBest


								  inc   al

								   sub  edi,edx

								  mov   BlockN.BlkVMV,al

								   jmp  BlkVert_Done


								BlkVert_Ref1LTRef2:


								  mov   al,BlockNM1.BlkVMV

								   sub  esp,BlockLen

								  sub   ecx,ebx

								   jge  BlkVert_CenterBest


								  sub   edi,ecx

								   mov  ecx,BlockN.Ref1Addr

								  dec   al

								   sub  ecx,PITCH

								  mov   BlockN.BlkVMV,al

								   mov  BlockN.Ref1Addr,ecx


								BlkVert_CenterBest:

								BlkVert_Done:


								  mov   ecx,BlockNM1.Ref1InterSWD

								   sub  ebx,edi

								  mov   BlockN.CentralInterSWD_BLS,ebx

								   mov  edx,BlockNM1.Ref2InterSWD

								  test  esp,000000018H

								  lea   ebp,[ebp+edi]

								   jne  NextBlkVert


								  mov   ebx,MBCentralInterSWD_BLS+BlockLen*4

								   add  esp,BlockLen*4

								  sub   ebx,ebp

								   xor  eax,eax  ; ??? Keep pairing happy


								NoHalfPelBlockLevelMVs:


								  mov   eax,MBCentralInterSWD

								   mov  ecx,BlockMVDifferential

								  sub   eax,ebx

								   mov  edi,MB0MVInterSWD

								  cmp   eax,ecx

								   jle  BlockMVNotBigEnoughGain


								  sub   edi,ebx

								   mov  ecx,NonZeroMVDifferential

								  cmp   edi,ecx

								   jle  NonZeroMVNotBigEnoughGain


								; Block motion vectors are best.


								  mov   MBCentralInterSWD,ebx           ; Set MBlock's SWD to sum of 4 blocks.

								   mov  edx,MBlockActionStream

								  mov   eax,Block1.CentralInterSWD_BLS  ; Set each block's SWD.

								   mov  ebx,Block2.CentralInterSWD_BLS

								  mov   Block1.CentralInterSWD,eax

								   mov  Block2.CentralInterSWD,ebx

								  mov   eax,Block3.CentralInterSWD_BLS

								   mov  ebx,Block4.CentralInterSWD_BLS

								  mov   Block3.CentralInterSWD,eax

								   mov  Block4.CentralInterSWD,ebx

								  mov   eax,Block1.BlkMVs               ; Set each block's motion vector.

								   mov  ebx,Block2.BlkMVs

								  mov   [edx].BlkY1.MVs,eax

								   mov  [edx].BlkY2.MVs,ebx

								  mov   eax,Block3.BlkMVs

								   mov  ebx,Block4.BlkMVs

								  mov   [edx].BlkY3.MVs,eax

								   mov  [edx].BlkY4.MVs,ebx

								  mov   eax,Block1.Ref1Addr             ; Set each block's reference blk addr.

								   mov  ebx,Block2.Ref1Addr

								  mov   [edx].BlkY1.PastRef,eax

								   mov  [edx].BlkY2.PastRef,ebx

								  mov   eax,Block3.Ref1Addr

								   mov  ebx,Block4.Ref1Addr

								  mov   [edx].BlkY3.PastRef,eax

								   mov  eax,INTER4MV                    ; Set type for MB to INTER-coded, 4 MVs.

								  mov   [edx].BlkY4.PastRef,ebx

								   mov  [edx].BlockType,al

								  jmp   MotionVectorSettled


								NoBlockMotionVectors:


								ENDIF ; H263


								  mov   edi,MB0MVInterSWD


								BlockMVNotBigEnoughGain:                ; Try MB-level motion vector.


								  mov   eax,MBCentralInterSWD

								   mov  ecx,NonZeroMVDifferential

								  sub   edi,eax

								   mov  edx,MBlockActionStream

								  cmp   edi,ecx

								   jg   MotionVectorSettled


								NonZeroMVNotBigEnoughGain:              ; Settle on zero MV.


								  mov   eax,Block1.ZeroMVInterSWD       ; Restore Zero MV SWD.

								   mov  edx,Block2.ZeroMVInterSWD

								  mov   Block1.CentralInterSWD,eax

								   mov  Block2.CentralInterSWD,edx

								  mov   eax,Block3.ZeroMVInterSWD

								   mov  edx,Block4.ZeroMVInterSWD

								  mov   Block3.CentralInterSWD,eax

								   mov  Block4.CentralInterSWD,edx

								  mov   eax,MB0MVInterSWD               ; Restore SWD for zero motion vector.


								BelowZeroThresh:


								  mov   edx,MBlockActionStream

								   mov  ebx,TargetMBAddr              ; Get address of this target macroblock.

								  mov   MBCentralInterSWD,eax         ; Save SWD.

								   xor  ebp,ebp

								  add   ebx,TargToRef

								   mov  [edx].BlkY1.MVs,ebp           ; Set horz and vert MVs to 0 in all blks.

								  mov   [edx].BlkY1.PastRef,ebx       ; Save address of ref block, all blks.

								   add  ebx,8

								  mov   [edx].BlkY2.PastRef,ebx

								   mov  [edx].BlkY2.MVs,ebp

								  lea   ecx,[ebx+PITCH*8]

								   add  ebx,PITCH*8-8

								  mov   [edx].BlkY3.PastRef,ebx

								   mov  [edx].BlkY3.MVs,ebp

								  mov   [edx].BlkY4.PastRef,ecx

								   mov  [edx].BlkY4.MVs,ebp


								;  Activity Details for this section of code  (refer to flow diagram above):

								;

								;     6)  We've settled on the motion vector that will be used if we do indeed

								;         code the macroblock with inter-coding.  We need to determine if some

								;         or all of the blocks can be forced as empty (copy).

								;         blocks.  If all the blocks can be forced empty, we force the whole

								;         macroblock to be empty.

								;

								; Expected Pentium (tm) microprocessor performance for this section:

								;

								;   Execution frequency:  Once per macroblock.

								;

								;    23 clocks.

								;


								MotionVectorSettled:


								IFDEF H261

								   mov  edi,MBCentralInterSWD

								  mov   eax,DoSpatialFiltering   ; Are we doing spatial filtering?

								   mov  edi,TargetMBAddr

								  test  eax,eax

								   je   SkipSpatialFiltering


								  mov   ebx,MBCentralInterSWD

								   mov  esi,SpatialFiltThreshold

								  cmp   ebx,esi

								   jle  SkipSpatialFiltering


								  add   edi,TargToSLF            ; Compute addr at which to put SLF prediction.

								   xor  ebx,ebx

								  mov   esi,[edx].BlkY1.PastRef

								   xor  edx,edx

								  mov   ebp,16

								   xor  ecx,ecx


								SpatialFilterHorzLoop:


								  mov   dl,[edi]        ; Pre-load cache line for output.

								   mov  bl,[esi+6]      ; p6

								  mov   al,[esi+7]      ; p7

								   inc  bl              ; p6+1

								  mov   cl,[esi+5]      ; p5

								   mov  [edi+7],al      ; p7' = p7

								  add   al,bl           ; p7 + p6 + 1

								   add  bl,cl           ; p6 + p5 + 1

								  mov   dl,[esi+4]      ; p4

								   add  eax,ebx         ; p7 + 2p6 + p5 + 2

								  shr   eax,2           ; p6' = (p7 + 2p6 + p5 + 2) / 4

								   inc  dl              ; p4 + 1

								  add   cl,dl           ; p5 + p4 + 1

								   mov  [edi+6],al      ; p6'

								  mov   al,[esi+3]      ; p3

								   add  ebx,ecx         ; p6 + 2p5 + p4 + 2

								  shr   ebx,2           ; p5' = (p6 + 2p5 + p4 + 2) / 4

								   add  dl,al           ; p4 + p3 + 1

								  mov   [edi+5],bl      ; p5'

								   mov  bl,[esi+2]      ; p2

								  add   ecx,edx         ; p5 + 2p4 + p3 + 2

								   inc  bl              ; p2 + 1

								  shr   ecx,2           ; p4' = (p5 + 2p4 + p3 + 2) / 4

								   add  al,bl           ; p3 + p2 + 1

								  mov   [edi+4],cl      ; p4'

								   add  edx,eax         ; p4 + 2p3 + p2 + 2

								  shr   edx,2           ; p3' = (p4 + 2p3 + p2 + 2) / 4

								   mov  cl,[esi+1]      ; p1

								  add   bl,cl           ; p2 + p1 + 1

								   mov  [edi+3],dl      ; p3'

								  add   eax,ebx         ; p3 + 2p2 + p1 + 2

								   mov  dl,[esi]        ; p0

								  shr   eax,2           ; p2' = (p3 + 2p2 + p1 + 2) / 4

								   inc  ebx             ; p2 + p1 + 2

								  mov   [edi+2],al      ; p2'

								   add  ebx,ecx         ; p2 + 2p1 + 2

								  mov   [edi],dl        ; p0' = p0

								   add  ebx,edx         ; p2 + 2p1 + p0 + 2

								  shr   ebx,2           ; p1' = (p2 + 2p1 + p0 + 2) / 4

								   mov  al,[esi+7+8]

								  mov   [edi+1],bl      ; p1'

								   mov  bl,[esi+6+8]

								  inc   bl

								   mov  cl,[esi+5+8]

								  mov   [edi+7+8],al

								   add  al,bl

								  add   bl,cl

								   mov  dl,[esi+4+8]

								  add   eax,ebx

								   ;

								  shr   eax,2

								   inc  dl

								  add   cl,dl

								   mov  [edi+6+8],al

								  mov   al,[esi+3+8]

								   add  ebx,ecx

								  shr   ebx,2

								   add  dl,al

								  mov   [edi+5+8],bl

								   mov  bl,[esi+2+8]

								  add   ecx,edx

								   inc  bl

								  shr   ecx,2

								   add  al,bl

								  mov   [edi+4+8],cl

								   add  edx,eax

								  shr   edx,2

								   mov  cl,[esi+1+8]

								  add   bl,cl

								   mov  [edi+3+8],dl

								  add   eax,ebx

								   mov  dl,[esi+8]

								  shr   eax,2

								   inc  ebx

								  mov   [edi+2+8],al

								   add  ebx,ecx

								  mov   [edi+8],dl

								   add  ebx,edx

								  shr   ebx,2

								   add  esi,PITCH

								  mov   [edi+1+8],bl

								   add  edi,PITCH

								  dec   ebp             ; Done?

								   jne  SpatialFilterHorzLoop


								  mov   VertFilterDoneAddr,edi

								   sub  edi,PITCH*16


								SpatialFilterVertLoop:


								  mov   eax,[edi]                ;  p0

								   ;                             ;  Bank conflict for sure.

								  ;

								   mov  ebx,[edi+PITCH]          ;  p1

								  add   eax,ebx                  ;  p0+p1

								   mov  ecx,[edi+PITCH*2]        ;  p2

								  add   ebx,ecx                  ;  p1+p2

								   mov  edx,[edi+PITCH*3]        ;  p3

								  shr   eax,1                    ; (p0+p1)/2                       dirty

								   mov  esi,[edi+PITCH*4]        ;  p4

								  add   ecx,edx                  ;  p2+p3

								   mov  ebp,[edi+PITCH*5]        ;  p5

								  shr   ebx,1                    ; (p1+p2)/2                       dirty

								   add  edx,esi                  ;  p3+p4

								  and   eax,07F7F7F7FH           ; (p0+p1)/2                       clean

								   and  ebx,07F7F7F7FH           ; (p1+p2)/2                       clean

								  and   ecx,0FEFEFEFEH           ;  p2+p3                          pre-cleaned

								   and  edx,0FEFEFEFEH           ;  p3+p4                          pre-cleaned

								  shr   ecx,1                    ; (p2+p3)/2                       clean

								   add  esi,ebp                  ;  p4+p5

								  shr   edx,1                    ; (p3+p4)/2                       clean

								   lea  eax,[eax+ebx+001010101H] ; (p0+p1)/2+(p1+p2)/2+1

								  shr   esi,1                    ; (p4+p5)/2                       dirty

								   ;

								  and   esi,07F7F7F7FH           ; (p4+p5)/2                       clean

								   lea  ebx,[ebx+ecx+001010101H] ; (p1+p2)/2+(p2+p3)/2+1

								  shr   eax,1                    ; p1' = ((p0+p1)/2+(p1+p2)/2+1)/2 dirty

								   lea  ecx,[ecx+edx+001010101H] ; (p2+p3)/2+(p3+p4)/2+1

								  shr   ebx,1                    ; p2' = ((p1+p2)/2+(p2+p3)/2+1)/2 dirty

								   lea  edx,[edx+esi+001010101H] ; (p3+p4)/2+(p4+p5)/2+1

								  and   eax,07F7F7F7FH           ; p1'                             clean

								   and  ebx,07F7F7F7FH           ; p2'                             clean

								  shr   ecx,1                    ; p3' = ((p2+p3)/2+(p3+p4)/2+1)/2 dirty

								   mov  [edi+PITCH],eax          ; p1'

								  shr   edx,1                    ; p4' = ((p3+p4)/2+(p4+p5)/2+1)/2 dirty

								   mov  eax,[edi+PITCH*6]        ;  p6

								  and   ecx,07F7F7F7FH           ; p3'                             clean

								   and  edx,07F7F7F7FH           ; p4'                             clean

								  mov   [edi+PITCH*2],ebx        ; p2'

								   add  ebp,eax                  ;  p5+p6

								  shr   ebp,1                    ; (p5+p6)/2                       dirty

								   mov  ebx,[edi+PITCH*7]        ;  p7

								  add   eax,ebx                  ;  p6+p7

								   and  ebp,07F7F7F7FH           ; (p5+p6)/2                       clean

								  mov   [edi+PITCH*3],ecx        ; p3'

								   and  eax,0FEFEFEFEH           ; (p6+p7)/2                       pre-cleaned

								  shr   eax,1                    ; (p6+p7)/2                       clean

								   lea  esi,[esi+ebp+001010101H] ; (p4+p5)/2+(p5+p6)/2+1

								  shr   esi,1                    ; p5' = ((p4+p5)/2+(p5+p6)/2+1)/2 dirty

								   mov  [edi+PITCH*4],edx        ; p4'

								  lea   ebp,[ebp+eax+001010101H] ; (p5+p6)/2+(p6+p7)/2+1

								   and  esi,07F7F7F7FH           ; p5'                             clean

								  shr   ebp,1                    ; p6' = ((p5+p6)/2+(p6+p7)/2+1)/2 dirty

								   mov  [edi+PITCH*5],esi        ; p5'

								  and   ebp,07F7F7F7FH           ; p6'                             clean

								   add  edi,4

								  test  edi,00000000FH

								  mov   [edi+PITCH*6-4],ebp      ; p6'

								   jne  SpatialFilterVertLoop


								  add   edi,PITCH*8-16

								   mov  eax,VertFilterDoneAddr

								  cmp   eax,edi

								   jne  SpatialFilterVertLoop


								;  Activity Details for this section of code  (refer to flow diagram above):

								;

								;     9)  The SAD for the spatially filtered reference macroblock is calculated

								;         with half the pel differences accumulating into the low order half

								;         of ebp, and the other half into the high order half.

								;

								; Register usage for this section:

								;

								;   Input of this section:

								;

								;     edi -- Address of pel 0,0 of spatially filtered reference macroblock.

								;

								;   Predominate usage for body of this section:

								;

								;     edi -- Address of pel 0,0 of spatially filtered reference macroblock.

								;     esi, eax -- -8 times pel values from target macroblock.

								;     ebp[ 0:15] -- SAD Accumulator for half of the match points.

								;     ebp[16:31] -- SAD Accumulator for other half of the match points.

								;     edx[ 0: 7] -- Weighted difference for one pel.

								;     edx[ 8:15] -- Zero.

								;     edx[16:23] -- Weighted difference for another pel.

								;     edx[24:31] -- Zero.

								;     bl, cl -- Pel values from the spatially filtered reference macroblock.

								;

								; Expected Pentium (tm) microprocessor performance for this section:

								;

								;   Execution frequency:  Once per block for which motion analysis is done

								;                         beyond the 0-motion vector.

								;

								;   146 clocks instruction execution (typically).

								;     6 clocks for bank conflicts (1/8 chance with 48 dual mem ops).

								;     0 clocks for new cache line fills.

								;  ----

								;   152 clocks total time for this section.

								;


								SpatialFilterDone:


								  sub   edi,PITCH*8-8             ; Get to block 4.

								   xor  ebp,ebp

								  xor   ebx,ebx

								   xor  ecx,ecx


								SLFSWDLoop:


								  mov   eax,BlockNM1.N8T00        ; Get -8 times target Pel00.

								   mov  bl,[edi]                  ; Get Pel00 in spatially filtered reference.

								  mov   esi,BlockNM1.N8T04

								   mov  cl,[edi+4]

								  mov   edx,[eax+ebx*8]           ; Get abs diff for spatial filtered ref pel00.

								   mov  eax,BlockNM1.N8T02

								  mov   dl,[esi+ecx*8+2]          ; Get abs diff for spatial filtered ref pel04.

								   mov  bl,[edi+2]

								  mov   esi,BlockNM1.N8T06

								   mov  cl,[edi+6]

								  mov   ebp,edx

								   mov  edx,[eax+ebx*8]

								  mov   eax,BlockNM1.N8T11

								   mov  dl,[esi+ecx*8+2]

								  mov   bl,[edi+PITCH*1+1]

								   mov  cl,[edi+PITCH*1+5]

								  mov   esi,BlockNM1.N8T15

								   add  ebp,edx

								  mov   edx,[eax+ebx*8]

								   mov  eax,BlockNM1.N8T13

								  mov   dl,[esi+ecx*8+2]

								   mov  bl,[edi+PITCH*1+3]

								  mov   cl,[edi+PITCH*1+7]

								   mov  esi,BlockNM1.N8T17

								  add   ebp,edx

								   mov  edx,[eax+ebx*8]

								  mov   eax,BlockNM1.N8T20

								   mov  dl,[esi+ecx*8+2]

								  mov   bl,[edi+PITCH*2+0]

								   mov  cl,[edi+PITCH*2+4]

								  mov   esi,BlockNM1.N8T24

								   add  ebp,edx

								  mov   edx,[eax+ebx*8]

								   mov  eax,BlockNM1.N8T22

								  mov   dl,[esi+ecx*8+2]

								   mov  bl,[edi+PITCH*2+2]

								  mov   cl,[edi+PITCH*2+6]

								   mov  esi,BlockNM1.N8T26

								  add   ebp,edx

								   mov  edx,[eax+ebx*8]

								  mov   eax,BlockNM1.N8T31

								   mov  dl,[esi+ecx*8+2]

								  mov   bl,[edi+PITCH*3+1]

								   mov  cl,[edi+PITCH*3+5]

								  mov   esi,BlockNM1.N8T35

								   add  ebp,edx

								  mov   edx,[eax+ebx*8]

								   mov  eax,BlockNM1.N8T33

								  mov   dl,[esi+ecx*8+2]

								   mov  bl,[edi+PITCH*3+3]

								  mov   cl,[edi+PITCH*3+7]

								   mov  esi,BlockNM1.N8T37

								  add   ebp,edx

								   mov  edx,[eax+ebx*8]

								  mov   eax,BlockNM1.N8T40

								   mov  dl,[esi+ecx*8+2]

								  mov   bl,[edi+PITCH*4+0]

								   mov  cl,[edi+PITCH*4+4]

								  mov   esi,BlockNM1.N8T44

								   add  ebp,edx

								  mov   edx,[eax+ebx*8]

								   mov  eax,BlockNM1.N8T42

								  mov   dl,[esi+ecx*8+2]

								   mov  bl,[edi+PITCH*4+2]

								  mov   cl,[edi+PITCH*4+6]

								   mov  esi,BlockNM1.N8T46

								  add   ebp,edx

								   mov  edx,[eax+ebx*8]

								  mov   eax,BlockNM1.N8T51

								   mov  dl,[esi+ecx*8+2]

								  mov   bl,[edi+PITCH*5+1]

								   mov  cl,[edi+PITCH*5+5]

								  mov   esi,BlockNM1.N8T55

								   add  ebp,edx

								  mov   edx,[eax+ebx*8]

								   mov  eax,BlockNM1.N8T53

								  mov   dl,[esi+ecx*8+2]

								   mov  bl,[edi+PITCH*5+3]

								  mov   cl,[edi+PITCH*5+7]

								   mov  esi,BlockNM1.N8T57

								  add   ebp,edx

								   mov  edx,[eax+ebx*8]

								  mov   eax,BlockNM1.N8T60

								   mov  dl,[esi+ecx*8+2]

								  mov   bl,[edi+PITCH*6+0]

								   mov  cl,[edi+PITCH*6+4]

								  mov   esi,BlockNM1.N8T64

								   add  ebp,edx

								  mov   edx,[eax+ebx*8]

								   mov  eax,BlockNM1.N8T62

								  mov   dl,[esi+ecx*8+2]

								   mov  bl,[edi+PITCH*6+2]

								  mov   cl,[edi+PITCH*6+6]

								   mov  esi,BlockNM1.N8T66

								  add   ebp,edx

								   mov  edx,[eax+ebx*8]

								  mov   eax,BlockNM1.N8T71

								   mov  dl,[esi+ecx*8+2]

								  mov   bl,[edi+PITCH*7+1]

								   mov  cl,[edi+PITCH*7+5]

								  mov   esi,BlockNM1.N8T75

								   add  ebp,edx

								  mov   edx,[eax+ebx*8]

								   mov  eax,BlockNM1.N8T73

								  mov   dl,[esi+ecx*8+2]

								   mov  bl,[edi+PITCH*7+3]

								  mov   cl,[edi+PITCH*7+7]

								   mov  esi,BlockNM1.N8T77

								  add   ebp,edx

								   mov  edx,[eax+ebx*8]

								  add   edx,ebp

								   mov  cl,[esi+ecx*8+2]

								  shr   edx,16

								   add  ebp,ecx

								  and   ebp,0FFFFH

								   sub  esp,BlockLen

								  add   ebp,edx

								   sub  edi,8

								  test  esp,000000008H

								  mov   BlockN.CentralInterSWD_SLF,ebp

								   jne  SLFSWDLoop


								  test  esp,000000010H

								  lea   edi,[edi-PITCH*8+16]

								   jne  SLFSWDLoop


								  mov   eax,Block2.CentralInterSWD_SLF+BlockLen*4

								   mov  ebx,Block3.CentralInterSWD_SLF+BlockLen*4

								  mov   ecx,Block4.CentralInterSWD_SLF+BlockLen*4

								   add  esp,BlockLen*4

								  add   ebp,ecx

								   lea  edx,[eax+ebx]

								  add   ebp,edx

								   mov  edx,SpatialFiltDifferential

								  lea   esi,[edi+PITCH*8-8]

								   mov  edi,MBCentralInterSWD

								  sub   edi,edx

								   mov  edx,MBlockActionStream

								  cmp   ebp,edi

								   jge  SpatialFilterNotAsGood


								  mov   MBCentralInterSWD,ebp            ; Spatial filter was better.  Stash

								   mov  ebp,Block1.CentralInterSWD_SLF   ; pertinent calculations.

								  mov   Block2.CentralInterSWD,eax

								   mov  Block3.CentralInterSWD,ebx

								  mov   Block4.CentralInterSWD,ecx

								   mov  Block1.CentralInterSWD,ebp

								  mov   [edx].BlkY1.PastRef,esi

								   mov  al,INTERSLF

								  mov   [edx].BlockType,al


								SkipSpatialFiltering:

								SpatialFilterNotAsGood:

								ENDIF ; H261


								  mov   al,[edx].CodedBlocks       ; Fetch coded block pattern.

								   mov  edi,EmptyThreshold         ; Get threshold for forcing block empty?

								  mov   ebp,MBCentralInterSWD

								   mov  esi,InterSWDBlocks

								  mov   ebx,Block4.CentralInterSWD ; Is SWD > threshold?

								  cmp   ebx,edi

								   jg   @f


								  and   al,0F7H                    ; If not, indicate block 4 is NOT coded.

								   dec  esi

								  sub   ebp,ebx


								@@:


								  mov   ebx,Block3.CentralInterSWD

								  cmp   ebx,edi

								   jg   @f


								  and   al,0FBH

								   dec  esi

								  sub   ebp,ebx


								@@:


								  mov   ebx,Block2.CentralInterSWD

								  cmp   ebx,edi

								   jg   @f


								  and   al,0FDH

								   dec  esi

								  sub   ebp,ebx


								@@:


								  mov   ebx,Block1.CentralInterSWD

								  cmp   ebx,edi

								   jg   @f


								  and   al,0FEH

								   dec  esi

								  sub   ebp,ebx


								@@:


								  mov   [edx].CodedBlocks,al     ; Store coded block pattern.

								   add  esi,4

								  mov   InterSWDBlocks,esi

								   xor  ebx,ebx

								  and   eax,00FH

								   mov  MBCentralInterSWD,ebp

								  cmp   al,00FH                  ; Are any blocks marked empty?

								   jne  InterBest                ; If some blocks are empty, can't code as Intra


								  cmp   ebp,InterCodingThreshold ; Is InterSWD below inter-coding threshhold.

								   lea  esi,Block1+128

								  mov   ebp,0

								   jae  CalculateIntraSWD


								InterBest:


								  mov   ecx,InterSWDTotal

								   mov  ebp,MBCentralInterSWD

								  add   ecx,ebp                    ; Add to total for this macroblock class.

								   mov  PD [edx].SWD,ebp

								  mov   InterSWDTotal,ecx

								   jmp  NextMacroBlock


								;  Activity Details for this section of code  (refer to flow diagram above):

								;

								;    11)  The IntraSWD is calculated as two partial sums, one in the low order

								;         16 bits of ebp and one in the high order 16 bits.  An average pel

								;         value for each block will be calculated to the nearest half.

								;

								; Register usage for this section:

								;

								;   Input of this section:

								;

								;     None

								;

								;   Predominate usage for body of this section:

								;

								;     esi -- Address of target block 1 (3), plus 128.

								;     ebp[ 0:15] -- IntraSWD Accumulator for block 1 (3).

								;     ebp[16:31] -- IntraSWD Accumulator for block 2 (4).

								;     edi -- Block 2 (4) target pel, times -8, and with WeightedDiff added.

								;     edx -- Block 1 (3) target pel, times -8, and with WeightedDiff added.

								;     ecx[ 0: 7] -- Weighted difference for one pel in block 2 (4).

								;     ecx[ 8:15] -- Zero.

								;     ecx[16:23] -- Weighted difference for one pel in block 1 (3).

								;     ecx[24:31] -- Zero.

								;     ebx -- Average block 2 (4) target pel to nearest .5.

								;     eax -- Average block 1 (3) target pel to nearest .5.

								;

								;   Output of this section:

								;

								;     edi -- Scratch.

								;     ebp[ 0:15] -- IntraSWD.  (Also written to MBlockActionStream.)

								;     ebp[16:31] -- garbage.

								;     ebx -- Zero.

								;     eax -- MBlockActionStream.

								;

								; Expected Pentium (tm) microprocessor performance for this section:

								;

								;   Executed once per macroblock, (except for those for which one of more blocks

								;   are marked empty, or where the InterSWD is less than a threshold).

								;

								;   183 clocks for instruction execution

								;    12 clocks for bank conflicts  (94 dual mem ops with 1/8 chance of conflict)

								;  ----

								;   195 clocks total time for this section.


								IntraByDecree:


								  mov   eax,InterSWDBlocks            ; Inc by 4, because we will undo it below.

								   xor  ebp,ebp

								  mov   MBMotionVectors,ebp           ; Stash zero for MB level motion vectors.

								   mov  ebp,040000000H                ; Set Inter SWD artificially high.

								  lea   esi,Block1+128

								   add  eax,4

								  mov   MBCentralInterSWD,ebp

								   mov  InterSWDBlocks,eax


								CalculateIntraSWD:

								CalculateIntraSWDLoop:


								  mov   eax,[esi-128].AccumTargetPels  ; Fetch acc of target pels for 1st block.

								   mov  edx,[esi-128].N8T00

								  add   eax,8

								   mov  ebx,[esi-128+BlockLen].AccumTargetPels

								  shr   eax,4                ; Average block 1 target pel rounded to nearest .5.

								   add  ebx,8

								  shr   ebx,4

								   mov  edi,[esi-128+BlockLen].N8T00

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T02

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T02

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T04

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T04

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T06

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T06

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T11

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T11

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T13

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T13

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T15

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T15

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T17

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T17

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T20

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T20

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T22

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T22

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T24

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T24

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T26

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T26

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T31

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T31

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T33

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T33

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T35

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T35

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T37

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T37

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T40

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T40

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T42

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T42

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T44

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T44

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T46

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T46

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T51

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T51

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T53

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T53

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T55

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T55

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T57

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T57

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T60

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T60

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T62

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T62

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T64

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T64

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T66

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T66

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T71

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T71

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T73

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T73

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   edx,[esi-128].N8T75

								   mov  cl,PB [edi+ebx*4+2]

								  mov   edi,[esi-128+BlockLen].N8T75

								   add  ebp,ecx

								  mov   ecx,PD [edx+eax*4]

								   mov  edx,[esi-128].N8T77

								  mov   cl,PB [edi+ebx*4+2]

								   mov  edi,[esi-128+BlockLen].N8T77

								  add   ebp,ecx

								   mov  ecx,PD [edx+eax*4]

								  mov   cl,PB [edi+ebx*4+2]

								   mov  eax,000007FFFH

								  add   ebp,ecx

								   add  esi,BlockLen*2

								  and   eax,ebp

								   mov  ecx,MBCentralInterSWD

								  shr   ebp,16

								   sub  ecx,IntraCodingDifferential

								  add   ebp,eax

								   mov  edx,MBlockActionStream    ; Reload list ptr.

								  cmp   ecx,ebp                    ; Is IntraSWD > InterSWD - differential?

								   jl   InterBest


								  lea   ecx,Block1+128+BlockLen*2

								  cmp   ecx,esi

								   je   CalculateIntraSWDLoop


								;  ebp  -- IntraSWD

								;  edx  -- MBlockActionStream


								DoneCalcIntraSWD:


								IntraBest:


								  mov   ecx,IntraSWDTotal

								   mov  edi,IntraSWDBlocks

								  add   ecx,ebp                    ; Add to total for this macroblock class.

								   add  edi,4                      ; Accumulate # of blocks for this type.

								  mov   IntraSWDBlocks,edi

								   mov  edi,InterSWDBlocks

								  sub   edi,4

								   mov  IntraSWDTotal,ecx

								  mov   InterSWDBlocks,edi

								   mov  bl,INTRA

								  mov   PB [edx].BlockType,bl      ; Indicate macroblock handling decision.

								IFDEF H261

								   xor  ebx,ebx

								ELSE ; H263

								   mov  ebx,MBMotionVectors        ; Set MVs to best MB level motion vectors.

								ENDIF

								  mov   PD [edx].BlkY1.MVs,ebx

								   mov  PD [edx].BlkY2.MVs,ebx

								  mov   PD [edx].BlkY3.MVs,ebx

								   mov  PD [edx].BlkY4.MVs,ebx

								  xor   ebx,ebx

								  mov   PD [edx].SWD,ebp

								   jmp  NextMacroBlock


								;==============================================================================

								; Internal functions

								;==============================================================================


								DoSWDLoop:


								;  Upon entry:

								;    esi -- Points to ref1

								;    edi -- Points to ref2

								;    ecx -- Upper 24 bits zero

								;    ebx -- Upper 24 bits zero


								  mov   bl,PB [esi]               ; 00A -- Get Pel 00 in reference ref1.

								   mov  eax,Block1.N8T00+4        ; 00B -- Get -8 times target pel 00.

								  mov   cl,PB [edi]               ; 00C -- Get Pel 00 in reference ref2.

								   sub  esp,BlockLen*4+28


								SWDLoop:


								  mov   edx,PD [eax+ebx*8]        ; 00D -- Get weighted diff for ref1 pel 00.

								   mov  bl,PB [esi+2]             ; 02A

								  mov   dl,PB [eax+ecx*8+2]       ; 00E -- Get weighted diff for ref2 pel 00.

								   mov  eax,BlockN.N8T02+32       ; 02B

								  mov   ebp,edx                   ; 00F -- Accum weighted diffs for pel 00.

								   mov  cl,PB [edi+2]             ; 02C

								  mov   edx,PD [eax+ebx*8]        ; 02D

								   mov  bl,PB [esi+4]             ; 04A

								  mov   dl,PB [eax+ecx*8+2]       ; 02E

								   mov  eax,BlockN.N8T04+32       ; 04B

								  mov   cl,PB [edi+4]             ; 04C

								   add  ebp,edx                   ; 02F

								  mov   edx,PD [eax+ebx*8]        ; 04D

								   mov  bl,PB [esi+6]

								  mov   dl,PB [eax+ecx*8+2]       ; 04E

								   mov  eax,BlockN.N8T06+32

								  mov   cl,PB [edi+6]

								   add  ebp,edx                   ; 04F

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*1+1]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T11+32

								  mov   cl,PB [edi+PITCH*1+1]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*1+3]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T13+32

								  mov   cl,PB [edi+PITCH*1+3]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*1+5]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T15+32

								  mov   cl,PB [edi+PITCH*1+5]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*1+7]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T17+32

								  mov   cl,PB [edi+PITCH*1+7]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*2+0]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T20+32

								  mov   cl,PB [edi+PITCH*2+0]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*2+2]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T22+32

								  mov   cl,PB [edi+PITCH*2+2]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*2+4]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T24+32

								  mov   cl,PB [edi+PITCH*2+4]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*2+6]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T26+32

								  mov   cl,PB [edi+PITCH*2+6]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*3+1]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T31+32

								  mov   cl,PB [edi+PITCH*3+1]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*3+3]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T33+32

								  mov   cl,PB [edi+PITCH*3+3]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*3+5]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T35+32

								  mov   cl,PB [edi+PITCH*3+5]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*3+7]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T37+32

								  mov   cl,PB [edi+PITCH*3+7]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*4+0]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T40+32

								  mov   cl,PB [edi+PITCH*4+0]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*4+2]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T42+32

								  mov   cl,PB [edi+PITCH*4+2]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*4+4]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T44+32

								  mov   cl,PB [edi+PITCH*4+4]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*4+6]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T46+32

								  mov   cl,PB [edi+PITCH*4+6]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*5+1]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T51+32

								  mov   cl,PB [edi+PITCH*5+1]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*5+3]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T53+32

								  mov   cl,PB [edi+PITCH*5+3]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*5+5]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T55+32

								  mov   cl,PB [edi+PITCH*5+5]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*5+7]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T57+32

								  mov   cl,PB [edi+PITCH*5+7]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*6+0]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T60+32

								  mov   cl,PB [edi+PITCH*6+0]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*6+2]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T62+32

								  mov   cl,PB [edi+PITCH*6+2]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*6+4]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T64+32

								  mov   cl,PB [edi+PITCH*6+4]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*6+6]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T66+32

								  mov   cl,PB [edi+PITCH*6+6]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*7+1]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T71+32

								  mov   cl,PB [edi+PITCH*7+1]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*7+3]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T73+32

								  mov   cl,PB [edi+PITCH*7+3]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*7+5]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T75+32

								  mov   cl,PB [edi+PITCH*7+5]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   mov  bl,PB [esi+PITCH*7+7]

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,BlockN.N8T77+32

								  mov   cl,PB [edi+PITCH*7+7]

								   add  ebp,edx

								  mov   edx,PD [eax+ebx*8]

								   add  esp,BlockLen

								  mov   dl,PB [eax+ecx*8+2]

								   mov  eax,ebp

								  add   ebp,edx

								   add  edx,eax

								  shr   ebp,16                       ; Extract SWD for ref1.

								   and  edx,00000FFFFH               ; Extract SWD for ref2.

								  mov   esi,BlockN.Ref1Addr+32       ; Get address of next ref1 block.

								   mov  edi,BlockN.Ref2Addr+32       ; Get address of next ref2 block.

								  mov   BlockNM1.Ref1InterSWD+32,ebp ; Store SWD for ref1.

								   mov  BlockNM1.Ref2InterSWD+32,edx ; Store SWD for ref2.

								  mov   bl,PB [esi]                  ; 00A -- Get Pel 02 in reference ref1.

								   mov  eax,BlockN.N8T00+32          ; 00B -- Get -8 times target pel 00.

								  test  esp,000000018H               ; Done when esp is 32-byte aligned.

								  mov   cl,PB [edi]                  ; 00C -- Get Pel 02 in reference ref2.

								   jne  SWDLoop


								; Output:

								;    ebp -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4

								;    ecx -- Upper 24 bits zero

								;    ebx -- Upper 24 bits zero


								  add   esp,28

								  ret


								IFDEF H261

								ELSE ; H263


								DoSWDHalfPelHorzLoop:


								;    ebp -- Initialized to 0, except when can't search off left or right edge.

								;    edi -- Ref addr for block 1.  Ref1 is .5 pel to left.  Ref2 is .5 to right.


								  xor   ecx,ecx

								   sub  esp,BlockLen*4+28

								  xor   eax,eax

								   xor  ebx,ebx


								SWDHalfPelHorzLoop:


								  mov   al,[edi]           ; 00A -- Fetch center ref pel 00.

								   mov  esi,BlockN.N8T00+32; 00B -- Target pel 00 (times -8).

								  mov   bl,[edi+2]         ; 02A -- Fetch center ref pel 02.

								   mov  edx,BlockN.N8T02+32; 02B -- Target pel 02 (times -8).

								  lea   esi,[esi+eax*4]    ; 00C -- Combine target pel 00 and center ref pel 00.

								   mov  al,[edi-1]         ; 00D -- Get pel to left for match against pel 00.

								  lea   edx,[edx+ebx*4]    ; 02C -- Combine target pel 02 and center ref pel 02.

								   mov  bl,[edi+1]         ; 00E -- Get pel to right for match against pel 00,

								   ;                       ; 02D -- and pel to left for match against pel 02.

								  mov   ecx,[esi+eax*4]    ; 00F -- [16:23]  weighted diff for left ref pel 00.

								   mov  al,[edi+3]         ; 02E -- Get pel to right for match against pel 02.

								  add   ebp,ecx            ; 00G -- Accumulate left ref pel 00.

								   mov  ecx,[edx+ebx*4]    ; 02F -- [16:23]  weighted diff for left ref pel 02.

								  mov   cl,[edx+eax*4+2]   ; 02H -- [0:7] is weighted diff for right ref pel 02.

								   mov  al,[edi+4]         ; 04A

								  add   ebp,ecx            ; 02I -- Accumulate right ref pel 02,

								  ;                        ; 02G -- Accumulate left ref pel 02.

								   mov  bl,[esi+ebx*4+2]   ; 00H -- [0:7] is weighted diff for right ref pel 00.

								  add   ebp,ebx            ; 00I -- Accumulate right ref pel 00.

								   mov  esi,BlockN.N8T04+32; 04B

								  mov   bl,[edi+6]         ; 06A

								   mov  edx,BlockN.N8T06+32; 06B

								  lea   esi,[esi+eax*4]    ; 04C

								   mov  al,[edi+3]         ; 04D

								  lea   edx,[edx+ebx*4]    ; 06C

								   mov  bl,[edi+5]         ; 04E & 06D

								  mov   ecx,[esi+eax*4]    ; 04F

								   mov  al,[edi+7]         ; 06E

								  add   ebp,ecx            ; 04G

								   mov  ecx,[edx+ebx*4]    ; 06F

								  mov   cl,[edx+eax*4+2]   ; 06H

								   mov  al,[edi+PITCH*1+1] ; 11A

								  add   ebp,ecx            ; 04I & 06G

								   mov  bl,[esi+ebx*4+2]   ; 04H

								  add   ebp,ebx            ; 04I

								   mov  esi,BlockN.N8T11+32; 11B

								  mov   bl,[edi+PITCH*1+3] ; 13A

								   mov  edx,BlockN.N8T13+32; 13B

								  lea   esi,[esi+eax*4]    ; 11C

								   mov  al,[edi+PITCH*1+0] ; 11D

								  lea   edx,[edx+ebx*4]    ; 13C

								   mov  bl,[edi+PITCH*1+2] ; 11E & 13D

								  mov   ecx,[esi+eax*4]    ; 11F

								   mov  al,[edi+PITCH*1+4] ; 13E

								  add   ebp,ecx            ; 11G

								   mov  ecx,[edx+ebx*4]    ; 13F

								  mov   cl,[edx+eax*4+2]   ; 13H

								   mov  al,[edi+PITCH*1+5] ; 15A

								  add   ebp,ecx            ; 11I & 13G

								   mov  bl,[esi+ebx*4+2]   ; 11H

								  add   ebp,ebx            ; 11I

								   mov  esi,BlockN.N8T15+32; 15B

								  mov   bl,[edi+PITCH*1+7] ; 17A

								   mov  edx,BlockN.N8T17+32; 17B

								  lea   esi,[esi+eax*4]    ; 15C

								   mov  al,[edi+PITCH*1+4] ; 15D

								  lea   edx,[edx+ebx*4]    ; 17C

								   mov  bl,[edi+PITCH*1+6] ; 15E & 17D

								  mov   ecx,[esi+eax*4]    ; 15F

								   mov  al,[edi+PITCH*1+8] ; 17E

								  add   ebp,ecx            ; 15G

								   mov  ecx,[edx+ebx*4]    ; 17F

								  mov   cl,[edx+eax*4+2]   ; 17H

								   mov  al,[edi+PITCH*2+0] ; 20A

								  add   ebp,ecx            ; 15I & 17G

								   mov  bl,[esi+ebx*4+2]   ; 15H

								  add   ebp,ebx            ; 15I

								   mov  esi,BlockN.N8T20+32; 20B

								  mov   bl,[edi+PITCH*2+2] ; 22A

								   mov  edx,BlockN.N8T22+32; 22B

								  lea   esi,[esi+eax*4]    ; 20C

								   mov  al,[edi+PITCH*2-1] ; 20D

								  lea   edx,[edx+ebx*4]    ; 22C

								   mov  bl,[edi+PITCH*2+1] ; 20E & 22D

								  mov   ecx,[esi+eax*4]    ; 20F

								   mov  al,[edi+PITCH*2+3] ; 22E

								  add   ebp,ecx            ; 20G

								   mov  ecx,[edx+ebx*4]    ; 22F

								  mov   cl,[edx+eax*4+2]   ; 22H

								   mov  al,[edi+PITCH*2+4] ; 24A

								  add   ebp,ecx            ; 20I & 22G

								   mov  bl,[esi+ebx*4+2]   ; 20H

								  add   ebp,ebx            ; 20I

								   mov  esi,BlockN.N8T24+32; 24B

								  mov   bl,[edi+PITCH*2+6] ; 26A

								   mov  edx,BlockN.N8T26+32; 26B

								  lea   esi,[esi+eax*4]    ; 24C

								   mov  al,[edi+PITCH*2+3] ; 24D

								  lea   edx,[edx+ebx*4]    ; 26C

								   mov  bl,[edi+PITCH*2+5] ; 24E & 26D

								  mov   ecx,[esi+eax*4]    ; 24F

								   mov  al,[edi+PITCH*2+7] ; 26E

								  add   ebp,ecx            ; 24G

								   mov  ecx,[edx+ebx*4]    ; 26F

								  mov   cl,[edx+eax*4+2]   ; 26H

								   mov  al,[edi+PITCH*3+1] ; 31A

								  add   ebp,ecx            ; 24I & 26G

								   mov  bl,[esi+ebx*4+2]   ; 24H

								  add   ebp,ebx            ; 24I

								   mov  esi,BlockN.N8T31+32; 31B

								  mov   bl,[edi+PITCH*3+3] ; 33A

								   mov  edx,BlockN.N8T33+32; 33B

								  lea   esi,[esi+eax*4]    ; 31C

								   mov  al,[edi+PITCH*3+0] ; 31D

								  lea   edx,[edx+ebx*4]    ; 33C

								   mov  bl,[edi+PITCH*3+2] ; 31E & 33D

								  mov   ecx,[esi+eax*4]    ; 31F

								   mov  al,[edi+PITCH*3+4] ; 33E

								  add   ebp,ecx            ; 31G

								   mov  ecx,[edx+ebx*4]    ; 33F

								  mov   cl,[edx+eax*4+2]   ; 33H

								   mov  al,[edi+PITCH*3+5] ; 35A

								  add   ebp,ecx            ; 31I & 33G

								   mov  bl,[esi+ebx*4+2]   ; 31H

								  add   ebp,ebx            ; 31I

								   mov  esi,BlockN.N8T35+32; 35B

								  mov   bl,[edi+PITCH*3+7] ; 37A

								   mov  edx,BlockN.N8T37+32; 37B

								  lea   esi,[esi+eax*4]    ; 35C

								   mov  al,[edi+PITCH*3+4] ; 35D

								  lea   edx,[edx+ebx*4]    ; 37C

								   mov  bl,[edi+PITCH*3+6] ; 35E & 37D

								  mov   ecx,[esi+eax*4]    ; 35F

								   mov  al,[edi+PITCH*3+8] ; 37E

								  add   ebp,ecx            ; 35G

								   mov  ecx,[edx+ebx*4]    ; 37F

								  mov   cl,[edx+eax*4+2]   ; 37H

								   mov  al,[edi+PITCH*4+0] ; 40A

								  add   ebp,ecx            ; 35I & 37G

								   mov  bl,[esi+ebx*4+2]   ; 35H

								  add   ebp,ebx            ; 35I

								   mov  esi,BlockN.N8T40+32; 40B

								  mov   bl,[edi+PITCH*4+2] ; 42A

								   mov  edx,BlockN.N8T42+32; 42B

								  lea   esi,[esi+eax*4]    ; 40C

								   mov  al,[edi+PITCH*4-1] ; 40D

								  lea   edx,[edx+ebx*4]    ; 42C

								   mov  bl,[edi+PITCH*4+1] ; 40E & 42D

								  mov   ecx,[esi+eax*4]    ; 40F

								   mov  al,[edi+PITCH*4+3] ; 42E

								  add   ebp,ecx            ; 40G

								   mov  ecx,[edx+ebx*4]    ; 42F

								  mov   cl,[edx+eax*4+2]   ; 42H

								   mov  al,[edi+PITCH*4+4] ; 44A

								  add   ebp,ecx            ; 40I & 42G

								   mov  bl,[esi+ebx*4+2]   ; 40H

								  add   ebp,ebx            ; 40I

								   mov  esi,BlockN.N8T44+32; 44B

								  mov   bl,[edi+PITCH*4+6] ; 46A

								   mov  edx,BlockN.N8T46+32; 46B

								  lea   esi,[esi+eax*4]    ; 44C

								   mov  al,[edi+PITCH*4+3] ; 44D

								  lea   edx,[edx+ebx*4]    ; 46C

								   mov  bl,[edi+PITCH*4+5] ; 44E & 46D

								  mov   ecx,[esi+eax*4]    ; 44F

								   mov  al,[edi+PITCH*4+7] ; 46E

								  add   ebp,ecx            ; 44G

								   mov  ecx,[edx+ebx*4]    ; 46F

								  mov   cl,[edx+eax*4+2]   ; 46H

								   mov  al,[edi+PITCH*5+1] ; 51A

								  add   ebp,ecx            ; 44I & 46G

								   mov  bl,[esi+ebx*4+2]   ; 44H

								  add   ebp,ebx            ; 44I

								   mov  esi,BlockN.N8T51+32; 51B

								  mov   bl,[edi+PITCH*5+3] ; 53A

								   mov  edx,BlockN.N8T53+32; 53B

								  lea   esi,[esi+eax*4]    ; 51C

								   mov  al,[edi+PITCH*5+0] ; 51D

								  lea   edx,[edx+ebx*4]    ; 53C

								   mov  bl,[edi+PITCH*5+2] ; 51E & 53D

								  mov   ecx,[esi+eax*4]    ; 51F

								   mov  al,[edi+PITCH*5+4] ; 53E

								  add   ebp,ecx            ; 51G

								   mov  ecx,[edx+ebx*4]    ; 53F

								  mov   cl,[edx+eax*4+2]   ; 53H

								   mov  al,[edi+PITCH*5+5] ; 55A

								  add   ebp,ecx            ; 51I & 53G

								   mov  bl,[esi+ebx*4+2]   ; 51H

								  add   ebp,ebx            ; 51I

								   mov  esi,BlockN.N8T55+32; 55B

								  mov   bl,[edi+PITCH*5+7] ; 57A

								   mov  edx,BlockN.N8T57+32; 57B

								  lea   esi,[esi+eax*4]    ; 55C

								   mov  al,[edi+PITCH*5+4] ; 55D

								  lea   edx,[edx+ebx*4]    ; 57C

								   mov  bl,[edi+PITCH*5+6] ; 55E & 57D

								  mov   ecx,[esi+eax*4]    ; 55F

								   mov  al,[edi+PITCH*5+8] ; 57E

								  add   ebp,ecx            ; 55G

								   mov  ecx,[edx+ebx*4]    ; 57F

								  mov   cl,[edx+eax*4+2]   ; 57H

								   mov  al,[edi+PITCH*6+0] ; 60A

								  add   ebp,ecx            ; 55I & 57G

								   mov  bl,[esi+ebx*4+2]   ; 55H

								  add   ebp,ebx            ; 55I

								   mov  esi,BlockN.N8T60+32; 60B

								  mov   bl,[edi+PITCH*6+2] ; 62A

								   mov  edx,BlockN.N8T62+32; 62B

								  lea   esi,[esi+eax*4]    ; 60C

								   mov  al,[edi+PITCH*6-1] ; 60D

								  lea   edx,[edx+ebx*4]    ; 62C

								   mov  bl,[edi+PITCH*6+1] ; 60E & 62D

								  mov   ecx,[esi+eax*4]    ; 60F

								   mov  al,[edi+PITCH*6+3] ; 62E

								  add   ebp,ecx            ; 60G

								   mov  ecx,[edx+ebx*4]    ; 62F

								  mov   cl,[edx+eax*4+2]   ; 62H

								   mov  al,[edi+PITCH*6+4] ; 64A

								  add   ebp,ecx            ; 60I & 62G

								   mov  bl,[esi+ebx*4+2]   ; 60H

								  add   ebp,ebx            ; 60I

								   mov  esi,BlockN.N8T64+32; 64B

								  mov   bl,[edi+PITCH*6+6] ; 66A

								   mov  edx,BlockN.N8T66+32; 66B

								  lea   esi,[esi+eax*4]    ; 64C

								   mov  al,[edi+PITCH*6+3] ; 64D

								  lea   edx,[edx+ebx*4]    ; 66C

								   mov  bl,[edi+PITCH*6+5] ; 64E & 66D

								  mov   ecx,[esi+eax*4]    ; 64F

								   mov  al,[edi+PITCH*6+7] ; 66E

								  add   ebp,ecx            ; 64G

								   mov  ecx,[edx+ebx*4]    ; 66F

								  mov   cl,[edx+eax*4+2]   ; 66H

								   mov  al,[edi+PITCH*7+1] ; 71A

								  add   ebp,ecx            ; 64I & 66G

								   mov  bl,[esi+ebx*4+2]   ; 64H

								  add   ebp,ebx            ; 64I

								   mov  esi,BlockN.N8T71+32; 71B

								  mov   bl,[edi+PITCH*7+3] ; 73A

								   mov  edx,BlockN.N8T73+32; 73B

								  lea   esi,[esi+eax*4]    ; 71C

								   mov  al,[edi+PITCH*7+0] ; 71D

								  lea   edx,[edx+ebx*4]    ; 73C

								   mov  bl,[edi+PITCH*7+2] ; 71E & 73D

								  mov   ecx,[esi+eax*4]    ; 71F

								   mov  al,[edi+PITCH*7+4] ; 73E

								  add   ebp,ecx            ; 71G

								   mov  ecx,[edx+ebx*4]    ; 73F

								  mov   cl,[edx+eax*4+2]   ; 73H

								   mov  al,[edi+PITCH*7+5] ; 75A

								  add   ebp,ecx            ; 71I & 73G

								   mov  bl,[esi+ebx*4+2]   ; 71H

								  add   ebp,ebx            ; 71I

								   mov  esi,BlockN.N8T75+32; 75B

								  mov   bl,[edi+PITCH*7+7] ; 77A

								   mov  edx,BlockN.N8T77+32; 77B

								  lea   esi,[esi+eax*4]    ; 75C

								   mov  al,[edi+PITCH*7+4] ; 75D

								  lea   edx,[edx+ebx*4]    ; 77C

								   mov  bl,[edi+PITCH*7+6] ; 75E & 77D

								  mov   ecx,[esi+eax*4]    ; 75F

								   mov  al,[edi+PITCH*7+8] ; 77E

								  add   ebp,ecx            ; 75G

								   mov  ecx,[edx+ebx*4]    ; 77F

								  mov   cl,[edx+eax*4+2]   ; 77H

								   add  esp,BlockLen

								  add   ecx,ebp            ; 75I & 77G

								   mov  bl,[esi+ebx*4+2]   ; 75H

								  add   ebx,ecx            ; 75I

								   mov  edi,BlockN.AddrCentralPoint+32 ; Get address of next ref1 block.

								  shr   ecx,16                         ; Extract SWD for ref1.

								   and  ebx,00000FFFFH                 ; Extract SWD for ref2.

								  mov   BlockNM1.Ref1InterSWD+32,ecx   ; Store SWD for ref1.

								   mov  BlockNM1.Ref2InterSWD+32,ebx   ; Store SWD for ref2.

								  xor   ebp,ebp

								   mov  edx,ebx

								  test  esp,000000018H

								  mov   ebx,ebp

								   jne  SWDHalfPelHorzLoop


								; Output:

								;    ebp, ebx -- Zero

								;    ecx -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4


								  add  esp,28

								  ret


								DoSWDHalfPelVertLoop:


								;    ebp -- Initialized to 0, except when can't search off left or right edge.

								;    edi -- Ref addr for block 1.  Ref1 is .5 pel up.  Ref2 is .5 down.


								  xor   ecx,ecx

								   sub  esp,BlockLen*4+28

								  xor   eax,eax

								   xor  ebx,ebx


								SWDHalfPelVertLoop:


								  mov   al,[edi]

								   mov  esi,BlockN.N8T00+32

								  mov   bl,[edi+2*PITCH]

								   mov  edx,BlockN.N8T20+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi-1*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+1*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+3*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+4*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T40+32

								  mov   bl,[edi+6*PITCH]

								   mov  edx,BlockN.N8T60+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+3*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+5*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+7*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+1+1*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T11+32

								  mov   bl,[edi+1+3*PITCH]

								   mov  edx,BlockN.N8T31+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+1+0*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+1+2*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+1+4*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+1+5*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T51+32

								  mov   bl,[edi+1+7*PITCH]

								   mov  edx,BlockN.N8T71+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+1+4*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+1+6*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+1+8*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+2+0*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T02+32

								  mov   bl,[edi+2+2*PITCH]

								   mov  edx,BlockN.N8T22+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+2-1*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+2+1*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+2+3*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+2+4*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T42+32

								  mov   bl,[edi+2+6*PITCH]

								   mov  edx,BlockN.N8T62+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+2+3*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+2+5*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+2+7*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+3+1*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T13+32

								  mov   bl,[edi+3+3*PITCH]

								   mov  edx,BlockN.N8T33+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+3+0*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+3+2*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+3+4*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+3+5*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T53+32

								  mov   bl,[edi+3+7*PITCH]

								   mov  edx,BlockN.N8T73+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+3+4*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+3+6*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+3+8*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+4+0*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T04+32

								  mov   bl,[edi+4+2*PITCH]

								   mov  edx,BlockN.N8T24+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+4-1*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+4+1*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+4+3*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+4+4*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T44+32

								  mov   bl,[edi+4+6*PITCH]

								   mov  edx,BlockN.N8T64+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+4+3*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+4+5*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+4+7*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+5+1*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T15+32

								  mov   bl,[edi+5+3*PITCH]

								   mov  edx,BlockN.N8T35+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+5+0*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+5+2*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+5+4*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+5+5*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T55+32

								  mov   bl,[edi+5+7*PITCH]

								   mov  edx,BlockN.N8T75+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+5+4*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+5+6*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+5+8*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+6+0*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T06+32

								  mov   bl,[edi+6+2*PITCH]

								   mov  edx,BlockN.N8T26+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+6-1*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+6+1*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+6+3*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+6+4*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T46+32

								  mov   bl,[edi+6+6*PITCH]

								   mov  edx,BlockN.N8T66+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+6+3*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+6+5*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+6+7*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+7+1*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T17+32

								  mov   bl,[edi+7+3*PITCH]

								   mov  edx,BlockN.N8T37+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+7+0*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+7+2*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+7+4*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   mov  al,[edi+7+5*PITCH]

								  add   ebp,ecx

								   mov  bl,[esi+ebx*4+2]

								  add   ebp,ebx

								   mov  esi,BlockN.N8T57+32

								  mov   bl,[edi+7+7*PITCH]

								   mov  edx,BlockN.N8T77+32

								  lea   esi,[esi+eax*4]

								   mov  al,[edi+7+4*PITCH]

								  lea   edx,[edx+ebx*4]

								   mov  bl,[edi+7+6*PITCH]

								  mov   ecx,[esi+eax*4]

								   mov  al,[edi+7+8*PITCH]

								  add   ebp,ecx

								   mov  ecx,[edx+ebx*4]

								  mov   cl,[edx+eax*4+2]

								   add  esp,BlockLen

								  add   ecx,ebp

								   mov  bl,[esi+ebx*4+2]

								  add   ebx,ecx

								   mov  edi,BlockN.AddrCentralPoint+32

								  shr   ecx,16

								   and  ebx,00000FFFFH

								  mov   BlockNM1.Ref1InterSWD+32,ecx

								   mov  BlockNM1.Ref2InterSWD+32,ebx

								  xor   ebp,ebp

								   mov  edx,ebx

								  test  esp,000000018H

								  mov   ebx,ebp

								   jne  SWDHalfPelVertLoop


								; Output:

								;    ebp, ebx -- Zero

								;    ecx -- Ref1 SWD for block 4

								;    edx -- Ref2 SWD for block 4


								  add  esp,28

								  ret


								ENDIF ; H263


								; Performance for common macroblocks:

								;   298 clocks:  prepare target pels, compute avg target pel, compute 0-MV SWD.

								;    90 clocks:  compute IntraSWD.

								;  1412 clocks:  6-level search for best SWD.

								;    16 clocks:  record best fit.

								;   945 clocks:  calculate spatial loop filtered prediction.

								;   152 clocks:  calculate SWD for spatially filtered prediction and classify.

								;  ----

								;  2913 clocks total

								;

								; Performance for macroblocks in which 0-motion vector is "good enough":

								;   298 clocks:  prepare target pels, compute avg target pel, compute 0-MV SWD.

								;    90 clocks:  compute IntraSWD.

								;    16 clocks:  record best fit.

								;    58 clocks:  extra cache fill burden on adjacent MB if SWD-search not done.

								;   945 clocks:  calculate spatial loop filtered prediction.

								;   152 clocks:  calculate SWD for spatially filtered prediction and classify.

								;  ----

								;  1559 clocks total

								;

								; Performance for macroblocks marked as intrablock by decree of caller:

								;   298 clocks:  prepare target pels, compute avg target pel, compute 0-MV SWD.

								;    90 clocks:  compute IntraSWD.

								;    58 clocks:  extra cache fill burden on adjacent MB if SWD-search not done.

								;    20 clocks:  classify (just weight the SWD for # of match points).

								;  ----

								;   476 clocks total

								;

								; 160*120 performance, generously estimated (assuming lots of motion):

								;

								;  2913 * 80 = 233000 clocks for luma.

								;  2913 * 12 =  35000 clocks for chroma.

								;              268000 clocks per frame * 15 = 4,020,000 clocks/sec.

								;

								; 160*120 performance, assuming typical motion:

								;

								;  2913 * 40 + 1559 * 40 = 179000 clocks for luma.

								;  2913 *  8 + 1559 *  4 =  30000 clocks for chroma.

								;                          209000 clocks per frame * 15 = 3,135,000 clocks/sec.

								;

								; Add 10-20% to allow for initial cache-filling, and unfortunate cases where

								; cache-filling policy preempts areas of the tables that are not locally "hot",

								; instead of preempting macroblocks upon which the processing was just finished.


								Done:


								  mov   eax,IntraSWDTotal

								  mov   ebx,IntraSWDBlocks

								  mov   ecx,InterSWDTotal

								  mov   edx,InterSWDBlocks

								  mov   esp,StashESP

								  mov   edi,[esp+IntraSWDTotal_arg]

								  mov   [edi],eax

								  mov   edi,[esp+IntraSWDBlocks_arg]

								  mov   [edi],ebx

								  mov   edi,[esp+InterSWDTotal_arg]

								  mov   [edi],ecx

								  mov   edi,[esp+InterSWDBlocks_arg]

								  mov   [edi],edx

								  pop   ebx

								  pop   ebp

								  pop   edi

								  pop   esi

								  rturn


								MOTIONESTIMATION endp


								END