;--------------------------------------------------------------------------;
;       INTEL Corporation Proprietary Information
;
;       This listing is supplied under the terms of a license
;       agreement with INTEL Corporation and may not be copied
;       nor disclosed except in accordance with the terms of
;       that agreement.
;
;       Copyright (c) 1996 Intel Corporation.
;       All Rights Reserved.
;
;--------------------------------------------------------------------------;

;--------------------------------------------------------------------------;
;
; D3mBiMot.asm
;
; Description:
;   This module does bi-directional motion compensated prediction for
;   B frames.  It is called after forward prediction has been computed
;   and will average in the backward prediction for those pels where
;   the backward motion vector points inside of the referenced P frame.
;
; MMX Version
;
; Routines:                        prototypes in:
;   MMX_BiMotionComp               none
;
;--------------------------------------------------------------------------;

;--------------------------------------------------------------------------;
;
; $Header:   S:\h26x\src\dec\d3mbimot.asv   1.2   01 Apr 1996 12:35:48   RMCKENZX  $
; $Log:   S:\h26x\src\dec\d3mbimot.asv  $
;//
;//   Rev 1.2   01 Apr 1996 12:35:48   RMCKENZX
;//
;// Added MMXCODE1 and MMXDATA1 segments, moved global data
;// to MMXDATA1 segment.
;//
;//   Rev 1.1   14 Mar 1996 13:58:00   RMCKENZX
;//
;// Optimized routine for speed of execution.
;//
;//   Rev 1.0   07 Mar 1996 18:36:36   RMCKENZX
;//
;// Initial revision.
;
;--------------------------------------------------------------------------;

;--------------------------------------------------------------------------;
;
; Routine Name:
;   MMX_BiMotionComp(U32, U32, I32, I32, I32)
;
; Inputs -- C calling convention:
;   pPrev   flat pointer to prediction from previous P frame,
;           used for "forward" motion vector prediction.
;   pCurr   flat pointer into current P frame,
;           to be used for "backward" motion vector prediction.
;   mvx     x component of backward motion vector.
;   mvy     y component of backward motion vector.
;   iNum    block number.
;
; Returns:
;   updates the values pointed to by pPrev.
;
;--------------------------------------------------------------------------;
;
; Version:  .006
; Date:     14 March 1996
; Author:   R. McKenzie
;
;--------------------------------------------------------------------------;

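; Equivalent C-style prototype (a sketch; the original supplies none --
; parameter types follow the U32/I32 signature above, names follow the
; Inputs list):
;
;    void MMX_BiMotionComp(U32 pPrev, U32 pCurr, I32 mvx, I32 mvy, I32 iNum);
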
.586
.MODEL FLAT

; make all symbols case sensitive
OPTION CASEMAP:NONE

.xlist
include iammx.inc
.list

MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

;-------------------;
;     Stack Use     ;
;-------------------;

; register storage (rel to old stack ptr as saved in ebp)
;   esi   ebp+00
;   edi   ebp+04
;   ebp   ebp+08
;   ebx   ebp+12

; return address    ebp+16

; C input parameters
pPrev     EQU   ebp+20
pCurr     EQU   ebp+24
mvx       EQU   ebp+28
mvy       EQU   ebp+32
iNum      EQU   ebp+36


; local variables
uColEnd   EQU   esp+00
uRowEnd   EQU   esp+02
uColStart EQU   esp+04
uRowStart EQU   esp+06
mmxTempL  EQU   esp+08
mmxTempH  EQU   esp+16


PITCH     =     384
FRAMESIZE =     32

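; Note: the four clip results are written with a single 8-byte store
; (movq [uColEnd], mm0 in the Set Up code); the word layout, low to high,
; is uColEnd, uRowEnd, uColStart, uRowStart -- matching the EQUates above.
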
MMXDATA1 SEGMENT
ALIGN 8
;                           End                  Start
;                       Row     Col          Row     Col
;                        y       x            y       x
mmxFudge        DWORD   001e001eh,           00010001h
                DWORD   001e000eh,           0001fff1h
                DWORD   000e001eh,           0fff10001h
                DWORD   000e000eh,           0fff1fff1h
                DWORD   000e000eh,           00010001h
                DWORD   000e000eh,           00010001h

mmxClipT        DWORD   7ff87ff8h, 7ff77ff7h
mmxClipB        DWORD   7ff77ff7h, 7ff77ff7h
;                                                 start
ColStartMask    DWORD   0ffffffffh, 0ffffffffh  ;   0
                DWORD   0ffffff00h, 0ffffffffh  ;   1
                DWORD   0ffff0000h, 0ffffffffh  ;   2
                DWORD   0ff000000h, 0ffffffffh  ;   3
                DWORD    00000000h, 0ffffffffh  ;   4
                DWORD    00000000h, 0ffffff00h  ;   5
                DWORD    00000000h, 0ffff0000h  ;   6
                DWORD    00000000h, 0ff000000h  ;   7   end
ColEndMask      DWORD    00000000h,  00000000h  ;   8    0
                DWORD    000000ffh,  00000000h  ;        1
                DWORD    0000ffffh,  00000000h  ;        2
                DWORD    00ffffffh,  00000000h  ;        3
                DWORD   0ffffffffh,  00000000h  ;        4
                DWORD   0ffffffffh,  000000ffh  ;        5
                DWORD   0ffffffffh,  0000ffffh  ;        6
                DWORD   0ffffffffh,  00ffffffh  ;        7
                DWORD   0ffffffffh, 0ffffffffh  ;        8

ShiftMask       DWORD   7f7f7f7fh, 7f7f7f7fh    ; used for byte shifts
BottomBitMask   DWORD   01010101h, 01010101h    ; used for packed averages
Round1          DWORD   00010001h, 00010001h

MMXDATA1 ENDS


;-------------------;
;      Set Up       ;
;-------------------;
MMXCODE1 SEGMENT

PUBLIC C MMX_BiMotionComp

MMX_BiMotionComp:
   push       ebx
   push       ebp

   push       edi
   push       esi

   mov        ebp, esp
   and        esp, -32             ; align the stack on a cache line

   sub        esp, FRAMESIZE       ; make room for locals

   mov        edi, [iNum]
   mov        esi, [pCurr]

;                                            start         end
   movd       mm1, [mvx]           ; mm1 = 0000 0000  .... .mvx

   movd       mm2, [mvy]           ; mm2 = 0000 0000  .... .mvy

   movq       mm0, [mmxFudge+8*edi]
   punpcklwd  mm1, mm2             ; mm1 = .... ....  .mvy .mvx

   movq       mm3, [mmxClipT]
   punpckldq  mm1, mm1             ; mm1 = .mvy .mvx  .mvy .mvx

   movq       mm4, [mmxClipB]
   psubw      mm0, mm1

   mov        edi, [pPrev]
   psraw      mm0, 1               ; mm0 = RowStart ColStart RowEnd ColEnd

   mov        ebx, [mvy]
   paddsw     mm0, mm3             ; clip at 8 or higher

   and        ebx, -2              ; 2*(mvy>>1)
   psubusw    mm0, mm4             ; clip at 0 or lower

   shl        ebx, 6               ; 128*(mvy>>1)
   mov        eax, [mvx]

   movq       [uColEnd], mm0

   sar        eax, 1               ; mvx>>1
   lea        ebx, [ebx+2*ebx]     ; PITCH*(mvy>>1)

   add        esi, ebx             ; pCurr += PITCH*(mvy>>1)
   xor        ecx, ecx

   add        esi, eax             ; pCurr += mvx>>1
   xor        edx, edx

   mov        cl, [uColStart]      ; uColStart
   mov        dl, [uColEnd]        ; uColEnd

   cmp        ecx, edx             ; iColCount = ColStart - ColEnd
   jge        hasta_la_vista_baby

   movq       mm6, ColStartMask[8*ecx]

   movq       mm7, ColEndMask[8*edx]
   pxor       mm4, mm4             ; mm4 = 0

   mov        cl, [uRowStart]      ; RowStart
   mov        dl, [uRowEnd]        ; RowEnd

   sub        edx, ecx             ; iRowCount = RowEnd - RowStart
   jle        hasta_la_vista_baby

   pand       mm7, mm6             ; mm7 = ff for those cols to use back pred.
   pxor       mm6, mm6

   shl        ecx, 7               ; 128*RowStart
   mov        eax, [mvx]

   movq       mm5, [ShiftMask]     ; mm5 = 7f 7f 7f 7f 7f 7f 7f 7f
   pcmpeqb    mm6, mm7             ; mm6 is the complement of mm7

   lea        ecx, [ecx+2*ecx]     ; PITCH*RowStart
   mov        ebx, [mvy]

   add        esi, ecx             ; pCurr += PITCH*RowStart
   add        edi, ecx             ; pPrev += PITCH*RowStart

   mov        ecx, PITCH


   and        eax, 1
   je         even_mvx

   and        ebx, 1
   je         odd_even

;
;  mvx is odd  (horizontal half pel motion)
;  mvy is odd  (vertical half pel motion)
;
odd_odd:
   movq       mm0, [esi+4]

   movq       mm1, mm0
   psrlq      mm0, 8

   movq       mm2, [esi]
   punpcklbw  mm1, mm4

   movq       mm3, mm2
   punpcklbw  mm0, mm4

   paddw      mm0, mm1
   psrlq      mm2, 8

   paddw      mm0, [Round1]
   punpcklbw  mm3, mm4

   punpcklbw  mm2, mm4
   add        esi, ecx

   movq       [mmxTempH], mm0
   paddw      mm2, mm3

   paddw      mm2, [Round1]

   sub        edi, ecx             ; pre-decrement destination pointer

   movq       [mmxTempL], mm2


;
;  This loop is 2-folded and works on 2 results (rows) per pass.
;  It finishes one result per iteration.
;
;  Stage I
;    computes the partial sums of a row with a shifted copy of the row.
;    It stores the partial sums for the next iteration's Stage II.
;  Stage II
;    reads the partial sums of the prior row and averages them with the
;    just computed (in Stage I) partial sums of the current row to get
;    the backward prediction.  These computations are done unpacked as
;    16-bit words.  A rounding factor is added to each partial sum before
;    storage.  Then Stage II averages the result (with truncation) with
;    the forward prediction.
;
;  Those bytes of the backward prediction which are not to be used are
;  replaced by the corresponding bytes of the forward prediction prior
;  to averaging (using the masks in registers mm6 and mm7).
;
;  Averaging of the forward with backward is done packed in 8-bit bytes by
;  dividing both inputs by 2, adding them together, and then adding in an
;  adjustment.  To average with truncation, the adjustment is 1 when BOTH
;  inputs are odd.  Due to the absence of a byte shift instruction, divide
;  by 2 is done by shifting the entire mmx register and then masking off
;  (zeroing) bits 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
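; In C-style pseudocode, one output pel of this case looks like the sketch
; below (illustration only, not original code; "sum_cur"/"sum_prev" are
; hypothetical names for the Round1-adjusted partial sums, and the mm6/mm7
; byte replacement is omitted):
;
;    sum_cur[x] = cur[x] + cur[x+1] + 1;              // Stage I, plus Round1
;    back[x]    = (sum_cur[x] + sum_prev[x]) >> 2;    // Stage II
;    res[x]     = (fwd[x] >> 1) + (back[x] >> 1)
;               + (fwd[x] & back[x] & 1);             // truncated average
;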
OddOddLoop:
   movq       mm1, [esi]           ; load left half

   movq       mm0, mm1             ; copy left half
   psrlq      mm1, 8               ; shift left over

   movq       mm3, [esi+4]         ; load right half
   punpcklbw  mm0, mm4             ; unpack left half

   movq       mm2, mm3             ; copy right half
   punpcklbw  mm1, mm4             ; unpack shifted left half

   paddw      mm1, mm0             ; add left side
   psrlq      mm3, 8               ; shift right over

   paddw      mm1, [Round1]        ; add in round to left
   punpcklbw  mm2, mm4             ; unpack right half

   movq       mm0, [mmxTempL]      ; fetch prior row's left half
   punpcklbw  mm3, mm4             ; unpack shifted right half

   movq       [mmxTempL], mm1      ; stash this row's left half
   paddw      mm3, mm2             ; add right side

   paddw      mm3, [Round1]        ; add in round to right
   paddw      mm0, mm1             ; sum current & prior lefts

   movq       mm2, [mmxTempH]      ; fetch prior row's right half
   psrlw      mm0, 2               ; divide left sum by four

   movq       [mmxTempH], mm3      ; stash this row's right half
   paddw      mm2, mm3             ; sum current & prior rights

   movq       mm1, [edi+ecx]       ; fetch forward prediction
   psrlw      mm2, 2               ; divide right sum by four

   packuswb   mm0, mm2             ; complete backward prediction
   movq       mm2, mm1             ; copy forward

   pand       mm0, mm7             ; mask off unused bytes
   pand       mm2, mm6             ; create replacement bytes

   por        mm0, mm2             ; new backward prediction
   movq       mm3, mm1             ; copy forward for adjustment

   pand       mm3, mm0             ; adjustment with truncation
   psrlq      mm0, 1               ; divide new backward by 2

   pand       mm0, mm5             ; clear extra bits
   psrlq      mm1, 1               ; divide forward by 2

   pand       mm3, [BottomBitMask] ; complete adjustment
   pand       mm1, mm5             ; clear extra bits

   paddb      mm0, mm1             ; sum quotients
   add        edi, ecx             ; increment destination pointer

   paddb      mm0, mm3             ; add adjustment
   add        esi, ecx             ; increment source pointer

   movq       [edi], mm0           ; store result
                                   ; *** 1 cycle store penalty ***

   dec        edx                  ; decrement loop control
   jg         OddOddLoop           ; back up if not done


; wrap up and go home
   mov        esp, ebp

   pop        esi
   pop        edi

   pop        ebp
   pop        ebx

   ret


;
;  mvx is odd   (horizontal half pel motion)
;  mvy is even  (vertical full pel motion)
;
odd_even:
   sub        edi, ecx             ; pre-decrement destination pointer

;
;  This loop is not folded and does 1 result (row) per pass.
;
;  It loads the backward predicted row into mm0 and brings in the last
;  (eighth) byte through al, which is or'd with the shifted row.  It
;  completes the backward prediction (by averaging the row with its
;  shifted copy, with round) and averages the result (with truncation)
;  with the forward prediction.
;  Those bytes of the backward prediction which are not to be used are
;  replaced by the corresponding bytes of the forward prediction prior
;  to averaging (using the masks in registers mm6 and mm7).
;
;  Averaging is done by dividing both inputs by 2, adding them together,
;  and then adding in an adjustment.
;  To average with round, the adjustment is 1 when EITHER input is odd.
;  To average with truncation, the adjustment is 1 when BOTH inputs are odd.
;  Due to the absence of a byte shift instruction, divide by 2 is done
;  by shifting the entire mmx register and then masking off (zeroing) bits
;  7, 15, ..., and 63 (the old low-order bits) using mm5.
;
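; The two averaging flavors, written out as C-style one-liners (a sketch
; of the rule above, not original code; a and b are byte operands):
;
;    avg_round(a, b) = (a >> 1) + (b >> 1) + ((a | b) & 1);
;    avg_trunc(a, b) = (a >> 1) + (b >> 1) + (a & b & 1);
;
; The psrlq/pand-mm5 pairs below implement the byte-wise ">> 1", and
; BottomBitMask picks up the adjustment bit in all eight byte lanes.
;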
OddEvenLoop:
   movq       mm0, [esi]           ; fetch backward predicted row

   mov        al, [esi+8]          ; fetch last byte
   movq       mm1, mm0             ; copy row

   movd       mm2, eax             ; last byte
   psrlq      mm0, 8               ; shift row right 1 byte

   movq       mm3, mm1             ; copy row for adjustment
   psllq      mm2, 56              ; move last byte to left end

   por        mm0, mm2             ; or in last byte on left
   psrlq      mm1, 1               ; divide row by 2

   por        mm3, mm0             ; averaging with rounding bit
   psrlq      mm0, 1               ; divide shifted row by 2

   pand       mm0, mm5             ; clear extra bits
   pand       mm1, mm5             ; clear extra bits

   pand       mm3, [BottomBitMask] ; finish adjustment (with round)
   paddb      mm0, mm1             ; sum quotients

   movq       mm4, [edi+ecx]       ; fetch forward prediction
   paddb      mm3, mm0             ; add adjustment, got back pred.

   movq       mm2, mm4             ; copy forward
   pand       mm3, mm7             ; mask off unused bytes

   movq       mm1, mm4             ; copy forward
   pand       mm2, mm6             ; mask forward copy

   por        mm3, mm2             ; backward with forward replacing
   psrlq      mm4, 1               ; divide forward by 2

   pand       mm1, mm3             ; adjustment for truncation
   psrlq      mm3, 1               ; divide backward by 2

   pand       mm3, mm5             ; clear extra bits
   pand       mm4, mm5             ; clear extra bits

   pand       mm1, [BottomBitMask] ; finish adjustment (with truncation)
   paddb      mm4, mm3             ; sum quotients

   paddb      mm4, mm1             ; add adjustment, have result
   add        edi, ecx             ; increment destination pointer

   add        esi, ecx             ; increment source pointer
   dec        edx                  ; decrement loop control

   movq       [edi], mm4           ; save result

   jg         OddEvenLoop          ; loop when not done


; wrap up and go home
   mov        esp, ebp

   pop        esi
   pop        edi

   pop        ebp
   pop        ebx

   ret

;---------------------------;
;  mvx is even -- test mvy  ;
;---------------------------;
even_mvx:
   and        ebx, 1
   je         even_even

;
;  mvx is even  (horizontal full pel motion)
;  mvy is odd   (vertical half pel motion)
;
even_odd:
   movq       mm0, [esi]           ; 1: first row

   movq       mm1, [esi+ecx]       ; 1: second row
   movq       mm2, mm0             ; 1: copy for rounding

   por        mm2, mm1             ; 1: averaging with round
   sub        edi, ecx             ; pre-decrement destination pointer

   dec        edx                  ; note that edx is positive on entry
   jz         EvenOddPost

;
;  This loop is 2-folded and works on 2 results (rows) per pass.
;  It finishes one result per iteration.
;
;  Stage I
;    loads both backward predicted rows into mm0 and mm1, copies the first
;    into mm2, and ors with the second for the rounding adjustment.
;  Stage II
;    completes the backward prediction (by averaging the rows with round)
;    and averages the result (with truncation) with the forward prediction.
;    Those bytes of the backward prediction which are not to be used are
;    replaced by the corresponding bytes of the forward prediction prior
;    to averaging (using the masks in registers mm6 and mm7).
;
;  Averaging is done by dividing both inputs by 2, adding them together,
;  and then adding in an adjustment (in mm2).
;  To average with round, the adjustment is 1 when EITHER input is odd.
;  To average with truncation, the adjustment is 1 when BOTH inputs are odd.
;  Due to the absence of a byte shift instruction, divide by 2 is done
;  by shifting the entire mmx register and then masking off (zeroing) bits
;  7, 15, ..., and 63 (the old low-order bits) using mm5.
;
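; Net effect per pel, in the avg_round/avg_trunc notation sketched before
; OddEvenLoop (illustration only; cur[x+PITCH] is the pel one row below):
;
;    res[x] = avg_trunc(fwd[x], avg_round(cur[x], cur[x+PITCH]))
;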
EvenOddLoop:
   psrlq      mm0, 1               ; 2: divide first row by 2
   add        edi, ecx             ;    increment destination pointer

   psrlq      mm1, 1               ; 2: divide second row by 2
   pand       mm0, mm5             ; 2: clear extra bits

   pand       mm2, [BottomBitMask] ; 2: rounding bits
   pand       mm1, mm5             ; 2: clear extra bits

   movq       mm3, [edi]           ; 2: fetch forward prediction
   paddb      mm1, mm0             ; 2: average backward rows

   paddb      mm1, mm2             ; 2: add in round
   movq       mm4, mm3             ; 2: copy for mask

   pand       mm1, mm7             ; 2: masked backward prediction
   pand       mm4, mm6             ; 2: masked forward prediction

   por        mm4, mm1             ; 2: adjusted backward prediction
   movq       mm2, mm3             ; 2: copy for rounding

   pand       mm2, mm4             ; 2: averaging with truncation
   psrlq      mm4, 1               ; 2: divide backward by 2

   psrlq      mm3, 1               ; 2: divide forward by 2
   pand       mm4, mm5             ; 2: clear extra bits

   pand       mm2, [BottomBitMask] ; 2: "no-round" bits
   pand       mm3, mm5             ; 2: clear extra bits

   movq       mm0, [esi+ecx]       ; 1: first row
   paddb      mm4, mm3             ; 2: average forward & backward

   movq       mm1, [esi+2*ecx]     ; 1: second row
   paddb      mm4, mm2             ; 2: add in "no-round" bits

   movq       mm2, mm0             ; 1: copy for rounding
   add        esi, ecx             ;    increment source pointer

   movq       [edi], mm4           ; 2: store resulting row
   por        mm2, mm1             ; 1: averaging with rounding bit

   dec        edx                  ; decrement loop count
   jg         EvenOddLoop          ; back up if not done

EvenOddPost:
   psrlq      mm0, 1               ; 2: divide first row by 2
   add        edi, ecx             ;    increment destination pointer

   psrlq      mm1, 1               ; 2: divide second row by 2
   pand       mm0, mm5             ; 2: clear extra bits

   pand       mm2, [BottomBitMask] ; 2: rounding bits
   pand       mm1, mm5             ; 2: clear extra bits

   movq       mm3, [edi]           ; 2: fetch forward prediction
   paddb      mm1, mm0             ; 2: average backward rows

   paddb      mm1, mm2             ; 2: add in round
   movq       mm4, mm3             ; 2: copy for mask

   pand       mm1, mm7             ; 2: masked backward prediction
   pand       mm4, mm6             ; 2: masked forward prediction

   por        mm4, mm1             ; 2: adjusted backward prediction
   movq       mm2, mm3             ; 2: copy for rounding

   pand       mm2, mm4             ; 2: averaging with truncation
   psrlq      mm4, 1               ; 2: divide backward by 2

   psrlq      mm3, 1               ; 2: divide forward by 2
   pand       mm4, mm5             ; 2: clear extra bits

   pand       mm2, [BottomBitMask] ; 2: "no-round" bits
   pand       mm3, mm5             ; 2: clear extra bits

   paddb      mm4, mm3             ; 2: average forward & backward
   mov        esp, ebp

   paddb      mm4, mm2             ; 2: add in "no-round" bits
   mov        ecx, edi             ;    hold destination past the pops

   pop        esi
   pop        edi

   pop        ebp
   pop        ebx

   movq       [ecx], mm4           ; 2: store resulting row

   ret


;
;  mvx is even  (horizontal full pel motion)
;  mvy is even  (vertical full pel motion)
;
even_even:
   movq       mm1, [edi]           ; 1: forward prediction

   movq       mm0, [esi]           ; 1: backward prediction
   movq       mm2, mm1             ; 1: copy forward for mask

   pand       mm0, mm7             ; 1: mask off unused bytes
   sub        edi, ecx             ; pre-decrement destination pointer

   dec        edx                  ; note that edx is positive on entry
   jz         EvenEvenPost

;
;  This loop is 2-folded and works on 2 results (rows) per pass.
;  It finishes one result per iteration.
;
;  Stage I
;    loads mm0 and mm1 with the predictions and begins the replacement
;    procedure for the forward prediction.
;  Stage II
;    finishes the replacement procedure for the forward prediction and
;    averages that (with truncation) with the backward prediction.
;    Those bytes of the backward prediction which are not to be used are
;    replaced by the corresponding bytes of the forward prediction prior
;    to averaging (using the masks in registers mm6 and mm7).
;
;  Averaging is done by dividing both inputs by 2, adding them together,
;  and then adding in an adjustment (in mm2).
;  To average with round, the adjustment is 1 when EITHER input is odd.
;  To average with truncation, the adjustment is 1 when BOTH inputs are odd.
;  Due to the absence of a byte shift instruction, divide by 2 is done
;  by shifting the entire mmx register and then masking off (zeroing) bits
;  7, 15, ..., and 63 (the old low-order bits) using mm5.
;
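; Net effect per pel, in the same notation (illustration only):
;
;    back'[x] = (back[x] & mm7) | (fwd[x] & mm6)      // byte replacement
;    res[x]   = avg_trunc(fwd[x], back'[x])
;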
EvenEvenLoop:
   pand       mm2, mm6             ; 2: mask corresponding bytes
   add        edi, ecx             ;    increment destination pointer

   por        mm0, mm2             ; 2: replace unused back with for.
   movq       mm3, mm1             ; 2: copy forward for adjustment

   pand       mm3, mm0             ; 2: adjustment for truncation
   psrlq      mm0, 1               ; 2: divide back by 2

   psrlq      mm1, 1               ; 2: divide forward by 2
   pand       mm0, mm5             ; 2: clear extra bits

   pand       mm3, [BottomBitMask] ; 2: finish adjustment
   pand       mm1, mm5             ; 2: clear extra bits

   paddb      mm0, mm1             ; 2: sum quotients
   add        esi, ecx             ;    increment source pointer

   movq       mm1, [edi+ecx]       ; 1: forward prediction
   paddb      mm3, mm0             ; 2: add in adjustment

   movq       mm0, [esi]           ; 1: backward prediction
   movq       mm2, mm1             ; 1: copy forward for mask

   movq       [edi], mm3           ; 2: store result
   pand       mm0, mm7             ; 1: mask off unused bytes

   dec        edx                  ; decrement loop control
   jg         EvenEvenLoop         ; loop back when not done

EvenEvenPost:
   pand       mm2, mm6             ; 2: mask corresponding bytes
   add        ecx, edi             ;    final destination address

   por        mm0, mm2             ; 2: replace unused back with for.
   movq       mm3, mm1             ; 2: copy forward for adjustment

   pand       mm3, mm0             ; 2: adjustment for truncation
   psrlq      mm0, 1               ; 2: divide back by 2

   psrlq      mm1, 1               ; 2: divide forward by 2
   pand       mm0, mm5             ; 2: clear extra bits

   pand       mm3, [BottomBitMask] ; 2: finish adjustment
   pand       mm1, mm5             ; 2: clear extra bits

   paddb      mm0, mm1             ; 2: sum quotients
   mov        esp, ebp

   paddb      mm3, mm0             ; 2: add in adjustment
   nop

   pop        esi
   pop        edi

   pop        ebp
   pop        ebx

   movq       [ecx], mm3

   ret

;
; "Remember when I promised to kill you last?"
;
bye_bye:
hasta_la_vista_baby:
   mov        esp, ebp

   pop        esi
   pop        edi

   pop        ebp
   pop        ebx

   ret
MMXCODE1 ENDS

;        1111111111222222222233333333334444444444555555555566666666667777777
;234567890123456789012345678901234567890123456789012345678901234567890123456
;--------------------------------------------------------------------------;
END