;--------------------------------------------------------------------------;
;    INTEL Corporation Proprietary Information
;
;    This listing is supplied under the terms of a license
;    agreement with INTEL Corporation and may not be copied
;    nor disclosed except in accordance with the terms of
;    that agreement.
;
;    Copyright (c) 1996 Intel Corporation.
;    All Rights Reserved.
;
;--------------------------------------------------------------------------;

;--------------------------------------------------------------------------;
;
;  D3mBiMot.asm
;
;  Description:
;    This module does bi-directional motion compensated prediction for 
;    B frames.  It is called after forward prediction has been computed
;    and will average in the backward prediction for those pels where 
;    the backward motion vector points inside of the referenced P frame.
;
;  MMx Version
;
;  Routines:                          prototypes in:
;    MMX_BiMotionComp                 none
;
;--------------------------------------------------------------------------;

;--------------------------------------------------------------------------;
;
;  $Header:   S:\h26x\src\dec\d3mbimot.asv   1.2   01 Apr 1996 12:35:48   RMCKENZX  $
;  $Log:   S:\h26x\src\dec\d3mbimot.asv  $
;// 
;//    Rev 1.2   01 Apr 1996 12:35:48   RMCKENZX
;// 
;// Added MMXCODE1 and MMXDATA1 segments, moved global data
;// to MMXDATA1 segment.
;// 
;//    Rev 1.1   14 Mar 1996 13:58:00   RMCKENZX
;// 
;// Optimized routine for speed of execution.
;// 
;//    Rev 1.0   07 Mar 1996 18:36:36   RMCKENZX
;// Initial revision.
;
;--------------------------------------------------------------------------;

;--------------------------------------------------------------------------;
;
;  Routine Name:
;    MMX_BiMotionComp(U32, U32, I32, I32, I32)
;
;  Inputs -- C calling convention:
;    pPrev       flat pointer to prediction from previous P frame
;                used for "forward" motion vector prediction.
;    pCurr       flat pointer into current P frame
;                to be used for "backward" motion vector prediction.
;    mvx         x component of backward motion vector.
;    mvy         y component of backward motion vector.
;    iNum        block number.
;
;  Returns:
;    updates the values pointed to by pPrev.
;
;--------------------------------------------------------------------------;
;
;  Version:     .006
;  Date:        14 March 1996
;  Author:      R. McKenzie
;
;--------------------------------------------------------------------------;

.586
.MODEL FLAT

;  make all symbols case sensitive
OPTION CASEMAP:NONE

.xlist
include iammx.inc
.list

MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

;-------------------;
;     Stack Use     ;
;-------------------;

; register storage (rel to old stack ptr as saved in ebp)
;	esi               ebp+00
;	edi               ebp+04
;	ebp               ebp+08
;	ebx               ebp+12

; return address      ebp+16

; C input parameters
  pPrev        EQU    ebp+20
  pCurr        EQU    ebp+24
  mvx          EQU    ebp+28
  mvy          EQU    ebp+32
  iNum         EQU    ebp+36


; local variables
  uColEnd      EQU    esp+00
  uRowEnd      EQU    esp+02
  uColStart    EQU    esp+04
  uRowStart    EQU    esp+06
  mmxTempL     EQU    esp+08
  mmxTempH     EQU    esp+16


  PITCH         =     384
  FRAMESIZE     =      32

MMXDATA1 SEGMENT
ALIGN 8
                         ;     End        Start
                         ;  Row   Col   Row   Col
                         ;   y     x     y     x
mmxFudge        DWORD       001e001eh,  00010001h 
                DWORD       001e000eh,  0001fff1h 
                DWORD       000e001eh, 0fff10001h 
                DWORD       000e000eh, 0fff1fff1h 
                DWORD       000e000eh,  00010001h
                DWORD       000e000eh,  00010001h

mmxClipT        DWORD       7ff87ff8h,  7ff77ff7h
mmxClipB        DWORD       7ff77ff7h,  7ff77ff7h
                                                    ; start
ColStartMask    DWORD      0ffffffffh, 0ffffffffh   ;   0
                DWORD      0ffffff00h, 0ffffffffh   ;   1      
                DWORD      0ffff0000h, 0ffffffffh   ;   2      
                DWORD      0ff000000h, 0ffffffffh   ;   3      
                DWORD       00000000h, 0ffffffffh   ;   4      
                DWORD       00000000h, 0ffffff00h   ;   5      
                DWORD       00000000h, 0ffff0000h   ;   6      
                DWORD       00000000h, 0ff000000h   ;   7    end      
ColEndMask      DWORD       00000000h,  00000000h   ;   8     0      
                DWORD       000000ffh,  00000000h   ;         1      
                DWORD       0000ffffh,  00000000h   ;         2      
                DWORD       00ffffffh,  00000000h   ;         3      
                DWORD      0ffffffffh,  00000000h   ;         4      
                DWORD      0ffffffffh,  000000ffh   ;         5      
                DWORD      0ffffffffh,  0000ffffh   ;         6      
                DWORD      0ffffffffh,  00ffffffh   ;         7
                DWORD      0ffffffffh, 0ffffffffh   ;         8

ShiftMask       DWORD       7f7f7f7fh,  7f7f7f7fh	; used for byte shifts
BottomBitMask   DWORD       01010101h,  01010101h	; used for packed averages
Round1          DWORD       00010001h,  00010001h

MMXDATA1 ENDS


;-------------------;
;      Set Up       ;
;-------------------;
MMXCODE1 SEGMENT

PUBLIC C MMX_BiMotionComp

MMX_BiMotionComp:
  push       ebx
  push       ebp

  push       edi
  push       esi

  mov        ebp, esp
  and        esp, -32                    ; align the stack on a cache line

  sub        esp, FRAMESIZE              ; make room for locals

  mov        edi, [iNum]
  mov        esi, [pCurr]

                                         ;         start      end
  movd       mm1, [mvx]				     ; mm1 = 0000 0000 .... .mvx

  movd       mm2, [mvy]                  ; mm2 = 0000 0000 .... .mvy

  movq       mm0, [mmxFudge+8*edi]
  punpcklwd  mm1, mm2                    ; mm1 = .... .... .mvy .mvx

  movq       mm3, [mmxClipT]
  punpckldq  mm1, mm1                    ; mm1 = .mvy .mvx .mvy .mvx

  movq       mm4, [mmxClipB]
  psubw      mm0, mm1

  mov        edi, [pPrev]
  psraw      mm0, 1                      ; mm0 = RowStart ColStart RowEnd ColEnd

  mov        ebx, [mvy]
  paddsw     mm0, mm3                    ; clip at 8 or higher

  and        ebx, -2                     ; 2*(mvy>>1)
  psubusw    mm0, mm4				     ; clip at 0 or lower

  shl        ebx, 6                      ; 128*(mvy>>1)
  mov        eax, [mvx]

  movq       [uColEnd], mm0

  sar        eax, 1                      ; mvx>>1
  lea        ebx, [ebx+2*ebx]            ; PITCH*(mvy>>1)

  add        esi, ebx                    ; pCurr += PITCH*(mvy>>1)
  xor        ecx, ecx

  add        esi, eax                    ; pCurr += mvx>>1
  xor        edx, edx

  mov        cl, [uColStart]             ; uColStart
  mov        dl, [uColEnd]               ; uColEnd

  cmp        ecx, edx                    ; iColCount = ColStart - ColEnd
  jge        hasta_la_vista_baby

  movq       mm6, ColStartMask[8*ecx]

  movq       mm7, ColEndMask[8*edx]
  pxor       mm4, mm4                    ; mm4 = 0

  mov        cl, [uRowStart]             ; RowStart
  mov        dl, [uRowEnd]               ; RowEnd

  sub        edx, ecx                    ; iRowCount = RowEnd - RowStart
  jle        hasta_la_vista_baby

  pand       mm7, mm6                    ; mm7 = ff for those cols to use back pred.
  pxor       mm6, mm6

  shl        ecx, 7                      ; 128*RowStart
  mov        eax, [mvx]

  movq       mm5, [ShiftMask]            ; mm5 = 7f 7f 7f 7f  7f 7f 7f 7f
  pcmpeqb    mm6, mm7                    ; mm6 is the complement of mm7
  
  lea        ecx, [ecx+2*ecx]            ; PITCH*RowStart
  mov        ebx, [mvy]

  add        esi, ecx                    ; pCurr += PITCH*RowStart
  add        edi, ecx                    ; pPrev += PITCH*RowStart        

  mov        ecx, PITCH


  and      eax, 1
  je       even_mvx

  and      ebx, 1
  je       odd_even

;
;  mvx is odd (horizontal half pel motion)
;  mvy is odd (vertical half pel motion)
;
odd_odd:
  movq       mm0, [esi+4]

  movq       mm1, mm0
  psrlq      mm0, 8

  movq       mm2, [esi]
  punpcklbw  mm1, mm4

  movq       mm3, mm2
  punpcklbw  mm0, mm4

  paddw      mm0, mm1
  psrlq      mm2, 8

  paddw      mm0, [Round1]
  punpcklbw  mm3, mm4

  punpcklbw  mm2, mm4
  add        esi, ecx

  movq       [mmxTempH], mm0
  paddw      mm2, mm3

  paddw      mm2, [Round1]

  sub        edi, ecx                    ; pre decrement destination pointer

  movq       [mmxTempL], mm2


;
;  This loop is 2-folded and works on 2 results (rows) per pass.
;  It finishes one result per iteration.
;
;  Stage I
;    computes the partial sums of a row with a shifted copy of the row.
;    It stores the partial sums for the next iteration's Stage II.
;  Stage II 
;    reads the partial sums of the prior row and averages them with the
;    just computed (in Stage I) partial sums of the current row to get
;    the backward prediction.  These computations are done unpacked as
;    16-bit words.  A rounding factor is added to each partial sum before
;    storage.  Then stage II averages the result (with truncation) with 
;    the forward prediction.
;
;    Those bytes of the backwards prediction which are not to be used are
;    replaced by the corresponding bytes of the forwards prediction prior 
;    to averaging (using the masks in registers mm6 and mm7). 
;
;    Averaging of the forward with backward is done packed in 8-bit bytes by 
;    dividing both inputs by 2, adding them together, and then adding in an 
;    adjustment.  To average with truncation, the adjustment is 1 when BOTH 
;    inputs are odd.  Due to the absence of a byte shift instruction, divide
;    by 2 is done by shifting the entire mmx register and then masking off 
;    (zeroing) bits , 15, ..., and 63 (the old low-order bits) using mm5.
;   
OddOddLoop:
  movq       mm1, [esi]                  ; load left half

  movq       mm0, mm1                    ; copy left half
  psrlq      mm1, 8                      ; shift left over

  movq       mm3, [esi+4]                ; load right half
  punpcklbw  mm0, mm4                    ; unpack left half

  movq       mm2, mm3                    ; copy right half
  punpcklbw  mm1, mm4                    ; unpack shifted left half

  paddw      mm1, mm0                    ; add left side
  psrlq      mm3, 8                      ; shift right over

  paddw      mm1, [Round1]               ; add in round to left
  punpcklbw  mm2, mm4                    ; unpack right half

  movq       mm0, [mmxTempL]             ; fetch prior row's left half
  punpcklbw  mm3, mm4                    ; unpack shifted right half

  movq       [mmxTempL], mm1             ; stash this row's left half
  paddw      mm3, mm2                    ; add right side

  paddw      mm3, [Round1]               ; add in round to right
  paddw      mm0, mm1                    ; sum current & prior lefts

  movq       mm2, [mmxTempH]             ; fetch prior row's right half
  psrlw      mm0, 2                      ; divide left sum by four

  movq       [mmxTempH], mm3             ; stash this rows right half
  paddw      mm2, mm3                    ; sum current & prior rights

  movq       mm1, [edi+ecx]              ; fetch forward prediction
  psrlw      mm2, 2                      ; divide right sum by four

  packuswb   mm0, mm2                    ; complete backward prediction
  movq       mm2, mm1                    ; copy forward

  pand       mm0, mm7                    ; mask off unused bytes
  pand       mm2, mm6                    ; create replacement bytes

  por        mm0, mm2                    ; new backward prediction
  movq       mm3, mm1                    ; copy forward for adjustment

  pand       mm3, mm0                    ; adjustment with truncation
  psrlq      mm0, 1                      ; divide new backward by 2

  pand       mm0, mm5                    ; clear extra bits
  psrlq      mm1, 1                      ; divide forward by 2

  pand       mm3, [BottomBitMask]        ; complete adjustment
  pand       mm1, mm5                    ; clear extra bits

  paddb      mm0, mm1                    ; sum quotients
  add        edi, ecx                    ; increment destination pointer

  paddb      mm0, mm3                    ; add addjustment
  add        esi, ecx                    ; increment source pointer

  movq       [edi], mm0                  ; store result
                                         ; *** 1 cycle store penalty ***

  dec        edx                         ; decrement loop control
  jg         OddOddLoop                  ; back up if not done


;  wrap up and go home
  mov        esp, ebp

  pop        esi
  pop        edi

  pop        ebp
  pop        ebx

  ret


;
;  mvx is odd (horizontal half pel motion)
;  mvy is even (vertical full pel motion)
;
odd_even:
  sub        edi, ecx                    ; pre decrement destination pointer

;
;  This loop is not folded and does 1 result (row) per pass.
;
;    It loads the backward predicted row into mm0 and brings in the last
;    (eighth) byte through al, which is or'd with the shifted row.  It
;    completes the bacward prediction (by averaging the rows with round)
;    and averages the result (with truncation) with the forward prediction.
;    Those bytes of the backwards prediction which are not to be used are
;    replaced by the corresponding bytes of the forwards prediction prior 
;    to averaging (using the masks in registers mm6 and mm7). 
;
;    Averaging is done by dividing both inputs by 2, adding them together,
;    and then adding in an adjustment.
;    To average with round, the adjustment is 1 when EITHER input is odd. 
;    To average with truncation, the adjustment is 1 when BOTH inputs are odd.
;    Due to the absence of a byte shift instruction, divide by 2 is done
;    by shifting the entire mmx register and then masking off (zeroing) bits
;    7, 15, ..., and 63 (the old low-order bits) using mm5.
;   
OddEvenLoop:
  movq       mm0, [esi]                  ; fetch backward predicted row

  mov        al, [esi+8]                 ; fetch last byte
  movq       mm1, mm0                    ; copy row

  movd       mm2, eax                    ; last byte
  psrlq      mm0, 8	                     ; shift row right 1 byte

  movq       mm3, mm1                    ; copy row for adjustment
  psllq      mm2, 56                     ; move last byte to left end                     

  por        mm0, mm2                    ; or in last byte on left
  psrlq      mm1, 1                      ; divide row by 2

  por        mm3, mm0                    ; averaging with rounding bit
  psrlq      mm0, 1                      ; divide shifted row by 2

  pand       mm0, mm5                    ; clear extra bits
  pand       mm1, mm5                    ; clear extra bits

  pand       mm3, [BottomBitMask]        ; finish adjustment (with round)
  paddb      mm0, mm1                    ; sum quotients

  movq       mm4, [edi+ecx]              ; fetch forward prediction
  paddb      mm3, mm0                    ; add adjustment, got back pred.
  
  movq       mm2, mm4                    ; copy forward
  pand       mm3, mm7                    ; mask off unused bytes

  movq       mm1, mm4                    ; copy forward
  pand       mm2, mm6                    ; mask forward copy

  por        mm3, mm2                    ; backward with forward replacing
  psrlq      mm4, 1                      ; divide forward by 2

  pand       mm1, mm3                    ; adjustment for truncation
  psrlq      mm3, 1                      ; divide bacwards by 2

  pand       mm3, mm5                    ; clear extra bits
  pand       mm4, mm5                    ; clear extra bits

  pand       mm1, [BottomBitMask]        ; finish adjustment (with truncation)
  paddb      mm4, mm3                    ; sum quotients

  paddb      mm4, mm1                    ; add adjusment, have result
  add        edi, ecx                    ; increment destination pointer

  add        esi, ecx                    ; increment source pointer
  dec        edx                         ; decrement loop control

  movq       [edi], mm4                  ; save result

  jg         OddEvenLoop                 ; loop when not done


;  wrap up and go home
  mov        esp, ebp

  pop        esi
  pop        edi

  pop        ebp
  pop        ebx

  ret

;---------------------------;
;  mvx is even -- test mvy  ;
;---------------------------;
even_mvx:
  and      ebx, 1
  je       even_even

;
;  mvx is even (horizontal full pel motion)
;  mvy is odd (vertical half pel motion)
;
even_odd:
  movq       mm0, [esi]                  ; 1:  first row

  movq       mm1, [esi+ecx]              ; 1:  second row
  movq       mm2, mm0                    ; 1:  copy for rounding

  por        mm2, mm1                    ; 1:  averaging with round
  sub        edi, ecx                    ; pre deccrement destination pointer

  dec        edx                         ; note that edx is positive on entry 
  jz         EvenOddPost

;
;  This loop is 2-folded and works on 2 results (rows) per pass.
;  It finishes one result per iteration.
;  Stage I
;    loads both backward predicted rows into mm0 and mm1, copies the first
;    into mm2, and ors with the second for the rounding adjustment.
;  Stage II
;    completes the bacward prediction (by averaging the rows with round)
;    and averages the result (with truncation) with the forward prediction.
;    Those bytes of the backwards prediction which are not to be used are
;    replaced by the corresponding bytes of the forwards prediction prior 
;    to averaging (using the masks in registers mm6 and mm7). 
;
;    Averaging is done by dividing both inputs by 2, adding them together,
;    and then adding in an adjustment (in mm2).
;    To average with round, the adjustment is 1 when EITHER input is odd. 
;    To average with truncation, the adjustment is 1 when BOTH inputs are odd.
;    Due to the absence of a byte shift instruction, divide by 2 is done
;    by shifting the entire mmx register and then masking off (zeroing) bits
;    7, 15, ..., and 63 (the old low-order bits) using mm5.
;   
EvenOddLoop:
  psrlq      mm0, 1                      ; 2:  divide first row by 2
  add        edi, ecx                    ; increment destination pointer

  psrlq      mm1, 1                      ; 2:  divide second row by 2
  pand       mm0, mm5                    ; 2:  clear extra bits

  pand       mm2, [BottomBitMask]        ; 2:  rounding bits
  pand       mm1, mm5                    ; 2:  clear extra bits

  movq       mm3, [edi]                  ; 2:  fetch forward prediction
  paddb      mm1, mm0                    ; 2:  average backward rows

  paddb      mm1, mm2                    ; 2:  add in round
  movq       mm4, mm3                    ; 2:  copy for mask  

  pand       mm1, mm7                    ; 2:  masked backward prediction
  pand       mm4, mm6                    ; 2:  masked forward prediction

  por        mm4, mm1                    ; 2:  adjusted backwards prediction
  movq       mm2, mm3                    ; 2:  copy for rounding

  pand       mm2, mm4                    ; 2:  averaging with truncation
  psrlq      mm4, 1                      ; 2:  divide bacwards by 2

  psrlq      mm3, 1                      ; 2:  divide forwards by 2
  pand       mm4, mm5                    ; 2:  clear extra bits

  pand       mm2, [BottomBitMask]        ; 2:  "no-round" bits
  pand       mm3, mm5                    ; 2:  clear extra bits

  movq       mm0, [esi+ecx]              ; 1:  first row
  paddb      mm4, mm3                    ; 2:  average forward & backwards

  movq       mm1, [esi+2*ecx]            ; 1:  second row
  paddb      mm4, mm2                    ; 2:  add in "no-round" bits 

  movq       mm2, mm0                    ; 1:  copy for rounding
  add        esi, ecx                    ; increment source pointer

  movq       [edi], mm4                  ; 2:  store resulting row
  por        mm2, mm1                    ; 1:  averaging with rounding bit

  dec        edx                         ; decrement loop count
  jg         EvenOddLoop                 ; back up if not done

EvenOddPost:
  psrlq      mm0, 1                      ; 2:  divide first row by 2
  add        edi, ecx                    ; increment destination pointer

  psrlq      mm1, 1                      ; 2:  divide second row by 2
  pand       mm0, mm5                    ; 2:  clear extra bits

  pand       mm2, [BottomBitMask]        ; 2:  rounding bits
  pand       mm1, mm5                    ; 2:  clear extra bits

  movq       mm3, [edi]                  ; 2:  fetch forward prediction
  paddb      mm1, mm0                    ; 2:  average backward rows

  paddb      mm1, mm2                    ; 2:  add in round
  movq       mm4, mm3                    ; 2:  copy for mask  

  pand       mm1, mm7                    ; 2:  masked backward prediction
  pand       mm4, mm6                    ; 2:  masked forward prediction

  por        mm4, mm1                    ; 2:  adjusted backwards prediction
  movq       mm2, mm3                    ; 2:  copy for rounding

  pand       mm2, mm4                    ; 2:  averaging with truncation
  psrlq      mm4, 1                      ; 2:  divide bacwards by 2

  psrlq      mm3, 1                      ; 2:  divide forwards by 2
  pand       mm4, mm5                    ; 2:  clear extra bits

  pand       mm2, [BottomBitMask]        ; 2:  "no-round" bits
  pand       mm3, mm5                    ; 2:  clear extra bits

  paddb      mm4, mm3                    ; 2:  average forward & backwards
  mov        esp, ebp

  paddb      mm4, mm2                    ; 2:  add in "no-round" bits 
  mov        ecx, edi

  pop        esi
  pop        edi

  pop        ebp
  pop        ebx

  movq       [ecx], mm4                  ; 2:  store resulting row

  ret


;
;  mvx is even (horizontal full pel motion)
;  mvy is even (vertical full pel motion)
;
even_even:
  movq       mm1, [edi]                  ; 1:  forward prediction

  movq       mm0, [esi]                  ; 1:  backward prediction
  movq       mm2, mm1                    ; 1:  copy forward for mask

  pand       mm0, mm7                    ; 1:  mask off unused bytes
  sub        edi, ecx                    ; pre deccrement destination pointer

  dec        edx                         ; note that edx is positive on entry 
  jz         EvenEvenPost

;
;  This loop is 2-folded and works on 2 results (rows) per pass.
;  It finishes one result per iteration.
;  Stage I
;    loads mm0 and mm1 with the predictions and begins the replacement
;    procedure for the forward prediction.
;  Stage II
;    finishes the replacement procedure for the forward prediction and 
;    averages that (with truncation) with the bacwards prediction.
;    Those bytes of the backwards prediction which are not to be used are
;    replaced by the corresponding bytes of the forwards prediction prior 
;    to averaging (using the masks in registers mm6 and mm7). 
;
;    Averaging is done by dividing both inputs by 2, adding them together,
;    and then adding in an adjustment (in mm2).
;    To average with round, the adjustment is 1 when EITHER input is odd. 
;    To average with truncation, the adjustment is 1 when BOTH inputs are odd.
;    Due to the absence of a byte shift instruction, divide by 2 is done
;    by shifting the entire mmx register and then masking off (zeroing) bits
;    7, 15, ..., and 63 (the old low-order bits) using mm5.
;          
EvenEvenLoop:
  pand       mm2, mm6                    ; 2:  mask corresponding bytes
  add        edi, ecx                    ; increment destination pointer

  por        mm0, mm2                    ; 2:  replace unused back with for.
  movq       mm3, mm1                    ; 2:  copy forward for adjustment

  pand       mm3, mm0                    ; 2:  adjustment for truncation
  psrlq      mm0, 1                      ; 2:  divide back by 2

  psrlq      mm1, 1                      ; 2:  divide forward by 2
  pand       mm0, mm5                    ; 2:  clear extra bits

  pand       mm3, [BottomBitMask]        ; 2:  finish adjustment
  pand       mm1, mm5                    ; 2:  clear extra bits

  paddb      mm0, mm1                    ; 2:  sum quotients
  add        esi, ecx                    ; increment source pointer

  movq       mm1, [edi+ecx]              ; 1:  forward prediction
  paddb      mm3, mm0                    ; 2:  add in adjusment

  movq       mm0, [esi]                  ; 1:  backward prediction
  movq       mm2, mm1                    ; 1:  copy forward for mask

  movq       [edi], mm3                  ; 2:  store result
  pand       mm0, mm7                    ; 1:  mask off unused bytes

  dec        edx                         ; decrement loop control
  jg         EvenEvenLoop                ; loop back when not done
          
EvenEvenPost:
  pand       mm2, mm6                    ; 2:  mask corresponding bytes
  add        ecx, edi

  por        mm0, mm2                    ; 2:  replace unused back with for.
  movq       mm3, mm1                    ; 2:  copy forward for adjustment

  pand       mm3, mm0                    ; 2:  adjustment for truncation
  psrlq      mm0, 1                      ; 2:  divide back by 2

  psrlq      mm1, 1                      ; 2:  divide forward by 2
  pand       mm0, mm5                    ; 2:  clear extra bits

  pand       mm3, [BottomBitMask]        ; 2:  finish adjustment
  pand       mm1, mm5                    ; 2:  clear extra bits

  paddb      mm0, mm1                    ; 2:  sum quotients
  mov        esp, ebp

  paddb      mm3, mm0                    ; 2:  add in adjusment
  nop

  pop        esi
  pop        edi

  pop        ebp
  pop        ebx

  movq       [ecx], mm3

  ret

;
;  "Remember when I promised to kill you last?"
;
bye_bye:
hasta_la_vista_baby:
  mov        esp, ebp

  pop        esi
  pop        edi

  pop        ebp
  pop        ebx

  ret
MMXCODE1 ENDS

;        1111111111222222222233333333334444444444555555555566666666667777777
;234567890123456789012345678901234567890123456789012345678901234567890123456
;--------------------------------------------------------------------------;
END