Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

753 lines
27 KiB

;--------------------------------------------------------------------------;
; INTEL Corporation Proprietary Information
;
; This listing is supplied under the terms of a license
; agreement with INTEL Corporation and may not be copied
; nor disclosed except in accordance with the terms of
; that agreement.
;
; Copyright (c) 1996 Intel Corporation.
; All Rights Reserved.
;
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
;
; D3mBiMot.asm
;
; Description:
; This module does bi-directional motion compensated prediction for
; B frames. It is called after forward prediction has been computed
; and will average in the backward prediction for those pels where
; the backward motion vector points inside of the referenced P frame.
;
; MMx Version
;
; Routines: prototypes in:
; MMX_BiMotionComp none
;
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
;
; $Header: S:\h26x\src\dec\d3mbimot.asv 1.2 01 Apr 1996 12:35:48 RMCKENZX $
; $Log: S:\h26x\src\dec\d3mbimot.asv $
;//
;// Rev 1.2 01 Apr 1996 12:35:48 RMCKENZX
;//
;// Added MMXCODE1 and MMXDATA1 segments, moved global data
;// to MMXDATA1 segment.
;//
;// Rev 1.1 14 Mar 1996 13:58:00 RMCKENZX
;//
;// Optimized routine for speed of execution.
;//
;// Rev 1.0 07 Mar 1996 18:36:36 RMCKENZX
;// Initial revision.
;
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
;
; Routine Name:
; MMX_BiMotionComp(U32, U32, I32, I32, I32)
;
; Inputs -- C calling convention:
; pPrev flat pointer to prediction from previous P frame
; used for "forward" motion vector prediction.
; pCurr flat pointer into current P frame
; to be used for "backward" motion vector prediction.
; mvx x component of backward motion vector.
; mvy y component of backward motion vector.
; iNum block number.
;
; Returns:
; updates the values pointed to by pPrev.
;
;--------------------------------------------------------------------------;
;
; Version: .006
; Date: 14 March 1996
; Author: R. McKenzie
;
;--------------------------------------------------------------------------;
; Assembler configuration: enable the Pentium (.586) instruction set and
; select the flat memory model.
.586
.MODEL FLAT
; make all symbols case sensitive
OPTION CASEMAP:NONE
; iammx.inc is included with listing output switched off (.xlist/.list).
; NOTE(review): presumably it supplies the MMX instruction and register
; definitions used by this module -- confirm against the include file.
.xlist
include iammx.inc
.list
; Declare both segments once, empty, with their full attributes (paragraph
; aligned, 32-bit, public). The later "MMXDATA1 SEGMENT" and
; "MMXCODE1 SEGMENT" directives reopen them by name.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS
;-------------------;
; Stack Use ;
;-------------------;
; These equates are plain text substitutions on ebp and esp, so they are
; only meaningful after the prologue of MMX_BiMotionComp has run
; (four pushes, "mov ebp, esp", then esp aligned down and lowered by
; FRAMESIZE).
;
; register storage (rel to old stack ptr as saved in ebp)
; esi ebp+00
; edi ebp+04
; ebp ebp+08
; ebx ebp+12
; return address ebp+16
; C input parameters
pPrev EQU ebp+20
pCurr EQU ebp+24
mvx EQU ebp+28
mvy EQU ebp+32
iNum EQU ebp+36
; local variables
; The four limits are 16-bit words that are written together by a single
; 8-byte store of an MMX register and later read back one byte at a time
; (only the low byte of each word is loaded).
uColEnd EQU esp+00
uRowEnd EQU esp+02
uColStart EQU esp+04
uRowStart EQU esp+06
; Two 8-byte scratch slots; the odd/odd path keeps the previous row's
; horizontal partial sums here (L = left four pels, H = right four pels).
mmxTempL EQU esp+08
mmxTempH EQU esp+16
; Byte distance from one row to the next. The same value is used to step
; both the source (pCurr) and the destination (pPrev) pointers.
PITCH = 384
; Bytes subtracted from the aligned esp for locals; the locals above
; occupy esp+0 .. esp+23.
FRAMESIZE = 32
MMXDATA1 SEGMENT
ALIGN 8
; mmxFudge: one 8-byte entry per block number, indexed as
; [mmxFudge+8*iNum]; six entries are defined (iNum = 0..5).
; Each entry holds four signed 16-bit words. Reading each DWORD as
; high word : low word, the layout is
;   first DWORD  = End   limits (Row : Col)
;   second DWORD = Start limits (Row : Col)
; where Row goes with y (mvy) and Col goes with x (mvx).
; The set-up code computes (entry - motion vector) >> 1 on every word and
; then clips with mmxClipT / mmxClipB.
; NOTE(review): presumably entries 0-3 are the four luma blocks of a
; macroblock and entries 4-5 the chroma blocks -- confirm with the caller.
; End Start
; Row Col Row Col
; y x y x
mmxFudge DWORD 001e001eh, 00010001h ; iNum 0
DWORD 001e000eh, 0001fff1h ; iNum 1
DWORD 000e001eh, 0fff10001h ; iNum 2
DWORD 000e000eh, 0fff1fff1h ; iNum 3
DWORD 000e000eh, 00010001h ; iNum 4
DWORD 000e000eh, 00010001h ; iNum 5
; Clip constants. mmxClipT is added with signed saturation (paddsw) and
; mmxClipB is then subtracted with unsigned saturation (psubusw).
; For a word value x this yields:
;   End words   (7ff8 then 7ff7): clip(x + 1, 0, 8)
;   Start words (7ff7 then 7ff7): clip(x, 0, 8)
mmxClipT DWORD 7ff87ff8h, 7ff77ff7h
mmxClipB DWORD 7ff77ff7h, 7ff77ff7h
; ColStartMask[8*s]: byte k is ff when k >= s (s = first column to use).
; The trailing comment on each line is the start column.
ColStartMask DWORD 0ffffffffh, 0ffffffffh ; 0
DWORD 0ffffff00h, 0ffffffffh ; 1
DWORD 0ffff0000h, 0ffffffffh ; 2
DWORD 0ff000000h, 0ffffffffh ; 3
DWORD 00000000h, 0ffffffffh ; 4
DWORD 00000000h, 0ffffff00h ; 5
DWORD 00000000h, 0ffff0000h ; 6
DWORD 00000000h, 0ff000000h ; 7
; ColEndMask[8*e]: byte k is ff when k < e (e = one past the last column
; to use). The trailing comment on each line is the end column.
; This table directly follows ColStartMask, so its first (all-zero) entry
; sits where a start column of 8 would index.
ColEndMask DWORD 00000000h, 00000000h ; 0 (start column 8)
DWORD 000000ffh, 00000000h ; 1
DWORD 0000ffffh, 00000000h ; 2
DWORD 00ffffffh, 00000000h ; 3
DWORD 0ffffffffh, 00000000h ; 4
DWORD 0ffffffffh, 000000ffh ; 5
DWORD 0ffffffffh, 0000ffffh ; 6
DWORD 0ffffffffh, 00ffffffh ; 7
DWORD 0ffffffffh, 0ffffffffh ; 8
; ShiftMask clears bit 7 of every byte; it is applied after a 64-bit
; right shift by one so that each byte ends up holding its own value / 2.
ShiftMask DWORD 7f7f7f7fh, 7f7f7f7fh ; used for byte shifts
; BottomBitMask keeps bit 0 of every byte (the per-byte adjustment bit).
BottomBitMask DWORD 01010101h, 01010101h ; used for packed averages
; Round1 is the value 1 in each of the four 16-bit words; it is added to
; every horizontal partial sum in the odd/odd path.
Round1 DWORD 00010001h, 00010001h
MMXDATA1 ENDS
;-------------------;
; Set Up ;
;-------------------;
;
; MMX_BiMotionComp entry point (C calling convention, arguments on the
; stack: pPrev, pCurr, mvx, mvy, iNum).
;
; This part saves ebx/ebp/edi/esi, builds a 32-byte aligned local frame,
; computes the window of columns and rows to be updated, and dispatches on
; the low bits of mvx and mvy to one of four loops.
;
; The window is derived from mmxFudge[iNum] and the motion vector:
;   ColEnd   = clip(((EndCol   - mvx) >> 1) + 1, 0, 8)
;   RowEnd   = clip(((EndRow   - mvy) >> 1) + 1, 0, 8)
;   ColStart = clip( (StartCol - mvx) >> 1,      0, 8)
;   RowStart = clip( (StartRow - mvy) >> 1,      0, 8)
; If ColStart >= ColEnd or RowEnd <= RowStart the routine returns through
; hasta_la_vista_baby without writing anything.
;
; State at the dispatch (the "and eax, 1" below):
;   esi = pCurr + PITCH*(mvy>>1) + (mvx>>1) + PITCH*RowStart
;   edi = pPrev + PITCH*RowStart
;   ecx = PITCH
;   edx = RowEnd - RowStart (greater than zero)
;   eax = mvx, ebx = mvy (their low bits select the path)
;   mm4 = 0
;   mm5 = ShiftMask
;   mm7 = ff in each byte (column) where the backward prediction is used
;   mm6 = complement of mm7
; No column offset is added to esi/edi: every row is processed as a full
; 8 bytes and the column window is applied through mm6/mm7.
;
; NOTE(review): no exit path in this file executes EMMS; presumably the
; caller clears the MMX state before using floating point -- confirm.
;
MMXCODE1 SEGMENT
PUBLIC C MMX_BiMotionComp
MMX_BiMotionComp:
push ebx
push ebp
push edi
push esi
mov ebp, esp ; ebp = frame base used by the parameter equates
and esp, -32 ; align the stack on a cache line
sub esp, FRAMESIZE ; make room for locals
mov edi, [iNum]
mov esi, [pCurr]
; MMX word layout used below, high word to low word:
; RowStart ColStart RowEnd ColEnd
movd mm1, [mvx] ; mm1 = 0000 0000 .... .mvx
movd mm2, [mvy] ; mm2 = 0000 0000 .... .mvy
movq mm0, [mmxFudge+8*edi] ; End/Start constants for this block number
punpcklwd mm1, mm2 ; mm1 = .... .... .mvy .mvx
movq mm3, [mmxClipT]
punpckldq mm1, mm1 ; mm1 = .mvy .mvx .mvy .mvx
movq mm4, [mmxClipB]
psubw mm0, mm1 ; constant - motion vector, in each word
mov edi, [pPrev]
psraw mm0, 1 ; mm0 = RowStart ColStart RowEnd ColEnd
mov ebx, [mvy]
paddsw mm0, mm3 ; clip at 8 or higher
and ebx, -2 ; 2*(mvy>>1)
psubusw mm0, mm4 ; clip at 0 or lower
shl ebx, 6 ; 128*(mvy>>1)
mov eax, [mvx]
movq [uColEnd], mm0 ; one store fills uColEnd, uRowEnd, uColStart, uRowStart
sar eax, 1 ; mvx>>1
lea ebx, [ebx+2*ebx] ; PITCH*(mvy>>1)
add esi, ebx ; pCurr += PITCH*(mvy>>1)
xor ecx, ecx ; clear ecx so the byte load into cl is zero extended
add esi, eax ; pCurr += mvx>>1
xor edx, edx ; clear edx so the byte load into dl is zero extended
mov cl, [uColStart] ; uColStart
mov dl, [uColEnd] ; uColEnd
cmp ecx, edx ; compare ColStart with ColEnd
jge hasta_la_vista_baby ; ColStart >= ColEnd: no columns to update
movq mm6, ColStartMask[8*ecx]
movq mm7, ColEndMask[8*edx]
pxor mm4, mm4 ; mm4 = 0
mov cl, [uRowStart] ; RowStart
mov dl, [uRowEnd] ; RowEnd
sub edx, ecx ; iRowCount = RowEnd - RowStart
jle hasta_la_vista_baby ; iRowCount <= 0: no rows to update
pand mm7, mm6 ; mm7 = ff for those cols to use back pred.
pxor mm6, mm6
shl ecx, 7 ; 128*RowStart
mov eax, [mvx]
movq mm5, [ShiftMask] ; mm5 = 7f 7f 7f 7f 7f 7f 7f 7f
pcmpeqb mm6, mm7 ; mm6 is the complement of mm7
lea ecx, [ecx+2*ecx] ; PITCH*RowStart
mov ebx, [mvy]
add esi, ecx ; pCurr += PITCH*RowStart
add edi, ecx ; pPrev += PITCH*RowStart
mov ecx, PITCH ; ecx = row step for the loops
and eax, 1 ; low bit of mvx: horizontal half pel?
je even_mvx
and ebx, 1 ; low bit of mvy: vertical half pel?
je odd_even
;
; mvx is odd (horizontal half pel motion)
; mvy is odd (vertical half pel motion)
;
; On entry: esi = source row, edi = destination row, ecx = PITCH,
; edx = number of rows (> 0), mm4 = 0, mm5 = ShiftMask, mm7 = column-use
; mask, mm6 = its complement.
;
; The code before the loop computes the horizontal partial sums
; p[x] + p[x+1] + 1 for the first source row (x = 0..7, as 16-bit words)
; and stores them in mmxTempL / mmxTempH so the loop can treat them as
; the "prior row". It reads source bytes [esi] .. [esi+11].
;
odd_odd:
movq mm0, [esi+4] ; first row, bytes 4..11
movq mm1, mm0 ; copy right half
psrlq mm0, 8 ; shift right half over one pel
movq mm2, [esi] ; first row, bytes 0..7
punpcklbw mm1, mm4 ; unpack pels 4..7
movq mm3, mm2 ; copy left half
punpcklbw mm0, mm4 ; unpack pels 5..8
paddw mm0, mm1 ; add right side
psrlq mm2, 8 ; shift left half over one pel
paddw mm0, [Round1] ; add in round to right
punpcklbw mm3, mm4 ; unpack pels 0..3
punpcklbw mm2, mm4 ; unpack pels 1..4
add esi, ecx ; advance source to the second row
movq [mmxTempH], mm0 ; stash first row's right half
paddw mm2, mm3 ; add left side
paddw mm2, [Round1] ; add in round to left
sub edi, ecx ; pre decrement destination pointer
movq [mmxTempL], mm2 ; stash first row's left half
;
; This loop is 2-folded and works on 2 results (rows) per pass.
; It finishes one result per iteration.
;
; Stage I
; computes the partial sums of a row with a shifted copy of the row.
; It stores the partial sums for the next iteration's Stage II.
; Stage II
; reads the partial sums of the prior row and averages them with the
; just computed (in Stage I) partial sums of the current row to get
; the backward prediction. These computations are done unpacked as
; 16-bit words. A rounding factor is added to each partial sum before
; storage. Then stage II averages the result (with truncation) with
; the forward prediction.
;
; Each stored partial sum already contains +1, so the sum of two rows'
; partial sums contains +2 before the divide by four.
;
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging of the forward with backward is done packed in 8-bit bytes by
; dividing both inputs by 2, adding them together, and then adding in an
; adjustment. To average with truncation, the adjustment is 1 when BOTH
; inputs are odd. Due to the absence of a byte shift instruction, divide
; by 2 is done by shifting the entire mmx register and then masking off
; (zeroing) bits 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
OddOddLoop:
movq mm1, [esi] ; load left half
movq mm0, mm1 ; copy left half
psrlq mm1, 8 ; shift left over
movq mm3, [esi+4] ; load right half
punpcklbw mm0, mm4 ; unpack left half
movq mm2, mm3 ; copy right half
punpcklbw mm1, mm4 ; unpack shifted left half
paddw mm1, mm0 ; add left side
psrlq mm3, 8 ; shift right over
paddw mm1, [Round1] ; add in round to left
punpcklbw mm2, mm4 ; unpack right half
movq mm0, [mmxTempL] ; fetch prior row's left half
punpcklbw mm3, mm4 ; unpack shifted right half
movq [mmxTempL], mm1 ; stash this row's left half
paddw mm3, mm2 ; add right side
paddw mm3, [Round1] ; add in round to right
paddw mm0, mm1 ; sum current & prior lefts
movq mm2, [mmxTempH] ; fetch prior row's right half
psrlw mm0, 2 ; divide left sum by four
movq [mmxTempH], mm3 ; stash this row's right half
paddw mm2, mm3 ; sum current & prior rights
movq mm1, [edi+ecx] ; fetch forward prediction
psrlw mm2, 2 ; divide right sum by four
packuswb mm0, mm2 ; complete backward prediction
movq mm2, mm1 ; copy forward
pand mm0, mm7 ; mask off unused bytes
pand mm2, mm6 ; create replacement bytes
por mm0, mm2 ; new backward prediction
movq mm3, mm1 ; copy forward for adjustment
pand mm3, mm0 ; adjustment with truncation
psrlq mm0, 1 ; divide new backward by 2
pand mm0, mm5 ; clear extra bits
psrlq mm1, 1 ; divide forward by 2
pand mm3, [BottomBitMask] ; complete adjustment
pand mm1, mm5 ; clear extra bits
paddb mm0, mm1 ; sum quotients
add edi, ecx ; increment destination pointer
paddb mm0, mm3 ; add adjustment
add esi, ecx ; increment source pointer
movq [edi], mm0 ; store result
; *** 1 cycle store penalty ***
dec edx ; decrement loop control
jg OddOddLoop ; back up if not done
; wrap up and go home
mov esp, ebp ; discard the aligned local frame
pop esi
pop edi
pop ebp
pop ebx
ret
;
; mvx is odd (horizontal half pel motion)
; mvy is even (vertical full pel motion)
;
; On entry: esi = source row, edi = destination row, ecx = PITCH,
; edx = number of rows (> 0), mm5 = ShiftMask, mm7 = column-use mask,
; mm6 = its complement. mm4 is overwritten with the forward prediction.
;
odd_even:
sub edi, ecx ; pre decrement destination pointer
;
; This loop is not folded and does 1 result (row) per pass.
;
; It loads the backward predicted row into mm0 and brings in the last
; (ninth) source byte, [esi+8], through al, which is or'd with the
; shifted row. It
; completes the backward prediction (by averaging the rows with round)
; and averages the result (with truncation) with the forward prediction.
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging is done by dividing both inputs by 2, adding them together,
; and then adding in an adjustment.
; To average with round, the adjustment is 1 when EITHER input is odd.
; To average with truncation, the adjustment is 1 when BOTH inputs are odd.
; Due to the absence of a byte shift instruction, divide by 2 is done
; by shifting the entire mmx register and then masking off (zeroing) bits
; 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
OddEvenLoop:
movq mm0, [esi] ; fetch backward predicted row
mov al, [esi+8] ; fetch last byte
movq mm1, mm0 ; copy row
movd mm2, eax ; last byte (only the low byte survives the shift below)
psrlq mm0, 8 ; shift row right 1 byte
movq mm3, mm1 ; copy row for adjustment
psllq mm2, 56 ; move last byte to left end
por mm0, mm2 ; or in last byte on left
psrlq mm1, 1 ; divide row by 2
por mm3, mm0 ; averaging with rounding bit
psrlq mm0, 1 ; divide shifted row by 2
pand mm0, mm5 ; clear extra bits
pand mm1, mm5 ; clear extra bits
pand mm3, [BottomBitMask] ; finish adjustment (with round)
paddb mm0, mm1 ; sum quotients
movq mm4, [edi+ecx] ; fetch forward prediction
paddb mm3, mm0 ; add adjustment, got back pred.
movq mm2, mm4 ; copy forward
pand mm3, mm7 ; mask off unused bytes
movq mm1, mm4 ; copy forward
pand mm2, mm6 ; mask forward copy
por mm3, mm2 ; backward with forward replacing
psrlq mm4, 1 ; divide forward by 2
pand mm1, mm3 ; adjustment for truncation
psrlq mm3, 1 ; divide backwards by 2
pand mm3, mm5 ; clear extra bits
pand mm4, mm5 ; clear extra bits
pand mm1, [BottomBitMask] ; finish adjustment (with truncation)
paddb mm4, mm3 ; sum quotients
paddb mm4, mm1 ; add adjustment, have result
add edi, ecx ; increment destination pointer
add esi, ecx ; increment source pointer
dec edx ; decrement loop control
movq [edi], mm4 ; save result (movq leaves the flags from dec intact)
jg OddEvenLoop ; loop when not done
; wrap up and go home
mov esp, ebp ; discard the aligned local frame
pop esi
pop edi
pop ebp
pop ebx
ret
;---------------------------;
; mvx is even -- test mvy ;
;---------------------------;
; Reached when the low bit of mvx is clear. ebx still holds mvy here;
; its low bit chooses between the even/odd path (falls through to
; even_odd) and the even/even path.
even_mvx:
and ebx, 1 ; low bit of mvy: vertical half pel?
je even_even ; zero: full pel in both directions
;
; mvx is even (horizontal full pel motion)
; mvy is odd (vertical half pel motion)
;
; On entry: esi = source row, edi = destination row, ecx = PITCH,
; edx = number of rows (> 0), mm5 = ShiftMask, mm7 = column-use mask,
; mm6 = its complement.
;
; The instructions before the loop are Stage I for the first result.
; When only one row is wanted (edx becomes 0) the loop is skipped and
; EvenOddPost finishes that single result.
;
even_odd:
movq mm0, [esi] ; 1: first row
movq mm1, [esi+ecx] ; 1: second row
movq mm2, mm0 ; 1: copy for rounding
por mm2, mm1 ; 1: averaging with round
sub edi, ecx ; pre decrement destination pointer
dec edx ; note that edx is positive on entry
jz EvenOddPost ; single row: go straight to the final Stage II
;
; This loop is 2-folded and works on 2 results (rows) per pass.
; It finishes one result per iteration.
; Stage I
; loads both backward predicted rows into mm0 and mm1, copies the first
; into mm2, and ors with the second for the rounding adjustment.
; Stage II
; completes the backward prediction (by averaging the rows with round)
; and averages the result (with truncation) with the forward prediction.
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging is done by dividing both inputs by 2, adding them together,
; and then adding in an adjustment (in mm2).
; To average with round, the adjustment is 1 when EITHER input is odd.
; To average with truncation, the adjustment is 1 when BOTH inputs are odd.
; Due to the absence of a byte shift instruction, divide by 2 is done
; by shifting the entire mmx register and then masking off (zeroing) bits
; 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
; The leading "1:" / "2:" in the comments below is the stage the
; instruction belongs to.
;
EvenOddLoop:
psrlq mm0, 1 ; 2: divide first row by 2
add edi, ecx ; increment destination pointer
psrlq mm1, 1 ; 2: divide second row by 2
pand mm0, mm5 ; 2: clear extra bits
pand mm2, [BottomBitMask] ; 2: rounding bits
pand mm1, mm5 ; 2: clear extra bits
movq mm3, [edi] ; 2: fetch forward prediction
paddb mm1, mm0 ; 2: average backward rows
paddb mm1, mm2 ; 2: add in round
movq mm4, mm3 ; 2: copy for mask
pand mm1, mm7 ; 2: masked backward prediction
pand mm4, mm6 ; 2: masked forward prediction
por mm4, mm1 ; 2: adjusted backwards prediction
movq mm2, mm3 ; 2: copy for rounding
pand mm2, mm4 ; 2: averaging with truncation
psrlq mm4, 1 ; 2: divide backwards by 2
psrlq mm3, 1 ; 2: divide forwards by 2
pand mm4, mm5 ; 2: clear extra bits
pand mm2, [BottomBitMask] ; 2: "no-round" bits
pand mm3, mm5 ; 2: clear extra bits
movq mm0, [esi+ecx] ; 1: first row
paddb mm4, mm3 ; 2: average forward & backwards
movq mm1, [esi+2*ecx] ; 1: second row
paddb mm4, mm2 ; 2: add in "no-round" bits
movq mm2, mm0 ; 1: copy for rounding
add esi, ecx ; increment source pointer
movq [edi], mm4 ; 2: store resulting row
por mm2, mm1 ; 1: averaging with rounding bit
dec edx ; decrement loop count
jg EvenOddLoop ; back up if not done
; Final Stage II for the last result, interleaved with the epilogue.
; edi is restored by the pops, so the destination address is copied to
; ecx first and the store is issued through ecx.
EvenOddPost:
psrlq mm0, 1 ; 2: divide first row by 2
add edi, ecx ; increment destination pointer
psrlq mm1, 1 ; 2: divide second row by 2
pand mm0, mm5 ; 2: clear extra bits
pand mm2, [BottomBitMask] ; 2: rounding bits
pand mm1, mm5 ; 2: clear extra bits
movq mm3, [edi] ; 2: fetch forward prediction
paddb mm1, mm0 ; 2: average backward rows
paddb mm1, mm2 ; 2: add in round
movq mm4, mm3 ; 2: copy for mask
pand mm1, mm7 ; 2: masked backward prediction
pand mm4, mm6 ; 2: masked forward prediction
por mm4, mm1 ; 2: adjusted backwards prediction
movq mm2, mm3 ; 2: copy for rounding
pand mm2, mm4 ; 2: averaging with truncation
psrlq mm4, 1 ; 2: divide backwards by 2
psrlq mm3, 1 ; 2: divide forwards by 2
pand mm4, mm5 ; 2: clear extra bits
pand mm2, [BottomBitMask] ; 2: "no-round" bits
pand mm3, mm5 ; 2: clear extra bits
paddb mm4, mm3 ; 2: average forward & backwards
mov esp, ebp ; discard the aligned local frame
paddb mm4, mm2 ; 2: add in "no-round" bits
mov ecx, edi ; keep the destination address across "pop edi"
pop esi
pop edi
pop ebp
pop ebx
movq [ecx], mm4 ; 2: store resulting row
ret
;
; mvx is even (horizontal full pel motion)
; mvy is even (vertical full pel motion)
;
; On entry: esi = source row, edi = destination row, ecx = PITCH,
; edx = number of rows (> 0), mm5 = ShiftMask, mm7 = column-use mask,
; mm6 = its complement.
;
; The instructions before the loop are Stage I for the first result.
; When only one row is wanted (edx becomes 0) the loop is skipped and
; EvenEvenPost finishes that single result.
;
even_even:
movq mm1, [edi] ; 1: forward prediction
movq mm0, [esi] ; 1: backward prediction
movq mm2, mm1 ; 1: copy forward for mask
pand mm0, mm7 ; 1: mask off unused bytes
sub edi, ecx ; pre decrement destination pointer
dec edx ; note that edx is positive on entry
jz EvenEvenPost ; single row: go straight to the final Stage II
;
; This loop is 2-folded and works on 2 results (rows) per pass.
; It finishes one result per iteration.
; Stage I
; loads mm0 and mm1 with the predictions and begins the replacement
; procedure for the forward prediction.
; Stage II
; finishes the replacement procedure for the forward prediction and
; averages that (with truncation) with the backwards prediction.
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging is done by dividing both inputs by 2, adding them together,
; and then adding in an adjustment (in mm3).
; To average with truncation, the adjustment is 1 when BOTH inputs are odd.
; Due to the absence of a byte shift instruction, divide by 2 is done
; by shifting the entire mmx register and then masking off (zeroing) bits
; 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
; The leading "1:" / "2:" in the comments below is the stage the
; instruction belongs to.
;
EvenEvenLoop:
pand mm2, mm6 ; 2: mask corresponding bytes
add edi, ecx ; increment destination pointer
por mm0, mm2 ; 2: replace unused back with for.
movq mm3, mm1 ; 2: copy forward for adjustment
pand mm3, mm0 ; 2: adjustment for truncation
psrlq mm0, 1 ; 2: divide back by 2
psrlq mm1, 1 ; 2: divide forward by 2
pand mm0, mm5 ; 2: clear extra bits
pand mm3, [BottomBitMask] ; 2: finish adjustment
pand mm1, mm5 ; 2: clear extra bits
paddb mm0, mm1 ; 2: sum quotients
add esi, ecx ; increment source pointer
movq mm1, [edi+ecx] ; 1: forward prediction
paddb mm3, mm0 ; 2: add in adjustment
movq mm0, [esi] ; 1: backward prediction
movq mm2, mm1 ; 1: copy forward for mask
movq [edi], mm3 ; 2: store result
pand mm0, mm7 ; 1: mask off unused bytes
dec edx ; decrement loop control
jg EvenEvenLoop ; loop back when not done
; Final Stage II for the last result, interleaved with the epilogue.
; edi is still one row behind here, so the destination address is formed
; in ecx (edi + PITCH) before edi is restored by the pops.
EvenEvenPost:
pand mm2, mm6 ; 2: mask corresponding bytes
add ecx, edi ; ecx = address of the last destination row
por mm0, mm2 ; 2: replace unused back with for.
movq mm3, mm1 ; 2: copy forward for adjustment
pand mm3, mm0 ; 2: adjustment for truncation
psrlq mm0, 1 ; 2: divide back by 2
psrlq mm1, 1 ; 2: divide forward by 2
pand mm0, mm5 ; 2: clear extra bits
pand mm3, [BottomBitMask] ; 2: finish adjustment
pand mm1, mm5 ; 2: clear extra bits
paddb mm0, mm1 ; 2: sum quotients
mov esp, ebp ; discard the aligned local frame
paddb mm3, mm0 ; 2: add in adjustment
; NOTE(review): the nop has no effect on results; presumably it is an
; instruction-pairing filler -- confirm before removing.
nop
pop esi
pop edi
pop ebp
pop ebx
movq [ecx], mm3 ; 2: store result
ret
;
; "Remember when I promised to kill you last?"
;
; Common early exit. The set-up code jumps here when the column window
; or the row window is empty; nothing has been written through pPrev at
; that point. Restores esp from ebp, pops the four saved registers in
; the reverse of the push order, and returns.
; bye_bye is a second name for the same address; no jump to it appears
; in the visible source.
;
bye_bye:
hasta_la_vista_baby:
mov esp, ebp ; discard the aligned local frame
pop esi
pop edi
pop ebp
pop ebx
ret
MMXCODE1 ENDS
; Column ruler (columns 10 through 76) kept from the original source.
; 1111111111222222222233333333334444444444555555555566666666667777777
;234567890123456789012345678901234567890123456789012345678901234567890123456
;--------------------------------------------------------------------------;
END