Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

832 lines
38 KiB

;/* *************************************************************************
;** INTEL Corporation Proprietary Information
;**
;** This listing is supplied under the terms of a license
;** agreement with INTEL Corporation and may not be copied
;** nor disclosed except in accordance with the terms of
;** that agreement.
;**
;** Copyright (c) 1995 Intel Corporation.
;** Copyright (c) 1996 Intel Corporation.
;** All Rights Reserved.
;**
;** *************************************************************************
;*/
;/* *************************************************************************
;** $Header: S:\h26x\src\dec\dxmidct.asv 1.5 09 Jul 1996 16:51:26 AGUPTA2 $
;** $Log: S:\h26x\src\dec\dxmidct.asv $
;//
;// Rev 1.5 09 Jul 1996 16:51:26 AGUPTA2
;// IDCT now expects actual number of coeffs.
;//
;// Rev 1.4 08 Jul 1996 11:42:50 AGUPTA2
;// Fixed the accuracy problem where a shift was in the wrong place.
;//
;// Rev 1.3 30 May 1996 12:25:02 AGUPTA2
;// Fixed the overflow problem in computing u0-u3 in first four columns.
;//
;// Rev 1.2 09 Apr 1996 09:42:08 agupta2
;// Code to clear IDCT buffer moved to MMX_BlockCopy and MMX_BlockMove.
;//
;// Rev 1.1 22 Mar 1996 10:17:26 agupta2
;// Initial revision of MMX version of IDCT.
;//
;// Rev 1.0 14 Mar 1996 14:38:02 AGUPTA2
;// Initial revision.
;** *************************************************************************
;*/
;Assembler set-up for this module.
;Target the Pentium instruction set with the 32-bit flat memory model.
.586
.model flat
;Turn off assembler-generated PROC prologue/epilogue code; the routine in
;this file builds and tears down its own stack frame by hand.
OPTION PROLOGUE:None
OPTION EPILOGUE:None
;Listing is switched off around the include and switched back on afterwards.
;NOTE(review): iammx.inc presumably supplies the MMX mnemonics used below --
;confirm against the include file.
.xlist
include iammx.inc
.list
;Declare the code and data segments used by this file (paragraph aligned,
;32-bit, public) before they are reopened and filled in below.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS
MMXDATA1 SEGMENT
;
;Fixed-point constants for the IDCT in this file.
;
;CONSTBITS is the number of fraction bits carried by every *BETA* multiplier:
;  scaled value = ROUND(beta * 2^CONSTBITS)
;
;  name       beta            scaled (16-bit)
;  BETA1      1.414213562     02D41H
;  BETA2      2.613125930     0AC61H   (stored negated, as NEGBETA2)
;  BETA3      1.414213562     02D41H   (same storage as BETA1)
;  BETA4      1.082392200     022A3H
;  BETA5      0.765366865     0187EH
;
;Every table is one quadword made of two identical dwords. For the BETA
;tables the 16-bit multiplier sits in the upper word of each dword and the
;lower word is zero.
;
CONSTBITS = 13

ALIGN 8
BETA1 LABEL DWORD
BETA3 LABEL DWORD
DWORD 2 DUP (02D41H SHL 16)

ALIGN 8
NEGBETA2 LABEL DWORD
DWORD 2 DUP (0AC61H SHL 16)

ALIGN 8
BETA4 LABEL DWORD
DWORD 2 DUP (022A3H SHL 16)

ALIGN 8
BETA5 LABEL DWORD
DWORD 2 DUP (0187EH SHL 16)

;Rounding term added before an arithmetic right shift by CONSTBITS+1.
ALIGN 8
CONSTBITS_P_1_RND LABEL DWORD
DWORD 2 DUP (1 SHL CONSTBITS)

;Rounding term added before an arithmetic right shift by CONSTBITS.
ALIGN 8
CONSTBITS_RND LABEL DWORD
DWORD 2 DUP (1 SHL (CONSTBITS - 1))

;The value 1 in each of the four 16-bit lanes.
ALIGN 8
ONE LABEL DWORD
DWORD 2 DUP (000010001H)
MMXDATA1 ENDS
MMXCODE1 SEGMENT
;
;
;
@MMX_DecodeBlock_IDCT@12 PROC
; Purpose:
; Scatter a list of inverse quantized coefficients into an 8X8 block of
; WORDs and run an in-place inverse DCT over that block with MMX code.
; Parameters:
; pIQ_INDEX: DWORD PTR (in ecx)
; Pointer to an array of coeff. structures; each structure consists of
; DWORD of inverse quantized and scaled coeff. and a DWORD of its index.
; Only the low WORD of the coeff. DWORD is stored into the block.
; CountCoeff: DWORD (in edx)
; Number of coefficients <= 64. A count of 0 is accepted: nothing is
; scattered and the IDCT runs over whatever pBuf already contains.
; pBuf: WORD PTR (at <[esp+4]> at the entry of this routine)
; Output area for the IDCT; an 8X8 matrix of WORD values with 6 frac. bits
; Registers:
; eax, edx and mm0-mm7 are modified. esi, edi and esp are saved and
; restored. ecx is only read. The stack argument is removed by <ret 4>.
; Stack:
; LocalFrameSize bytes of locals below an 8-byte aligned esp:
; Tu7 and Tv5 are quadword spill slots, StashESP keeps the pre-frame esp.
; Algorithm:
; It uses scaled IDCT algorithm credited to Arai, Agui, and Nakajima (AAN).
; High-level steps are:
; 1) Decode pIQ_INDEX array and populate the output buffer
; 2) IDCT and write to output buffer
; Note:
; If called from a C function, this routine must be declared as:
; extern "C" void _fastcall MMX_DecodeBlock_IDCT(...)
; NOTE(review): the index field of each entry is used unchecked as a WORD
; offset into pBuf; the caller is presumably responsible for keeping it in
; the range 0..63 -- confirm.
; NOTE(review): this routine never executes EMMS; presumably the caller
; does so before using x87 code -- confirm.
;
LocalFrameSize = 24
Tu7 textequ <[esp+0]>
Tv5 textequ <[esp+8]>
StashESP textequ <[esp+16]>
push esi
push edi
mov edi, esp ;Remember esp as it is after the two pushes
sub esp, LocalFrameSize
and esp, 0FFFFFFF8H ;Align at 8-byte boundary
pxor mm0, mm0
mov StashESP, edi ;Epilogue reloads esp from this slot
mov edi, DWORD PTR [edi+12] ;pBuf (above 2 saved regs + return address)
add edi, 64 ;pBuf+64
xor eax, eax
;
; Guard against CountCoeff == 0. The loop below is a count-down loop that
; tests edx only after the first iteration; entering it with edx == 0 would
; read in front of pIQ_INDEX and wrap edx around to 0FFFFFFFFH.
;
test edx, edx ;CountCoeff == 0 ?
jz IDCT_Start ;Yes: nothing to scatter
;
; Decode coefficients and place them in the output buffer
; ecx: pIQ_INDEX
; edx: No_Coeff (counts down; entry edx-1 is handled on each iteration)
; edi: pBuf+64
; eax, esi: available
;
decode_coeff:
mov esi, [ecx+edx*8-4] ;Index
mov eax, [ecx+edx*8-8] ;Inverse quantized scaled coeff
mov WORD PTR [edi+esi*2-64], ax ;pBuf[Index] = low WORD of coeff
dec edx
jnz decode_coeff
IDCT_Start:
cols_0_3:
;
; First pass, part 1: word lanes 0-3 of all eight 16-byte buffer rows.
; edi = pBuf+64, so CLINEn (= 16*n - 64) addresses the first quadword of
; buffer row n. Each MMX register holds four WORDs, i.e. four independent
; 1-D transforms are computed side by side.
;
; Multiply idiom used throughout: punpcklwd/punpckhwd into a zeroed register
; places each source WORD in the upper half of a DWORD; the *BETA* tables
; keep their multiplier in the upper half as well, so pmaddwd produces one
; 32-bit product per DWORD. The product is rounded, shifted right and packed
; back to WORDs with packssdw.
;
; In this pass the products for s4, s5 and s6 are shifted by CONSTBITS+1 and
; the remaining terms are halved with psraw 1 before they are combined.
; NOTE(review): presumably this is the overflow fix mentioned in Rev 1.3 of
; the file header -- confirm.
;
; The eight results v0..v7 are transposed as two 4x4 WORD tiles (marked T1
; and T2 below) and written back over CLINE0-3 and CLINE4-7.
;
CLINE0 = 0 - 64
CLINE1 = 16 - 64
CLINE2 = 32 - 64
CLINE3 = 48 - 64
CLINE4 = 64 - 64
CLINE5 = 80 - 64
CLINE6 = 96 - 64
CLINE7 = 112- 64
;Odd part: q4 = line5-line3, q6 = line1-line7
pxor mm4, mm4 ;
movq mm0, [edi+CLINE5] ;
pxor mm5, mm5 ;
movq mm1, [edi+CLINE1] ;
pxor mm2, mm2 ;
psubw mm0, [edi+CLINE3] ;q4=r4
pxor mm3, mm3 ;
psubw mm1, [edi+CLINE7] ;q6=r6
punpcklwd mm4, mm0 ;
pmaddwd mm4, NEGBETA2 ;
punpckhwd mm5, mm0 ;
pmaddwd mm5, NEGBETA2 ;
psubw mm0, mm1 ;r4-r6
punpcklwd mm2, mm0 ;
pxor mm6, mm6 ;
pmaddwd mm2, BETA5 ;
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA5 ;
punpcklwd mm6, mm1 ;
pmaddwd mm6, BETA4 ;
pxor mm7, mm7 ;
punpckhwd mm7, mm1 ;
paddd mm4, mm2 ;s4l
pmaddwd mm7, BETA4 ;
paddd mm5, mm3 ;s4h
paddd mm4, CONSTBITS_P_1_RND ;s4l rounded
psubd mm6, mm2 ;s6l
paddd mm5, CONSTBITS_P_1_RND ;s4h rounded
psrad mm4, CONSTBITS+1 ;s4l rounded descaled
psubd mm7, mm3 ;s6h
psrad mm5, CONSTBITS+1 ;s4h rounded descaled
paddd mm6, CONSTBITS_P_1_RND ;s6l rounded
packssdw mm4, mm5 ;s4
paddd mm7, CONSTBITS_P_1_RND ;s6h rounded
psrad mm6, CONSTBITS+1 ;s6l rounded descaled
movq mm0, [edi+CLINE1] ;
psrad mm7, CONSTBITS+1 ;s6h rounded descaled
;mm0=q5 mm4=s4
;mm2=q7 mm6=s6
;q5 = line1+line7, q7 = line3+line5
paddw mm0, [edi+CLINE7] ;q5
packssdw mm6, mm7 ;s6
movq mm2, [edi+CLINE3] ;
pxor mm5, mm5 ;
paddw mm2, [edi+CLINE5] ;q7
movq mm7, mm0 ;q5
psubw mm0, mm2 ;r5=q5-q7
psraw mm7, 1 ;q5>>1
punpcklwd mm5, mm0
pxor mm3, mm3
pmaddwd mm5, BETA3 ;s5l
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA3 ;s5h
psraw mm2, 1 ;q7>>1
movq mm0, [edi+CLINE2]
paddw mm7, mm2 ;r7=s7=u7
paddd mm5, CONSTBITS_P_1_RND ;s5l rounded
psubw mm6, mm7 ;u6
paddd mm3, CONSTBITS_P_1_RND ;s5h rounded
psrad mm5, CONSTBITS+1 ;s5l rounded descaled
psubw mm0, [edi+CLINE6] ;r2
psrad mm3, CONSTBITS+1 ;s5h rounded descaled
packssdw mm5, mm3 ;s5
pxor mm1, mm1
;mm0=r2 mm4=s4
;mm1 mm5=u5
;mm2 mm6=u6
;mm3 mm7=Tu7
;Even part: r2 = line2-line6, r3 = line2+line6, t0/t1 = line0 +/- line4.
;u7 is spilled to the stack to free mm7.
movq Tu7, mm7 ;Save u7
pxor mm7, mm7
movq mm2, [edi+CLINE0]
punpcklwd mm1, mm0
pmaddwd mm1, BETA1 ;s2l
punpckhwd mm7, mm0
pmaddwd mm7, BETA1 ;s2h
psubw mm5, mm6 ;u5
movq mm0, [edi+CLINE2]
paddw mm4, mm5 ;-u4
;mm4=-u4 mm5=u5
;mm6=u6 mm7=u7
paddd mm1, CONSTBITS_RND ;s2l rounded
;
paddd mm7, CONSTBITS_RND ;s2h rounded
psrad mm1, CONSTBITS ;s2l rounded descaled
paddw mm0, [edi+CLINE6] ;r3=s3=t3
psrad mm7, CONSTBITS ;s2h rounded descaled
movq mm3, mm2 ;
packssdw mm1, mm7 ;s2
psubw mm2, [edi+CLINE4] ;t1
psubw mm1, mm0 ;t2=s2-s3
psraw mm0, 1 ;t3>>1
;
psraw mm2, 1 ;t1>>1
;
psraw mm1, 1 ;t2>>1
;
paddw mm3, [edi+CLINE4] ;t0
movq mm7, mm0 ;t3>>1 copy
psraw mm3, 1 ;t0>>1
;
paddw mm0, mm3 ;u0=t3+t0
psubw mm3, mm7 ;u3=t0-t3
; psraw mm3, 1 ;u3>>1
movq mm7, mm1 ;t2
paddw mm1, mm2 ;u1=t2+t1
psubw mm2, mm7 ;u2=t1-t2
;mm0=u0 mm4=-u4
;mm1=u1 mm5=u5
;mm2=u2 mm6=u6
;mm3=u3 mm7=avail.
;Final butterflies: v0..v7 from u0..u7. v5 is spilled to the stack.
; psraw mm2, 1 ;u2>>1
movq mm7, mm3 ;u3>>1
psubw mm3, mm4 ;v3=u3-(-u4)
paddw mm4, mm7 ;v4=-u4+u3
; psraw mm1, 1 ;u1>>1
movq mm7, mm2 ;u2>>1
; psraw mm0, 1 ;u0>>1
psubw mm2, mm5 ;v5=u2-u5
paddw mm5, mm7 ;v2=u5+u2
movq mm7, mm1 ;u1>>1
psubw mm1, mm6 ;v6=u1-u6
paddw mm6, mm7 ;v1=u6+u1
movq Tv5, mm2 ;Save v5
movq mm7, mm0 ;
movq mm2, mm5 ;T1
punpckhwd mm5, mm3 ;T1(c,d)
paddw mm7, Tu7 ;v0
;v0=mm7 v4=mm4
;v1=mm6 v5=Tv5 (to mm2 later)
;v2=mm5 v6=mm1
;v3=mm3 v7=mm0 (later)
;T1: 4x4 WORD transpose of v0..v3 (WORD interleave, then DWORD interleave),
;stored to CLINE0..CLINE3. T2 does the same for v4..v7 into CLINE4..CLINE7.
punpcklwd mm2, mm3 ;T1(c,d);mm3 free
movq mm3, mm7 ;T1(a,b)
punpckhwd mm7, mm6 ;T1(a,b)
punpcklwd mm3, mm6 ;T1(a,b);mm6 free
movq mm6, mm7 ;T1
psubw mm0, Tu7 ;v7
punpckldq mm7, mm5 ;T1
punpckhdq mm6, mm5 ;T1;mm5 free
movq mm5, mm3 ;T1
movq [edi+CLINE2], mm7 ;T1
punpckldq mm3, mm2 ;T1
movq [edi+CLINE3], mm6 ;T1
punpckhdq mm5, mm2 ;T1
movq [edi+CLINE0], mm3 ;T1
movq mm6, mm1 ;T2(c,d)
movq [edi+CLINE1], mm5 ;T1
punpckhwd mm1, mm0 ;T2(c,d)
movq mm2, Tv5
punpcklwd mm6, mm0 ;T2(c,d);mm0 free
movq mm7, mm4 ;T2(a,b)
punpckhwd mm4, mm2 ;T2(a,b)
punpcklwd mm7, mm2 ;T2(a,b);mm2 free
movq mm2, mm4 ;T2
punpckldq mm4, mm1 ;T2
; ;cols 4-7
punpckhdq mm2, mm1 ;T2
movq mm1, mm7 ;T2
movq [edi+CLINE6], mm4 ;T2
punpckhdq mm1, mm6 ;T2
movq [edi+CLINE7], mm2 ;T2
punpckldq mm7, mm6 ;T2
movq [edi+CLINE5], mm1 ;T2
; ;cols 4-7
movq [edi+CLINE4], mm7 ;T2
; ;cols 4-7
cols_4_7:
; Add 8 to CLINE offsets
;
; First pass, part 2: word lanes 4-7 of all eight buffer rows, i.e. the
; second quadword of each 16-byte row (CLINEn+8).
;
; Products are formed by unpacking each WORD into the upper half of a DWORD
; next to a zero WORD and multiplying with pmaddwd against a *BETA* table.
; Here every product is rounded with CONSTBITS_RND and shifted right by
; CONSTBITS; the halving is applied at the end, with psraw 1 on each of the
; sums v0..v7.
;
; The results are transposed as two 4x4 WORD tiles (T1: v0..v3, T2: v4..v7)
; and written back to CLINE0-3+8 and CLINE4-7+8.
;
;Odd part: q4 = line5-line3, q6 = line1-line7
pxor mm4, mm4 ;
movq mm0, [edi+CLINE5+8] ;
pxor mm5, mm5 ;
movq mm1, [edi+CLINE1+8] ;
pxor mm2, mm2 ;
psubw mm0, [edi+CLINE3+8] ;q4=r4
pxor mm3, mm3 ;
psubw mm1, [edi+CLINE7+8] ;q6=r6
punpcklwd mm4, mm0 ;
pmaddwd mm4, NEGBETA2 ;
punpckhwd mm5, mm0 ;
pmaddwd mm5, NEGBETA2 ;
psubw mm0, mm1 ;r4-r6
punpcklwd mm2, mm0 ;
pxor mm6, mm6 ;
pmaddwd mm2, BETA5 ;
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA5 ;
punpcklwd mm6, mm1 ;
pmaddwd mm6, BETA4 ;
pxor mm7, mm7 ;
punpckhwd mm7, mm1 ;
paddd mm4, mm2 ;s4l
pmaddwd mm7, BETA4 ;
paddd mm5, mm3 ;s4h
paddd mm4, CONSTBITS_RND ;s4l rounded
psubd mm6, mm2 ;s6l
paddd mm5, CONSTBITS_RND ;s4h rounded
psrad mm4, CONSTBITS ;s4l rounded descaled
psubd mm7, mm3 ;s6h
psrad mm5, CONSTBITS ;s4h rounded descaled
paddd mm6, CONSTBITS_RND ;s6l rounded
packssdw mm4, mm5 ;s4
paddd mm7, CONSTBITS_RND ;s6h rounded
psrad mm6, CONSTBITS ;s6l rounded descaled
movq mm0, [edi+CLINE1+8] ;
psrad mm7, CONSTBITS ;s6h rounded descaled
;mm0=q5 mm4=s4
;mm2=q7 mm6=s6
;q5 = line1+line7, q7 = line3+line5
paddw mm0, [edi+CLINE7+8] ;q5
packssdw mm6, mm7 ;s6
movq mm2, [edi+CLINE3+8] ;
pxor mm5, mm5 ;
paddw mm2, [edi+CLINE5+8] ;q7
movq mm7, mm0 ;q5
psubw mm0, mm2 ;r5=q5-q7
;TODO
punpcklwd mm5, mm0
pxor mm3, mm3
pmaddwd mm5, BETA3 ;s5l
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA3 ;s5h
;TODO
movq mm0, [edi+CLINE2+8]
paddw mm7, mm2 ;r7=s7=u7
paddd mm5, CONSTBITS_RND ;s5l rounded
psubw mm6, mm7 ;u6
paddd mm3, CONSTBITS_RND ;s5h rounded
psrad mm5, CONSTBITS ;s5l rounded descaled
psubw mm0, [edi+CLINE6+8] ;r2
psrad mm3, CONSTBITS ;s5h rounded descaled
packssdw mm5, mm3 ;s5
pxor mm1, mm1
;mm0=r2 mm4=s4
;mm1 mm5=u5
;mm2 mm6=u6
;mm3 mm7=Tu7
;Even part: r2 = line2-line6, r3 = line2+line6, t0/t1 = line0 +/- line4.
;u7 is spilled to the stack to free mm7.
movq Tu7, mm7 ;Save u7
pxor mm7, mm7
movq mm2, [edi+CLINE0+8]
punpcklwd mm1, mm0
pmaddwd mm1, BETA1 ;s2l
punpckhwd mm7, mm0
pmaddwd mm7, BETA1 ;s2h
psubw mm5, mm6 ;u5
movq mm0, [edi+CLINE2+8]
paddw mm4, mm5 ;-u4
;mm4=-u4 mm5=u5
;mm6=u6 mm7=u7
paddd mm1, CONSTBITS_RND ;s2l rounded
;
paddd mm7, CONSTBITS_RND ;s2h rounded
psrad mm1, CONSTBITS ;s2l rounded descaled
paddw mm0, [edi+CLINE6+8] ;r3=s3=t3
psrad mm7, CONSTBITS ;s2h rounded descaled
movq mm3, mm2 ;
packssdw mm1, mm7 ;s2
psubw mm2, [edi+CLINE4+8] ;t1
psubw mm1, mm0 ;t2=s2-s3
paddw mm3, [edi+CLINE4+8] ;t0
movq mm7, mm0 ;t3
paddw mm0, mm3 ;u0=t3+t0
psubw mm3, mm7 ;u3=t0-t3
movq mm7, mm1 ;t2
paddw mm1, mm2 ;u1=t2+t1
psubw mm2, mm7 ;u2=t1-t2
;mm0=u0 mm4=-u4
;mm1=u1 mm5=u5
;mm2=u2 mm6=u6
;mm3=u3 mm7=avail.
;Final butterflies; each sum/difference is halved (psraw 1) to give v0..v7.
movq mm7, mm3 ;
psubw mm3, mm4 ;u3-(-u4)
paddw mm4, mm7 ;-u4+u3
psraw mm3, 1 ;v3
movq mm7, mm2 ;
psraw mm4, 1 ;v4
psubw mm2, mm5 ;u2-u5
psraw mm2, 1 ;v5
paddw mm5, mm7 ;u5+u2
psraw mm5, 1 ;v2
movq mm7, mm1 ;
psubw mm1, mm6 ;u1-u6
paddw mm6, mm7 ;u6+u1
movq Tv5, mm2 ;Save v5
psraw mm1, 1 ;v6
psraw mm6, 1 ;v1
movq mm7, mm0 ;
movq mm2, mm5 ;T1
punpckhwd mm5, mm3 ;T1(c,d)
paddw mm7, Tu7 ;
;TODO
psraw mm7, 1 ;v0
;TODO
;v0=mm7 v4=mm4
;v1=mm6 v5=Tv5 (to mm2 later)
;v2=mm5 v6=mm1
;v3=mm3 v7=mm0 (later)
;T1: 4x4 WORD transpose of v0..v3 into CLINE0..3+8; T2: same for v4..v7
;into CLINE4..7+8.
punpcklwd mm2, mm3 ;T1(c,d);mm3 free
movq mm3, mm7 ;T1(a,b)
punpckhwd mm7, mm6 ;T1(a,b)
punpcklwd mm3, mm6 ;T1(a,b);mm6 free
movq mm6, mm7 ;T1
psubw mm0, Tu7 ;
punpckldq mm7, mm5 ;T1
psraw mm0, 1 ;v7
;TODO
punpckhdq mm6, mm5 ;T1;mm5 free
movq mm5, mm3 ;T1
movq [edi+CLINE2+8], mm7 ;T1
punpckldq mm3, mm2 ;T1
movq [edi+CLINE3+8], mm6 ;T1
punpckhdq mm5, mm2 ;T1
movq [edi+CLINE0+8], mm3 ;T1
movq mm6, mm1 ;T2(c,d)
movq [edi+CLINE1+8], mm5 ;T1
punpckhwd mm1, mm0 ;T2(c,d)
movq mm2, Tv5
punpcklwd mm6, mm0 ;T2(c,d);mm0 free
movq mm7, mm4 ;T2(a,b)
punpckhwd mm4, mm2 ;T2(a,b)
punpcklwd mm7, mm2 ;T2(a,b);mm2 free
movq mm2, mm4 ;T2
punpckldq mm4, mm1 ;T2
; ;cols 4-7
punpckhdq mm2, mm1 ;T2
movq mm1, mm7 ;T2
movq [edi+CLINE6+8], mm4 ;T2
punpckhdq mm1, mm6 ;T2
movq [edi+CLINE7+8], mm2 ;T2
punpckldq mm7, mm6 ;T2
movq [edi+CLINE5+8], mm1 ;T2
; ;cols 4-7
movq [edi+CLINE4+8], mm7 ;T2
; ;cols 4-7
rows_0_3:
;
; Second pass, part 1: operates on the first 64 bytes of the buffer
; (buffer rows 0-3). RLINE0-3 address the first quadword of buffer rows 0-3
; and RLINE4-7 the second quadword of the same rows, so the eight "lines"
; of this pass are the eight quadwords of the upper half of the block.
; NOTE(review): given the 4x4 tile transposes done by the first pass, these
; eight quadwords should be the eight inputs of the 1-D transform for four
; output rows at once -- confirm against a reference IDCT.
;
; Scaling in this pass: s4, s5 and s6 are shifted by CONSTBITS+1, s2 by
; CONSTBITS; s7 is (r7+1)>>1; u4..u7 are doubled again (psllw 1) before the
; final butterflies.
;
; Results v0..v3 are transposed (T1) into RLINE0-3 and v4..v7 (T2) into
; RLINE4-7.
;
RLINE0 = 0 - 64
RLINE1 = 16 - 64
RLINE2 = 32 - 64
RLINE3 = 48 - 64
RLINE4 = 8 - 64
RLINE5 = 24 - 64
RLINE6 = 40 - 64
RLINE7 = 56 - 64
;Odd part: q4 = line5-line3, q6 = line1-line7
pxor mm4, mm4 ;
movq mm0, [edi+RLINE5] ;
pxor mm5, mm5 ;
movq mm1, [edi+RLINE1] ;
pxor mm2, mm2 ;
psubw mm0, [edi+RLINE3] ;q4=r4
pxor mm3, mm3 ;
psubw mm1, [edi+RLINE7] ;q6=r6
punpcklwd mm4, mm0 ;
pmaddwd mm4, NEGBETA2 ;
punpckhwd mm5, mm0 ;
pmaddwd mm5, NEGBETA2 ;
psubw mm0, mm1 ;r4-r6
punpcklwd mm2, mm0 ;
pxor mm6, mm6 ;
pmaddwd mm2, BETA5 ;
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA5 ;
punpcklwd mm6, mm1 ;
pmaddwd mm6, BETA4 ;
pxor mm7, mm7 ;
punpckhwd mm7, mm1 ;
paddd mm4, mm2 ;s4l
pmaddwd mm7, BETA4 ;
paddd mm5, mm3 ;s4h
paddd mm4, CONSTBITS_P_1_RND ;s4l rounded
psubd mm6, mm2 ;s6l
paddd mm5, CONSTBITS_P_1_RND ;s4h rounded
psrad mm4, CONSTBITS+1 ;s4l rounded descaled
psubd mm7, mm3 ;s6h
psrad mm5, CONSTBITS+1 ;s4h rounded descaled
paddd mm6, CONSTBITS_P_1_RND ;s6l rounded
packssdw mm4, mm5 ;s4
paddd mm7, CONSTBITS_P_1_RND ;s6h rounded
psrad mm6, CONSTBITS+1 ;s6l rounded descaled
movq mm0, [edi+RLINE1] ;
psrad mm7, CONSTBITS+1 ;s6h rounded descaled
;mm0=q5 mm4=s4
;mm2=q7 mm6=s6
;q5 = line1+line7, q7 = line3+line5
paddw mm0, [edi+RLINE7] ;q5
packssdw mm6, mm7 ;s6
movq mm2, [edi+RLINE3] ;
pxor mm5, mm5 ;
paddw mm2, [edi+RLINE5] ;q7
movq mm7, mm0 ;q5
psubw mm0, mm2 ;r5=q5-q7
paddw mm7, mm2 ;r7=q5+q7
punpcklwd mm5, mm0
pxor mm3, mm3
pmaddwd mm5, BETA3 ;s5l
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA3 ;s5h
;TODO
;Add 1 to every WORD lane so that the psraw 1 below rounds
paddw mm7, ONE ;
;TODO
movq mm0, [edi+RLINE2]
psraw mm7, 1 ;s7
paddd mm5, CONSTBITS_P_1_RND ;s5l rounded
psubw mm6, mm7 ;u6
paddd mm3, CONSTBITS_P_1_RND ;s5h rounded
psrad mm5, CONSTBITS+1 ;s5l rounded descaled
psubw mm0, [edi+RLINE6] ;r2
psrad mm3, CONSTBITS+1 ;s5h rounded descaled
packssdw mm5, mm3 ;s5
pxor mm1, mm1
;mm0=r2 mm4=s4
;mm1 mm5=u5
;mm2 mm6=u6
;mm3 mm7=Tu7
psllw mm7, 1 ;u7<<1
;
;Even part: r2 = line2-line6, r3 = line2+line6, t0/t1 = line0 +/- line4.
;u7<<1 is spilled to the stack to free mm7.
movq Tu7, mm7 ;Save u7<<1
pxor mm7, mm7
movq mm2, [edi+RLINE0]
punpcklwd mm1, mm0
pmaddwd mm1, BETA1 ;s2l
punpckhwd mm7, mm0
pmaddwd mm7, BETA1 ;s2h
psubw mm5, mm6 ;u5
movq mm0, [edi+RLINE2]
paddw mm4, mm5 ;-u4
;mm4=-u4 mm5=u5
;mm6=u6 mm7=
paddd mm1, CONSTBITS_RND ;s2l rounded
;
paddd mm7, CONSTBITS_RND ;s2h rounded
psrad mm1, CONSTBITS ;s2l rounded descaled
paddw mm0, [edi+RLINE6] ;r3=s3=t3
psrad mm7, CONSTBITS ;s2h rounded descaled
movq mm3, mm2 ;
packssdw mm1, mm7 ;s2
psubw mm2, [edi+RLINE4] ;t1
psubw mm1, mm0 ;t2=s2-s3
paddw mm3, [edi+RLINE4] ;t0
movq mm7, mm0 ;t3
paddw mm0, mm3 ;u0=t3+t0
psubw mm3, mm7 ;u3=t0-t3
;TODO
movq mm7, mm1 ;t2
paddw mm1, mm2 ;u1=t2+t1
psubw mm2, mm7 ;u2=t1-t2
;mm0=u0 mm4=-u4
;mm1=u1 mm5=u5
;mm2=u2 mm6=u6
;mm3=u3 mm7=avail.
;Final butterflies; the odd terms are doubled first. v5 is spilled.
psllw mm4, 1 ;-u4<<1
movq mm7, mm3 ;
psubw mm3, mm4 ;v3=u3-(-u4<<1)
paddw mm4, mm7 ;v4=(-u4<<1)+u3
psllw mm5, 1 ;u5<<1
movq mm7, mm2 ;
psubw mm2, mm5 ;v5=u2-(u5<<1)
paddw mm5, mm7 ;v2=(u5<<1)+u2
psllw mm6, 1 ;u6<<1
movq mm7, mm1 ;
psubw mm1, mm6 ;v6=u1-(u6<<1)
paddw mm6, mm7 ;v1=(u6<<1)+u1
movq Tv5, mm2 ;Save v5
movq mm7, mm0 ;
movq mm2, mm5 ;T1
punpckhwd mm5, mm3 ;T1(c,d)
paddw mm7, Tu7 ;v0=u0+(u7<<1)
;v0=mm7 v4=mm4
;v1=mm6 v5=Tv5 (to mm2 later)
;v2=mm5 v6=mm1
;v3=mm3 v7=mm0 (later)
;T1: 4x4 WORD transpose of v0..v3 into RLINE0..3; T2: same for v4..v7 into
;RLINE4..7.
punpcklwd mm2, mm3 ;T1(c,d);mm3 free
movq mm3, mm7 ;T1(a,b)
punpckhwd mm7, mm6 ;T1(a,b)
punpcklwd mm3, mm6 ;T1(a,b);mm6 free
movq mm6, mm7 ;T1
psubw mm0, Tu7 ;v7=u0-(u7<<1)
punpckldq mm7, mm5 ;T1
punpckhdq mm6, mm5 ;T1;mm5 free
movq mm5, mm3 ;T1
movq [edi+RLINE2], mm7 ;T1
punpckldq mm3, mm2 ;T1
movq [edi+RLINE3], mm6 ;T1
punpckhdq mm5, mm2 ;T1
movq [edi+RLINE0], mm3 ;T1
movq mm6, mm1 ;T2(c,d)
movq [edi+RLINE1], mm5 ;T1
punpckhwd mm1, mm0 ;T2(c,d)
movq mm2, Tv5
punpcklwd mm6, mm0 ;T2(c,d);mm0 free
movq mm7, mm4 ;T2(a,b)
punpckhwd mm4, mm2 ;T2(a,b)
punpcklwd mm7, mm2 ;T2(a,b);mm2 free
movq mm2, mm4 ;T2
punpckldq mm4, mm1 ;T2
; ;cols 4-7
punpckhdq mm2, mm1 ;T2
movq mm1, mm7 ;T2
movq [edi+RLINE6], mm4 ;T2
punpckhdq mm1, mm6 ;T2
movq [edi+RLINE7], mm2 ;T2
punpckldq mm7, mm6 ;T2
movq [edi+RLINE5], mm1 ;T2
; ;cols 4-7
movq [edi+RLINE4], mm7 ;T2
; ;cols 4-7
rows_4_7:
; Add 64 to RLINE offsets
;
; Second pass, part 2: operates on the last 64 bytes of the buffer (buffer
; rows 4-7). RLINEn+64 for n = 0..3 addresses the first quadword of buffer
; rows 4-7 and for n = 4..7 the second quadword of the same rows.
;
; Scaling in this pass: s4, s5 and s6 are shifted by CONSTBITS+1, s2 by
; CONSTBITS; s7 is (r7+1)>>1; u4..u7 are doubled (psllw 1) before the final
; butterflies.
;
; Results v0..v3 are transposed (T1) into RLINE0-3+64 and v4..v7 (T2) into
; RLINE4-7+64. This is the last pass; control then falls into IDCT_Done.
;
;Odd part: q4 = line5-line3, q6 = line1-line7
pxor mm4, mm4 ;
movq mm0, [edi+RLINE5+64] ;
pxor mm5, mm5 ;
movq mm1, [edi+RLINE1+64] ;
pxor mm2, mm2 ;
psubw mm0, [edi+RLINE3+64] ;q4=r4
pxor mm3, mm3 ;
psubw mm1, [edi+RLINE7+64] ;q6=r6
punpcklwd mm4, mm0 ;
pmaddwd mm4, NEGBETA2 ;
punpckhwd mm5, mm0 ;
pmaddwd mm5, NEGBETA2 ;
psubw mm0, mm1 ;r4-r6
punpcklwd mm2, mm0 ;
pxor mm6, mm6 ;
pmaddwd mm2, BETA5 ;
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA5 ;
punpcklwd mm6, mm1 ;
pmaddwd mm6, BETA4 ;
pxor mm7, mm7 ;
punpckhwd mm7, mm1 ;
paddd mm4, mm2 ;s4l
pmaddwd mm7, BETA4 ;
paddd mm5, mm3 ;s4h
paddd mm4, CONSTBITS_P_1_RND ;s4l rounded
psubd mm6, mm2 ;s6l
paddd mm5, CONSTBITS_P_1_RND ;s4h rounded
psrad mm4, CONSTBITS+1 ;s4l rounded descaled
psubd mm7, mm3 ;s6h
psrad mm5, CONSTBITS+1 ;s4h rounded descaled
paddd mm6, CONSTBITS_P_1_RND ;s6l rounded
packssdw mm4, mm5 ;s4
paddd mm7, CONSTBITS_P_1_RND ;s6h rounded
psrad mm6, CONSTBITS+1 ;s6l rounded descaled
movq mm0, [edi+RLINE1+64] ;
psrad mm7, CONSTBITS+1 ;s6h rounded descaled
;mm0=q5 mm4=s4
;mm2=q7 mm6=s6
;q5 = line1+line7, q7 = line3+line5
paddw mm0, [edi+RLINE7+64] ;q5
packssdw mm6, mm7 ;s6
movq mm2, [edi+RLINE3+64] ;
pxor mm5, mm5 ;
paddw mm2, [edi+RLINE5+64] ;q7
movq mm7, mm0 ;q5
psubw mm0, mm2 ;r5=q5-q7
paddw mm7, mm2 ;r7=q5+q7
punpcklwd mm5, mm0
pxor mm3, mm3
pmaddwd mm5, BETA3 ;s5l
punpckhwd mm3, mm0 ;
pmaddwd mm3, BETA3 ;s5h
;TODO
;Add 1 to every WORD lane so that the psraw 1 below rounds
paddw mm7, ONE ;
;TODO
movq mm0, [edi+RLINE2+64]
psraw mm7, 1 ;s7
paddd mm5, CONSTBITS_P_1_RND ;s5l rounded
psubw mm6, mm7 ;u6
paddd mm3, CONSTBITS_P_1_RND ;s5h rounded
psrad mm5, CONSTBITS+1 ;s5l rounded descaled
psubw mm0, [edi+RLINE6+64] ;r2
psrad mm3, CONSTBITS+1 ;s5h rounded descaled
packssdw mm5, mm3 ;s5
pxor mm1, mm1
;mm0=r2 mm4=s4
;mm1 mm5=u5
;mm2 mm6=u6
;mm3 mm7=Tu7
psllw mm7, 1 ;u7<<1
;
;Even part: r2 = line2-line6, r3 = line2+line6, t0/t1 = line0 +/- line4.
;u7<<1 is spilled to the stack to free mm7.
movq Tu7, mm7 ;Save u7<<1
pxor mm7, mm7
movq mm2, [edi+RLINE0+64]
punpcklwd mm1, mm0
pmaddwd mm1, BETA1 ;s2l
punpckhwd mm7, mm0
pmaddwd mm7, BETA1 ;s2h
psubw mm5, mm6 ;u5
movq mm0, [edi+RLINE2+64]
paddw mm4, mm5 ;-u4
;mm4=-u4 mm5=u5
;mm6=u6 mm7=
paddd mm1, CONSTBITS_RND ;s2l rounded
;
paddd mm7, CONSTBITS_RND ;s2h rounded
psrad mm1, CONSTBITS ;s2l rounded descaled
paddw mm0, [edi+RLINE6+64] ;r3=s3=t3
psrad mm7, CONSTBITS ;s2h rounded descaled
movq mm3, mm2 ;
packssdw mm1, mm7 ;s2
psubw mm2, [edi+RLINE4+64] ;t1
psubw mm1, mm0 ;t2=s2-s3
paddw mm3, [edi+RLINE4+64] ;t0
movq mm7, mm0 ;t3
paddw mm0, mm3 ;u0=t3+t0
psubw mm3, mm7 ;u3=t0-t3
;TODO
movq mm7, mm1 ;t2
paddw mm1, mm2 ;u1=t2+t1
psubw mm2, mm7 ;u2=t1-t2
;mm0=u0 mm4=-u4
;mm1=u1 mm5=u5
;mm2=u2 mm6=u6
;mm3=u3 mm7=avail.
;Final butterflies; the odd terms are doubled first. v5 is spilled.
psllw mm4, 1 ;-u4<<1
movq mm7, mm3 ;
psubw mm3, mm4 ;v3=u3-(-u4<<1)
paddw mm4, mm7 ;v4=(-u4<<1)+u3
psllw mm5, 1 ;u5<<1
movq mm7, mm2 ;
psubw mm2, mm5 ;v5=u2-(u5<<1)
paddw mm5, mm7 ;v2=(u5<<1)+u2
psllw mm6, 1 ;u6<<1
movq mm7, mm1 ;
psubw mm1, mm6 ;v6=u1-(u6<<1)
paddw mm6, mm7 ;v1=(u6<<1)+u1
movq Tv5, mm2 ;Save v5
movq mm7, mm0 ;
movq mm2, mm5 ;T1
punpckhwd mm5, mm3 ;T1(c,d)
paddw mm7, Tu7 ;v0=u0+(u7<<1)
;v0=mm7 v4=mm4
;v1=mm6 v5=Tv5 (to mm2 later)
;v2=mm5 v6=mm1
;v3=mm3 v7=mm0 (later)
;T1: 4x4 WORD transpose of v0..v3 into RLINE0..3+64; T2: same for v4..v7
;into RLINE4..7+64.
punpcklwd mm2, mm3 ;T1(c,d);mm3 free
movq mm3, mm7 ;T1(a,b)
punpckhwd mm7, mm6 ;T1(a,b)
punpcklwd mm3, mm6 ;T1(a,b);mm6 free
movq mm6, mm7 ;T1
psubw mm0, Tu7 ;v7=u0-(u7<<1)
punpckldq mm7, mm5 ;T1
punpckhdq mm6, mm5 ;T1;mm5 free
movq mm5, mm3 ;T1
movq [edi+RLINE2+64], mm7 ;T1
punpckldq mm3, mm2 ;T1
movq [edi+RLINE3+64], mm6 ;T1
punpckhdq mm5, mm2 ;T1
movq [edi+RLINE0+64], mm3 ;T1
movq mm6, mm1 ;T2(c,d)
movq [edi+RLINE1+64], mm5 ;T1
punpckhwd mm1, mm0 ;T2(c,d)
movq mm2, Tv5
punpcklwd mm6, mm0 ;T2(c,d);mm0 free
movq mm7, mm4 ;T2(a,b)
punpckhwd mm4, mm2 ;T2(a,b)
punpcklwd mm7, mm2 ;T2(a,b);mm2 free
movq mm2, mm4 ;T2
punpckldq mm4, mm1 ;T2
; ;cols 4-7
punpckhdq mm2, mm1 ;T2
movq mm1, mm7 ;T2
movq [edi+RLINE6+64], mm4 ;T2
punpckhdq mm1, mm6 ;T2
movq [edi+RLINE7+64], mm2 ;T2
punpckldq mm7, mm6 ;T2
movq [edi+RLINE5+64], mm1 ;T2
; ;cols 4-7
movq [edi+RLINE4+64], mm7 ;T2
; ;cols 4-7
IDCT_Done:
;
; Exit path. StashESP holds the esp value captured right after the two
; pushes in the prologue, so reloading it discards the aligned local frame
; regardless of how many bytes the alignment consumed. edi and esi are then
; popped in reverse push order and <ret 4> removes the one stack argument
; (pBuf).
; NOTE(review): MMX state is left as-is (no EMMS in this routine);
; presumably the caller issues EMMS -- confirm.
;
mov esp, StashESP
pop edi
pop esi
ret 4
@MMX_DecodeBlock_IDCT@12 endp
MMXCODE1 ENDS
END