You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
832 lines
38 KiB
832 lines
38 KiB
;/* *************************************************************************
|
|
;** INTEL Corporation Proprietary Information
|
|
;**
|
|
;** This listing is supplied under the terms of a license
|
|
;** agreement with INTEL Corporation and may not be copied
|
|
;** nor disclosed except in accordance with the terms of
|
|
;** that agreement.
|
|
;**
|
|
;** Copyright (c) 1995 Intel Corporation.
|
|
;** Copyright (c) 1996 Intel Corporation.
|
|
;** All Rights Reserved.
|
|
;**
|
|
;** *************************************************************************
|
|
;*/
|
|
;/* *************************************************************************
|
|
;** $Header: S:\h26x\src\dec\dxmidct.asv 1.5 09 Jul 1996 16:51:26 AGUPTA2 $
|
|
;** $Log: S:\h26x\src\dec\dxmidct.asv $
|
|
;//
|
|
;// Rev 1.5 09 Jul 1996 16:51:26 AGUPTA2
|
|
;// IDCT now expects actual number of coeffs.
|
|
;//
|
|
;// Rev 1.4 08 Jul 1996 11:42:50 AGUPTA2
|
|
;// Fixed the accuracy problem where a shift was in the wrong place.
|
|
;//
|
|
;// Rev 1.3 30 May 1996 12:25:02 AGUPTA2
|
|
;// Fixed the overflow problem in computing u0-u3 in first four columns.
|
|
;//
|
|
;// Rev 1.2 09 Apr 1996 09:42:08 agupta2
|
|
;// Code to clear IDCT buffer moved to MMX_BlockCopy and MMX_BlockMove.
|
|
;//
|
|
;// Rev 1.1 22 Mar 1996 10:17:26 agupta2
|
|
;// Initial revision of MMX version of IDCT.
|
|
;//
|
|
;// Rev 1.0 14 Mar 1996 14:38:02 AGUPTA2
|
|
;// Initial revision.
|
|
;** *************************************************************************
|
|
;*/
|
|
.586
|
|
.model flat
|
|
OPTION PROLOGUE:None
|
|
OPTION EPILOGUE:None
|
|
|
|
.xlist
|
|
include iammx.inc
|
|
.list
|
|
|
|
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
|
|
MMXCODE1 ENDS
|
|
|
|
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
|
|
MMXDATA1 ENDS
|
|
|
|
MMXDATA1 SEGMENT
;
; Fixed-point multipliers and rounding terms used by the IDCT.
;
; Each BETA multiplier is its real value scaled by 2^CONSTBITS and rounded to
; a 16-bit integer, e.g.
;       BETA1 = ROUND(1.414213562 * 2^13) = 02D41H
; Real values:
;       BETA1 = 1.414213562
;       BETA2 = 2.613125930     (stored negated, under the name NEGBETA2)
;       BETA3 = 1.414213562     (shares storage with BETA1)
;       BETA4 = 1.082392200
;       BETA5 = 0.765366865
;
; Every table is one 8-byte-aligned quadword holding the same DWORD twice.
; The multiplier tables keep the constant in the upper WORD of each DWORD and
; zero in the lower WORD.
;
CONSTBITS = 13                          ;number of fraction bits in BETAx

        ALIGN     8
BETA1   LABEL     DWORD
BETA3   LABEL     DWORD
        DWORD     2 DUP (02D410000H)    ;02D41H in the upper word

        ALIGN     8
NEGBETA2 LABEL    DWORD
        DWORD     2 DUP (0AC610000H)    ;0AC61H (negative as a signed word)

        ALIGN     8
BETA4   LABEL     DWORD
        DWORD     2 DUP (022A30000H)    ;022A3H in the upper word

        ALIGN     8
BETA5   LABEL     DWORD
        DWORD     2 DUP (0187E0000H)    ;0187EH in the upper word

        ALIGN     8
CONSTBITS_P_1_RND LABEL DWORD           ;added before a shift by CONSTBITS+1
        DWORD     2 DUP (02000H)        ;1 SHL CONSTBITS

        ALIGN     8
CONSTBITS_RND LABEL DWORD               ;added before a shift by CONSTBITS
        DWORD     2 DUP (01000H)        ;1 SHL (CONSTBITS-1)

        ALIGN     8
ONE     LABEL     DWORD
        DWORD     2 DUP (000010001H)    ;the value 1 in each of four words
MMXDATA1 ENDS
|
|
|
|
MMXCODE1 SEGMENT
|
|
;
|
|
;
|
|
;
|
|
@MMX_DecodeBlock_IDCT@12 PROC
;
; Scatter a list of decoded coefficients into an 8x8 WORD block and run a
; scaled inverse DCT over that block in place.
;
; If called from a C function, this routine must be declared as:
;       extern "C" void _fastcall MMX_DecodeBlock_IDCT(...)
;
; Parameters:
;   pIQ_INDEX: DWORD PTR (in ecx)
;       Pointer to an array of 8-byte coefficient structures: a DWORD holding
;       the inverse quantized and scaled coefficient (only its low WORD is
;       stored) followed by a DWORD holding the coefficient's index.
;   CountCoeff: DWORD (in edx)
;       Number of structures in the array, <= 64.  Zero is accepted: the
;       scatter loop is skipped and the IDCT runs on the buffer as it is.
;   pBuf: WORD PTR (at [esp+4] on entry)
;       Output area for the IDCT; an 8X8 matrix of WORD values with 6 frac.
;       bits.  Each coefficient is written to pBuf + 2*index.
;       NOTE(review): indices are not range checked -- presumably the caller
;       guarantees 0..63; confirm.
;
; Algorithm:
;   Scaled IDCT credited to Arai, Agui, and Nakajima (AAN).
;       1) Decode the pIQ_INDEX array and populate the output buffer
;       2) IDCT and write to the output buffer
;
; Registers:
;   esi and edi are saved here and restored at IDCT_Done; eax, edx, flags and
;   mm0-mm7 are modified.  No EMMS is executed anywhere in this routine --
;   presumably the caller is expected to issue it; confirm.
;   pBuf is removed from the stack by the routine itself (ret 4).
;
; Local frame (8-byte aligned, addressed from esp):
;       [esp+0]   Tu7       8-byte spill slot for u7
;       [esp+8]   Tv5       8-byte spill slot for v5
;       [esp+16]  StashESP  esp as it was right after the two pushes
;
LocalFrameSize = 24

Tu7      textequ <[esp+0]>
Tv5      textequ <[esp+8]>
StashESP textequ <[esp+16]>

        push      esi
        push      edi
        mov       edi, esp                ;remember the unaligned esp
        sub       esp, LocalFrameSize
        and       esp, 0FFFFFFF8H         ;align the frame at an 8-byte boundary
        pxor      mm0, mm0                ;(mm0 is reloaded before it is read)
        mov       StashESP, edi
        mov       edi, DWORD PTR [edi+12] ;pBuf: 2 pushes + return address above
        add       edi, 64                 ;edi = pBuf+64 for the whole routine
        xor       eax, eax

;
; An empty coefficient list must not enter the loop below: the loop body runs
; before its counter is tested, so a zero count would read in front of the
; array and then wrap edx around to 0FFFFFFFFH.
;
        test      edx, edx
        jz        IDCT_Start

;
; Decode coefficients and place them in the output buffer.
;   ecx: pIQ_INDEX
;   edx: entries still to process (counts down, so the array is walked from
;        its last entry to its first)
;   edi: pBuf+64
;   eax, esi: scratch
;
decode_coeff:
        mov       esi, [ecx+edx*8-4]      ;index field of entry edx-1
        mov       eax, [ecx+edx*8-8]      ;coefficient field of entry edx-1
        mov       WORD PTR [edi+esi*2-64], ax ;pBuf[index] = low word of coeff
        dec       edx
        jnz       decode_coeff
|
|
|
|
IDCT_Start:

;
; First pass over the left half of the block (label cols_0_3).
;
; CLINEn is the byte offset, relative to edi (= pBuf+64), of the 16-byte line
; n of the block; each quadword access here touches the first four WORDs of a
; line, so four columns are processed in parallel.
;
; Multiplications are done by interleaving a zeroed register with the samples
; (sample ends up in the upper word of each dword) and using pmaddwd against
; a BETA table, then rounding, shifting right and re-packing to words.
;
; In this section the multiplier products are shifted by CONSTBITS+1 and
; q5, q7 and t0-t3 are shifted right by one before being combined, so u0-u7
; are formed at half scale.  (Presumably this is the overflow fix for u0-u3
; mentioned in the revision log -- confirm.)
;
; The eight result quadwords v0-v7 are transposed as two 4x4 word tiles and
; written back: v0-v3 to CLINE0-CLINE3, v4-v7 to CLINE4-CLINE7.
;
; The instruction order is the original hand-scheduled order; do not reorder.
;
cols_0_3:
CLINE0  = 0   - 64
CLINE1  = 16  - 64
CLINE2  = 32  - 64
CLINE3  = 48  - 64
CLINE4  = 64  - 64
CLINE5  = 80  - 64
CLINE6  = 96  - 64
CLINE7  = 112 - 64

        ; ---- odd part: s4 and s6 from lines 1, 3, 5, 7 ----
        pxor      mm4, mm4
        movq      mm0, [edi+CLINE5]
        pxor      mm5, mm5
        movq      mm1, [edi+CLINE1]
        pxor      mm2, mm2
        psubw     mm0, [edi+CLINE3]       ;q4=r4 = line5-line3
        pxor      mm3, mm3
        psubw     mm1, [edi+CLINE7]       ;q6=r6 = line1-line7
        punpcklwd mm4, mm0                ;r4, low pair, in upper words
        pmaddwd   mm4, NEGBETA2           ;-BETA2*r4 (low)
        punpckhwd mm5, mm0                ;r4, high pair
        pmaddwd   mm5, NEGBETA2           ;-BETA2*r4 (high)
        psubw     mm0, mm1                ;r4-r6
        punpcklwd mm2, mm0
        pxor      mm6, mm6
        pmaddwd   mm2, BETA5              ;BETA5*(r4-r6) (low)
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA5              ;BETA5*(r4-r6) (high)
        punpcklwd mm6, mm1
        pmaddwd   mm6, BETA4              ;BETA4*r6 (low)
        pxor      mm7, mm7
        punpckhwd mm7, mm1
        paddd     mm4, mm2                ;s4l
        pmaddwd   mm7, BETA4              ;BETA4*r6 (high)
        paddd     mm5, mm3                ;s4h
        paddd     mm4, CONSTBITS_P_1_RND  ;round s4l
        psubd     mm6, mm2                ;s6l
        paddd     mm5, CONSTBITS_P_1_RND  ;round s4h
        psrad     mm4, CONSTBITS+1        ;descale s4l
        psubd     mm7, mm3                ;s6h
        psrad     mm5, CONSTBITS+1        ;descale s4h
        paddd     mm6, CONSTBITS_P_1_RND  ;round s6l
        packssdw  mm4, mm5                ;mm4 = s4
        paddd     mm7, CONSTBITS_P_1_RND  ;round s6h
        psrad     mm6, CONSTBITS+1        ;descale s6l
        movq      mm0, [edi+CLINE1]
        psrad     mm7, CONSTBITS+1        ;descale s6h

        ; ---- odd part: s5, u7, u6 ----
        paddw     mm0, [edi+CLINE7]       ;q5 = line1+line7
        packssdw  mm6, mm7                ;mm6 = s6
        movq      mm2, [edi+CLINE3]
        pxor      mm5, mm5
        paddw     mm2, [edi+CLINE5]       ;q7 = line3+line5
        movq      mm7, mm0                ;copy of q5
        psubw     mm0, mm2                ;r5 = q5-q7
        psraw     mm7, 1                  ;q5>>1
        punpcklwd mm5, mm0
        pxor      mm3, mm3
        pmaddwd   mm5, BETA3              ;s5l
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA3              ;s5h
        psraw     mm2, 1                  ;q7>>1
        movq      mm0, [edi+CLINE2]
        paddw     mm7, mm2                ;r7=s7=u7 = (q5>>1)+(q7>>1)
        paddd     mm5, CONSTBITS_P_1_RND  ;round s5l
        psubw     mm6, mm7                ;u6 = s6-u7
        paddd     mm3, CONSTBITS_P_1_RND  ;round s5h
        psrad     mm5, CONSTBITS+1        ;descale s5l
        psubw     mm0, [edi+CLINE6]       ;r2 = line2-line6
        psrad     mm3, CONSTBITS+1        ;descale s5h
        packssdw  mm5, mm3                ;mm5 = s5
        pxor      mm1, mm1

        ; ---- even part: s2, t0-t3, u0-u3 ----
        movq      Tu7, mm7                ;spill u7
        pxor      mm7, mm7
        movq      mm2, [edi+CLINE0]
        punpcklwd mm1, mm0
        pmaddwd   mm1, BETA1              ;s2l
        punpckhwd mm7, mm0
        pmaddwd   mm7, BETA1              ;s2h
        psubw     mm5, mm6                ;u5 = s5-u6
        movq      mm0, [edi+CLINE2]
        paddw     mm4, mm5                ;-u4 = s4+u5
        paddd     mm1, CONSTBITS_RND      ;round s2l
        paddd     mm7, CONSTBITS_RND      ;round s2h
        psrad     mm1, CONSTBITS          ;descale s2l
        paddw     mm0, [edi+CLINE6]       ;r3=s3=t3 = line2+line6
        psrad     mm7, CONSTBITS          ;descale s2h
        movq      mm3, mm2                ;copy of line0
        packssdw  mm1, mm7                ;mm1 = s2
        psubw     mm2, [edi+CLINE4]       ;t1 = line0-line4
        psubw     mm1, mm0                ;t2 = s2-s3
        psraw     mm0, 1                  ;t3>>1
        psraw     mm2, 1                  ;t1>>1
        psraw     mm1, 1                  ;t2>>1
        paddw     mm3, [edi+CLINE4]       ;t0 = line0+line4
        movq      mm7, mm0                ;copy of t3>>1
        psraw     mm3, 1                  ;t0>>1
        paddw     mm0, mm3                ;u0 = t3+t0
        psubw     mm3, mm7                ;u3 = t0-t3
        movq      mm7, mm1                ;copy of t2
        paddw     mm1, mm2                ;u1 = t2+t1
        psubw     mm2, mm7                ;u2 = t1-t2

        ; ---- final butterflies ----
        ;mm0=u0 mm1=u1 mm2=u2 mm3=u3 mm4=-u4 mm5=u5 mm6=u6 Tu7=u7
        movq      mm7, mm3                ;copy of u3
        psubw     mm3, mm4                ;v3 = u3-(-u4)
        paddw     mm4, mm7                ;v4 = -u4+u3
        movq      mm7, mm2                ;copy of u2
        psubw     mm2, mm5                ;v5 = u2-u5
        paddw     mm5, mm7                ;v2 = u5+u2
        movq      mm7, mm1                ;copy of u1
        psubw     mm1, mm6                ;v6 = u1-u6
        paddw     mm6, mm7                ;v1 = u6+u1
        movq      Tv5, mm2                ;spill v5
        movq      mm7, mm0                ;copy of u0

        ; ---- transpose v0-v3 (tile 1) and store to CLINE0-CLINE3 ----
        movq      mm2, mm5                ;copy of v2
        punpckhwd mm5, mm3                ;v2/v3 interleaved, high words
        paddw     mm7, Tu7                ;v0 = u0+u7
        ;v0=mm7 v1=mm6 v2=mm5 v3=mm3 v4=mm4 v5=Tv5 v6=mm1 v7=mm0 (below)
        punpcklwd mm2, mm3                ;v2/v3 interleaved, low words
        movq      mm3, mm7                ;copy of v0
        punpckhwd mm7, mm6                ;v0/v1 interleaved, high words
        punpcklwd mm3, mm6                ;v0/v1 interleaved, low words
        movq      mm6, mm7
        psubw     mm0, Tu7                ;v7 = u0-u7
        punpckldq mm7, mm5                ;word 2 of v0..v3
        punpckhdq mm6, mm5                ;word 3 of v0..v3
        movq      mm5, mm3
        movq      [edi+CLINE2], mm7
        punpckldq mm3, mm2                ;word 0 of v0..v3
        movq      [edi+CLINE3], mm6
        punpckhdq mm5, mm2                ;word 1 of v0..v3
        movq      [edi+CLINE0], mm3

        ; ---- transpose v4-v7 (tile 2) and store to CLINE4-CLINE7 ----
        movq      mm6, mm1                ;copy of v6
        movq      [edi+CLINE1], mm5
        punpckhwd mm1, mm0                ;v6/v7 interleaved, high words
        movq      mm2, Tv5                ;reload v5
        punpcklwd mm6, mm0                ;v6/v7 interleaved, low words
        movq      mm7, mm4                ;copy of v4
        punpckhwd mm4, mm2                ;v4/v5 interleaved, high words
        punpcklwd mm7, mm2                ;v4/v5 interleaved, low words
        movq      mm2, mm4
        punpckldq mm4, mm1                ;word 2 of v4..v7
        punpckhdq mm2, mm1                ;word 3 of v4..v7
        movq      mm1, mm7
        movq      [edi+CLINE6], mm4
        punpckhdq mm1, mm6                ;word 1 of v4..v7
        movq      [edi+CLINE7], mm2
        punpckldq mm7, mm6                ;word 0 of v4..v7
        movq      [edi+CLINE5], mm1
        movq      [edi+CLINE4], mm7
|
|
cols_4_7:
;
; First pass over the right half of the block: the same data flow as the
; cols_0_3 section, applied to the second quadword of every line (CLINEn+8).
;
; Scaling differs from cols_0_3: here every multiplier product is rounded
; with CONSTBITS_RND and shifted by CONSTBITS, q5/q7/t0-t3 are used
; unshifted, and each of v0-v7 is shifted right by one only after its final
; add or subtract.
;
; v0-v7 are transposed as two 4x4 word tiles and written back over
; CLINE0+8 .. CLINE7+8.  Instruction order is the original schedule.
;
        ; ---- odd part: s4 and s6 ----
        pxor      mm4, mm4
        movq      mm0, [edi+CLINE5+8]
        pxor      mm5, mm5
        movq      mm1, [edi+CLINE1+8]
        pxor      mm2, mm2
        psubw     mm0, [edi+CLINE3+8]     ;q4=r4 = line5-line3
        pxor      mm3, mm3
        psubw     mm1, [edi+CLINE7+8]     ;q6=r6 = line1-line7
        punpcklwd mm4, mm0
        pmaddwd   mm4, NEGBETA2           ;-BETA2*r4 (low)
        punpckhwd mm5, mm0
        pmaddwd   mm5, NEGBETA2           ;-BETA2*r4 (high)
        psubw     mm0, mm1                ;r4-r6
        punpcklwd mm2, mm0
        pxor      mm6, mm6
        pmaddwd   mm2, BETA5              ;BETA5*(r4-r6) (low)
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA5              ;BETA5*(r4-r6) (high)
        punpcklwd mm6, mm1
        pmaddwd   mm6, BETA4              ;BETA4*r6 (low)
        pxor      mm7, mm7
        punpckhwd mm7, mm1
        paddd     mm4, mm2                ;s4l
        pmaddwd   mm7, BETA4              ;BETA4*r6 (high)
        paddd     mm5, mm3                ;s4h
        paddd     mm4, CONSTBITS_RND      ;round s4l
        psubd     mm6, mm2                ;s6l
        paddd     mm5, CONSTBITS_RND      ;round s4h
        psrad     mm4, CONSTBITS          ;descale s4l
        psubd     mm7, mm3                ;s6h
        psrad     mm5, CONSTBITS          ;descale s4h
        paddd     mm6, CONSTBITS_RND      ;round s6l
        packssdw  mm4, mm5                ;mm4 = s4
        paddd     mm7, CONSTBITS_RND      ;round s6h
        psrad     mm6, CONSTBITS          ;descale s6l
        movq      mm0, [edi+CLINE1+8]
        psrad     mm7, CONSTBITS          ;descale s6h

        ; ---- odd part: s5, u7, u6 ----
        paddw     mm0, [edi+CLINE7+8]     ;q5 = line1+line7
        packssdw  mm6, mm7                ;mm6 = s6
        movq      mm2, [edi+CLINE3+8]
        pxor      mm5, mm5
        paddw     mm2, [edi+CLINE5+8]     ;q7 = line3+line5
        movq      mm7, mm0                ;copy of q5
        psubw     mm0, mm2                ;r5 = q5-q7
        punpcklwd mm5, mm0
        pxor      mm3, mm3
        pmaddwd   mm5, BETA3              ;s5l
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA3              ;s5h
        movq      mm0, [edi+CLINE2+8]
        paddw     mm7, mm2                ;r7=s7=u7 = q5+q7
        paddd     mm5, CONSTBITS_RND      ;round s5l
        psubw     mm6, mm7                ;u6 = s6-u7
        paddd     mm3, CONSTBITS_RND      ;round s5h
        psrad     mm5, CONSTBITS          ;descale s5l
        psubw     mm0, [edi+CLINE6+8]     ;r2 = line2-line6
        psrad     mm3, CONSTBITS          ;descale s5h
        packssdw  mm5, mm3                ;mm5 = s5
        pxor      mm1, mm1

        ; ---- even part: s2, t0-t3, u0-u3 ----
        movq      Tu7, mm7                ;spill u7
        pxor      mm7, mm7
        movq      mm2, [edi+CLINE0+8]
        punpcklwd mm1, mm0
        pmaddwd   mm1, BETA1              ;s2l
        punpckhwd mm7, mm0
        pmaddwd   mm7, BETA1              ;s2h
        psubw     mm5, mm6                ;u5 = s5-u6
        movq      mm0, [edi+CLINE2+8]
        paddw     mm4, mm5                ;-u4 = s4+u5
        paddd     mm1, CONSTBITS_RND      ;round s2l
        paddd     mm7, CONSTBITS_RND      ;round s2h
        psrad     mm1, CONSTBITS          ;descale s2l
        paddw     mm0, [edi+CLINE6+8]     ;r3=s3=t3 = line2+line6
        psrad     mm7, CONSTBITS          ;descale s2h
        movq      mm3, mm2                ;copy of line0
        packssdw  mm1, mm7                ;mm1 = s2
        psubw     mm2, [edi+CLINE4+8]     ;t1 = line0-line4
        psubw     mm1, mm0                ;t2 = s2-s3
        paddw     mm3, [edi+CLINE4+8]     ;t0 = line0+line4
        movq      mm7, mm0                ;copy of t3
        paddw     mm0, mm3                ;u0 = t3+t0
        psubw     mm3, mm7                ;u3 = t0-t3
        movq      mm7, mm1                ;copy of t2
        paddw     mm1, mm2                ;u1 = t2+t1
        psubw     mm2, mm7                ;u2 = t1-t2

        ; ---- final butterflies, each result halved ----
        ;mm0=u0 mm1=u1 mm2=u2 mm3=u3 mm4=-u4 mm5=u5 mm6=u6 Tu7=u7
        movq      mm7, mm3                ;copy of u3
        psubw     mm3, mm4                ;u3-(-u4)
        paddw     mm4, mm7                ;-u4+u3
        psraw     mm3, 1                  ;v3
        movq      mm7, mm2                ;copy of u2
        psraw     mm4, 1                  ;v4
        psubw     mm2, mm5                ;u2-u5
        psraw     mm2, 1                  ;v5
        paddw     mm5, mm7                ;u5+u2
        psraw     mm5, 1                  ;v2
        movq      mm7, mm1                ;copy of u1
        psubw     mm1, mm6                ;u1-u6
        paddw     mm6, mm7                ;u6+u1
        movq      Tv5, mm2                ;spill v5
        psraw     mm1, 1                  ;v6
        psraw     mm6, 1                  ;v1
        movq      mm7, mm0                ;copy of u0

        ; ---- transpose v0-v3 (tile 1) and store to CLINE0+8..CLINE3+8 ----
        movq      mm2, mm5                ;copy of v2
        punpckhwd mm5, mm3                ;v2/v3 interleaved, high words
        paddw     mm7, Tu7                ;u0+u7
        psraw     mm7, 1                  ;v0
        ;v0=mm7 v1=mm6 v2=mm5 v3=mm3 v4=mm4 v5=Tv5 v6=mm1 v7=mm0 (below)
        punpcklwd mm2, mm3                ;v2/v3 interleaved, low words
        movq      mm3, mm7                ;copy of v0
        punpckhwd mm7, mm6                ;v0/v1 interleaved, high words
        punpcklwd mm3, mm6                ;v0/v1 interleaved, low words
        movq      mm6, mm7
        psubw     mm0, Tu7                ;u0-u7
        punpckldq mm7, mm5                ;word 2 of v0..v3
        psraw     mm0, 1                  ;v7
        punpckhdq mm6, mm5                ;word 3 of v0..v3
        movq      mm5, mm3
        movq      [edi+CLINE2+8], mm7
        punpckldq mm3, mm2                ;word 0 of v0..v3
        movq      [edi+CLINE3+8], mm6
        punpckhdq mm5, mm2                ;word 1 of v0..v3
        movq      [edi+CLINE0+8], mm3

        ; ---- transpose v4-v7 (tile 2) and store to CLINE4+8..CLINE7+8 ----
        movq      mm6, mm1                ;copy of v6
        movq      [edi+CLINE1+8], mm5
        punpckhwd mm1, mm0                ;v6/v7 interleaved, high words
        movq      mm2, Tv5                ;reload v5
        punpcklwd mm6, mm0                ;v6/v7 interleaved, low words
        movq      mm7, mm4                ;copy of v4
        punpckhwd mm4, mm2                ;v4/v5 interleaved, high words
        punpcklwd mm7, mm2                ;v4/v5 interleaved, low words
        movq      mm2, mm4
        punpckldq mm4, mm1                ;word 2 of v4..v7
        punpckhdq mm2, mm1                ;word 3 of v4..v7
        movq      mm1, mm7
        movq      [edi+CLINE6+8], mm4
        punpckhdq mm1, mm6                ;word 1 of v4..v7
        movq      [edi+CLINE7+8], mm2
        punpckldq mm7, mm6                ;word 0 of v4..v7
        movq      [edi+CLINE5+8], mm1
        movq      [edi+CLINE4+8], mm7
|
|
rows_0_3:
;
; Second pass over the upper half of the buffer (label rows_0_3).
;
; RLINEn is the byte offset, relative to edi (= pBuf+64), of input quadword n
; of this pass.  RLINE0-RLINE3 are the first quadwords of the four 16-byte
; lines at pBuf+0/16/32/48 and RLINE4-RLINE7 are the second quadwords of the
; same four lines, so the eight inputs are the quadwords written there by the
; first pass.
;
; Scaling in this section: s4, s5 and s6 are rounded with CONSTBITS_P_1_RND
; and shifted by CONSTBITS+1; s7 is (r7+1)>>1; u7, -u4, u5 and u6 are then
; shifted left by one before the final butterflies.  s2 is shifted by
; CONSTBITS and t0-t3 are used unshifted.
;
; v0-v7 are transposed as two 4x4 word tiles and written back over
; RLINE0-RLINE7.  Instruction order is the original schedule.
;
RLINE0  = 0  - 64
RLINE1  = 16 - 64
RLINE2  = 32 - 64
RLINE3  = 48 - 64
RLINE4  = 8  - 64
RLINE5  = 24 - 64
RLINE6  = 40 - 64
RLINE7  = 56 - 64

        ; ---- odd part: s4 and s6 ----
        pxor      mm4, mm4
        movq      mm0, [edi+RLINE5]
        pxor      mm5, mm5
        movq      mm1, [edi+RLINE1]
        pxor      mm2, mm2
        psubw     mm0, [edi+RLINE3]       ;q4=r4 = in5-in3
        pxor      mm3, mm3
        psubw     mm1, [edi+RLINE7]       ;q6=r6 = in1-in7
        punpcklwd mm4, mm0
        pmaddwd   mm4, NEGBETA2           ;-BETA2*r4 (low)
        punpckhwd mm5, mm0
        pmaddwd   mm5, NEGBETA2           ;-BETA2*r4 (high)
        psubw     mm0, mm1                ;r4-r6
        punpcklwd mm2, mm0
        pxor      mm6, mm6
        pmaddwd   mm2, BETA5              ;BETA5*(r4-r6) (low)
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA5              ;BETA5*(r4-r6) (high)
        punpcklwd mm6, mm1
        pmaddwd   mm6, BETA4              ;BETA4*r6 (low)
        pxor      mm7, mm7
        punpckhwd mm7, mm1
        paddd     mm4, mm2                ;s4l
        pmaddwd   mm7, BETA4              ;BETA4*r6 (high)
        paddd     mm5, mm3                ;s4h
        paddd     mm4, CONSTBITS_P_1_RND  ;round s4l
        psubd     mm6, mm2                ;s6l
        paddd     mm5, CONSTBITS_P_1_RND  ;round s4h
        psrad     mm4, CONSTBITS+1        ;descale s4l
        psubd     mm7, mm3                ;s6h
        psrad     mm5, CONSTBITS+1        ;descale s4h
        paddd     mm6, CONSTBITS_P_1_RND  ;round s6l
        packssdw  mm4, mm5                ;mm4 = s4
        paddd     mm7, CONSTBITS_P_1_RND  ;round s6h
        psrad     mm6, CONSTBITS+1        ;descale s6l
        movq      mm0, [edi+RLINE1]
        psrad     mm7, CONSTBITS+1        ;descale s6h

        ; ---- odd part: s5, s7, u6 ----
        paddw     mm0, [edi+RLINE7]       ;q5 = in1+in7
        packssdw  mm6, mm7                ;mm6 = s6
        movq      mm2, [edi+RLINE3]
        pxor      mm5, mm5
        paddw     mm2, [edi+RLINE5]       ;q7 = in3+in5
        movq      mm7, mm0                ;copy of q5
        psubw     mm0, mm2                ;r5 = q5-q7
        paddw     mm7, mm2                ;r7 = q5+q7
        punpcklwd mm5, mm0
        pxor      mm3, mm3
        pmaddwd   mm5, BETA3              ;s5l
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA3              ;s5h
        paddw     mm7, ONE                ;r7+1: rounding for the shift below
        movq      mm0, [edi+RLINE2]
        psraw     mm7, 1                  ;s7 = (r7+1)>>1
        paddd     mm5, CONSTBITS_P_1_RND  ;round s5l
        psubw     mm6, mm7                ;u6 = s6-s7
        paddd     mm3, CONSTBITS_P_1_RND  ;round s5h
        psrad     mm5, CONSTBITS+1        ;descale s5l
        psubw     mm0, [edi+RLINE6]       ;r2 = in2-in6
        psrad     mm3, CONSTBITS+1        ;descale s5h
        packssdw  mm5, mm3                ;mm5 = s5
        pxor      mm1, mm1

        ; ---- even part: s2, t0-t3, u0-u3 ----
        psllw     mm7, 1                  ;u7<<1
        movq      Tu7, mm7                ;spill u7<<1
        pxor      mm7, mm7
        movq      mm2, [edi+RLINE0]
        punpcklwd mm1, mm0
        pmaddwd   mm1, BETA1              ;s2l
        punpckhwd mm7, mm0
        pmaddwd   mm7, BETA1              ;s2h
        psubw     mm5, mm6                ;u5 = s5-u6
        movq      mm0, [edi+RLINE2]
        paddw     mm4, mm5                ;-u4 = s4+u5
        paddd     mm1, CONSTBITS_RND      ;round s2l
        paddd     mm7, CONSTBITS_RND      ;round s2h
        psrad     mm1, CONSTBITS          ;descale s2l
        paddw     mm0, [edi+RLINE6]       ;r3=s3=t3 = in2+in6
        psrad     mm7, CONSTBITS          ;descale s2h
        movq      mm3, mm2                ;copy of in0
        packssdw  mm1, mm7                ;mm1 = s2
        psubw     mm2, [edi+RLINE4]       ;t1 = in0-in4
        psubw     mm1, mm0                ;t2 = s2-s3
        paddw     mm3, [edi+RLINE4]       ;t0 = in0+in4
        movq      mm7, mm0                ;copy of t3
        paddw     mm0, mm3                ;u0 = t3+t0
        psubw     mm3, mm7                ;u3 = t0-t3
        movq      mm7, mm1                ;copy of t2
        paddw     mm1, mm2                ;u1 = t2+t1
        psubw     mm2, mm7                ;u2 = t1-t2

        ; ---- final butterflies (odd terms doubled first) ----
        ;mm0=u0 mm1=u1 mm2=u2 mm3=u3 mm4=-u4 mm5=u5 mm6=u6 Tu7=u7<<1
        psllw     mm4, 1                  ;-u4<<1
        movq      mm7, mm3                ;copy of u3
        psubw     mm3, mm4                ;v3 = u3-(-u4<<1)
        paddw     mm4, mm7                ;v4 = (-u4<<1)+u3
        psllw     mm5, 1                  ;u5<<1
        movq      mm7, mm2                ;copy of u2
        psubw     mm2, mm5                ;v5 = u2-(u5<<1)
        paddw     mm5, mm7                ;v2 = (u5<<1)+u2
        psllw     mm6, 1                  ;u6<<1
        movq      mm7, mm1                ;copy of u1
        psubw     mm1, mm6                ;v6 = u1-(u6<<1)
        paddw     mm6, mm7                ;v1 = (u6<<1)+u1
        movq      Tv5, mm2                ;spill v5
        movq      mm7, mm0                ;copy of u0

        ; ---- transpose v0-v3 (tile 1) and store to RLINE0-RLINE3 ----
        movq      mm2, mm5                ;copy of v2
        punpckhwd mm5, mm3                ;v2/v3 interleaved, high words
        paddw     mm7, Tu7                ;v0 = u0+(u7<<1)
        ;v0=mm7 v1=mm6 v2=mm5 v3=mm3 v4=mm4 v5=Tv5 v6=mm1 v7=mm0 (below)
        punpcklwd mm2, mm3                ;v2/v3 interleaved, low words
        movq      mm3, mm7                ;copy of v0
        punpckhwd mm7, mm6                ;v0/v1 interleaved, high words
        punpcklwd mm3, mm6                ;v0/v1 interleaved, low words
        movq      mm6, mm7
        psubw     mm0, Tu7                ;v7 = u0-(u7<<1)
        punpckldq mm7, mm5                ;word 2 of v0..v3
        punpckhdq mm6, mm5                ;word 3 of v0..v3
        movq      mm5, mm3
        movq      [edi+RLINE2], mm7
        punpckldq mm3, mm2                ;word 0 of v0..v3
        movq      [edi+RLINE3], mm6
        punpckhdq mm5, mm2                ;word 1 of v0..v3
        movq      [edi+RLINE0], mm3

        ; ---- transpose v4-v7 (tile 2) and store to RLINE4-RLINE7 ----
        movq      mm6, mm1                ;copy of v6
        movq      [edi+RLINE1], mm5
        punpckhwd mm1, mm0                ;v6/v7 interleaved, high words
        movq      mm2, Tv5                ;reload v5
        punpcklwd mm6, mm0                ;v6/v7 interleaved, low words
        movq      mm7, mm4                ;copy of v4
        punpckhwd mm4, mm2                ;v4/v5 interleaved, high words
        punpcklwd mm7, mm2                ;v4/v5 interleaved, low words
        movq      mm2, mm4
        punpckldq mm4, mm1                ;word 2 of v4..v7
        punpckhdq mm2, mm1                ;word 3 of v4..v7
        movq      mm1, mm7
        movq      [edi+RLINE6], mm4
        punpckhdq mm1, mm6                ;word 1 of v4..v7
        movq      [edi+RLINE7], mm2
        punpckldq mm7, mm6                ;word 0 of v4..v7
        movq      [edi+RLINE5], mm1
        movq      [edi+RLINE4], mm7
|
|
rows_4_7:
;
; Second pass over the lower half of the buffer: the same instruction
; sequence and scaling as the rows_0_3 section, with every memory operand
; displaced by 64 bytes (RLINEn+64), i.e. the four 16-byte lines starting at
; edi itself (= pBuf+64).
;
; v0-v7 are transposed as two 4x4 word tiles and written back over
; RLINE0+64 .. RLINE7+64.  Instruction order is the original schedule.
;
        ; ---- odd part: s4 and s6 ----
        pxor      mm4, mm4
        movq      mm0, [edi+RLINE5+64]
        pxor      mm5, mm5
        movq      mm1, [edi+RLINE1+64]
        pxor      mm2, mm2
        psubw     mm0, [edi+RLINE3+64]    ;q4=r4 = in5-in3
        pxor      mm3, mm3
        psubw     mm1, [edi+RLINE7+64]    ;q6=r6 = in1-in7
        punpcklwd mm4, mm0
        pmaddwd   mm4, NEGBETA2           ;-BETA2*r4 (low)
        punpckhwd mm5, mm0
        pmaddwd   mm5, NEGBETA2           ;-BETA2*r4 (high)
        psubw     mm0, mm1                ;r4-r6
        punpcklwd mm2, mm0
        pxor      mm6, mm6
        pmaddwd   mm2, BETA5              ;BETA5*(r4-r6) (low)
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA5              ;BETA5*(r4-r6) (high)
        punpcklwd mm6, mm1
        pmaddwd   mm6, BETA4              ;BETA4*r6 (low)
        pxor      mm7, mm7
        punpckhwd mm7, mm1
        paddd     mm4, mm2                ;s4l
        pmaddwd   mm7, BETA4              ;BETA4*r6 (high)
        paddd     mm5, mm3                ;s4h
        paddd     mm4, CONSTBITS_P_1_RND  ;round s4l
        psubd     mm6, mm2                ;s6l
        paddd     mm5, CONSTBITS_P_1_RND  ;round s4h
        psrad     mm4, CONSTBITS+1        ;descale s4l
        psubd     mm7, mm3                ;s6h
        psrad     mm5, CONSTBITS+1        ;descale s4h
        paddd     mm6, CONSTBITS_P_1_RND  ;round s6l
        packssdw  mm4, mm5                ;mm4 = s4
        paddd     mm7, CONSTBITS_P_1_RND  ;round s6h
        psrad     mm6, CONSTBITS+1        ;descale s6l
        movq      mm0, [edi+RLINE1+64]
        psrad     mm7, CONSTBITS+1        ;descale s6h

        ; ---- odd part: s5, s7, u6 ----
        paddw     mm0, [edi+RLINE7+64]    ;q5 = in1+in7
        packssdw  mm6, mm7                ;mm6 = s6
        movq      mm2, [edi+RLINE3+64]
        pxor      mm5, mm5
        paddw     mm2, [edi+RLINE5+64]    ;q7 = in3+in5
        movq      mm7, mm0                ;copy of q5
        psubw     mm0, mm2                ;r5 = q5-q7
        paddw     mm7, mm2                ;r7 = q5+q7
        punpcklwd mm5, mm0
        pxor      mm3, mm3
        pmaddwd   mm5, BETA3              ;s5l
        punpckhwd mm3, mm0
        pmaddwd   mm3, BETA3              ;s5h
        paddw     mm7, ONE                ;r7+1: rounding for the shift below
        movq      mm0, [edi+RLINE2+64]
        psraw     mm7, 1                  ;s7 = (r7+1)>>1
        paddd     mm5, CONSTBITS_P_1_RND  ;round s5l
        psubw     mm6, mm7                ;u6 = s6-s7
        paddd     mm3, CONSTBITS_P_1_RND  ;round s5h
        psrad     mm5, CONSTBITS+1        ;descale s5l
        psubw     mm0, [edi+RLINE6+64]    ;r2 = in2-in6
        psrad     mm3, CONSTBITS+1        ;descale s5h
        packssdw  mm5, mm3                ;mm5 = s5
        pxor      mm1, mm1

        ; ---- even part: s2, t0-t3, u0-u3 ----
        psllw     mm7, 1                  ;u7<<1
        movq      Tu7, mm7                ;spill u7<<1
        pxor      mm7, mm7
        movq      mm2, [edi+RLINE0+64]
        punpcklwd mm1, mm0
        pmaddwd   mm1, BETA1              ;s2l
        punpckhwd mm7, mm0
        pmaddwd   mm7, BETA1              ;s2h
        psubw     mm5, mm6                ;u5 = s5-u6
        movq      mm0, [edi+RLINE2+64]
        paddw     mm4, mm5                ;-u4 = s4+u5
        paddd     mm1, CONSTBITS_RND      ;round s2l
        paddd     mm7, CONSTBITS_RND      ;round s2h
        psrad     mm1, CONSTBITS          ;descale s2l
        paddw     mm0, [edi+RLINE6+64]    ;r3=s3=t3 = in2+in6
        psrad     mm7, CONSTBITS          ;descale s2h
        movq      mm3, mm2                ;copy of in0
        packssdw  mm1, mm7                ;mm1 = s2
        psubw     mm2, [edi+RLINE4+64]    ;t1 = in0-in4
        psubw     mm1, mm0                ;t2 = s2-s3
        paddw     mm3, [edi+RLINE4+64]    ;t0 = in0+in4
        movq      mm7, mm0                ;copy of t3
        paddw     mm0, mm3                ;u0 = t3+t0
        psubw     mm3, mm7                ;u3 = t0-t3
        movq      mm7, mm1                ;copy of t2
        paddw     mm1, mm2                ;u1 = t2+t1
        psubw     mm2, mm7                ;u2 = t1-t2

        ; ---- final butterflies (odd terms doubled first) ----
        ;mm0=u0 mm1=u1 mm2=u2 mm3=u3 mm4=-u4 mm5=u5 mm6=u6 Tu7=u7<<1
        psllw     mm4, 1                  ;-u4<<1
        movq      mm7, mm3                ;copy of u3
        psubw     mm3, mm4                ;v3 = u3-(-u4<<1)
        paddw     mm4, mm7                ;v4 = (-u4<<1)+u3
        psllw     mm5, 1                  ;u5<<1
        movq      mm7, mm2                ;copy of u2
        psubw     mm2, mm5                ;v5 = u2-(u5<<1)
        paddw     mm5, mm7                ;v2 = (u5<<1)+u2
        psllw     mm6, 1                  ;u6<<1
        movq      mm7, mm1                ;copy of u1
        psubw     mm1, mm6                ;v6 = u1-(u6<<1)
        paddw     mm6, mm7                ;v1 = (u6<<1)+u1
        movq      Tv5, mm2                ;spill v5
        movq      mm7, mm0                ;copy of u0

        ; ---- transpose v0-v3 (tile 1), store to RLINE0+64..RLINE3+64 ----
        movq      mm2, mm5                ;copy of v2
        punpckhwd mm5, mm3                ;v2/v3 interleaved, high words
        paddw     mm7, Tu7                ;v0 = u0+(u7<<1)
        ;v0=mm7 v1=mm6 v2=mm5 v3=mm3 v4=mm4 v5=Tv5 v6=mm1 v7=mm0 (below)
        punpcklwd mm2, mm3                ;v2/v3 interleaved, low words
        movq      mm3, mm7                ;copy of v0
        punpckhwd mm7, mm6                ;v0/v1 interleaved, high words
        punpcklwd mm3, mm6                ;v0/v1 interleaved, low words
        movq      mm6, mm7
        psubw     mm0, Tu7                ;v7 = u0-(u7<<1)
        punpckldq mm7, mm5                ;word 2 of v0..v3
        punpckhdq mm6, mm5                ;word 3 of v0..v3
        movq      mm5, mm3
        movq      [edi+RLINE2+64], mm7
        punpckldq mm3, mm2                ;word 0 of v0..v3
        movq      [edi+RLINE3+64], mm6
        punpckhdq mm5, mm2                ;word 1 of v0..v3
        movq      [edi+RLINE0+64], mm3

        ; ---- transpose v4-v7 (tile 2), store to RLINE4+64..RLINE7+64 ----
        movq      mm6, mm1                ;copy of v6
        movq      [edi+RLINE1+64], mm5
        punpckhwd mm1, mm0                ;v6/v7 interleaved, high words
        movq      mm2, Tv5                ;reload v5
        punpcklwd mm6, mm0                ;v6/v7 interleaved, low words
        movq      mm7, mm4                ;copy of v4
        punpckhwd mm4, mm2                ;v4/v5 interleaved, high words
        punpcklwd mm7, mm2                ;v4/v5 interleaved, low words
        movq      mm2, mm4
        punpckldq mm4, mm1                ;word 2 of v4..v7
        punpckhdq mm2, mm1                ;word 3 of v4..v7
        movq      mm1, mm7
        movq      [edi+RLINE6+64], mm4
        punpckhdq mm1, mm6                ;word 1 of v4..v7
        movq      [edi+RLINE7+64], mm2
        punpckldq mm7, mm6                ;word 0 of v4..v7
        movq      [edi+RLINE5+64], mm1
        movq      [edi+RLINE4+64], mm7
|
|
IDCT_Done:
;
; Epilogue: discard the aligned local frame by reloading the stack pointer
; that was stashed in it, restore the two saved registers, and return while
; popping the single stack argument (pBuf).
; No EMMS is executed before returning.
;
        mov       esp, DWORD PTR StashESP ;esp as it was after the two pushes
        pop       edi
        pop       esi
        ret       4                       ;return and drop pBuf (4 bytes)

@MMX_DecodeBlock_IDCT@12 ENDP
|
|
|
|
MMXCODE1 ENDS
|
|
|
|
END
|