|
|
;***************************************************************************/ ;* ;* INTEL Corporation Proprietary Information ;* ;* ;* Copyright (c) 1996 Intel Corporation. ;* All rights reserved. ;* ;***************************************************************************/ ; AUTHOR: Kumar Balasubramanian ;***************************************************************************/
;; MMX version of the "integer fast mode" within IJG decompressor code.
; Assembler set-up. One directive per line: when these share a line, the
; first ';' turns everything after it (including the MMWORD definition and
; the .list directive) into a comment.
.nolist
include iammx.inc                       ; IAMMX emulator macros (MMX mnemonics)
MMWORD  TEXTEQU <DWORD>                 ; operand-size keyword used with "mmword ptr"
.list

.586
.model flat
_DATA SEGMENT PARA PUBLIC USE32 'DATA'

; Constants that are not referenced by the code visible in this module.
; They are kept so that the segment layout is unchanged.
x0005000200010001   DQ 0005000200010001h
x0040000000000000   DQ 40000000000000h

; Rotation factors used by the IDCT butterflies, stored as 8.8 fixed point
; (value = factor * 256). Each constant is laid out as the four words
; {c, 0, c, 0} so that PMADDWD against {a, 0, b, 0} yields the two dword
; products a*c and b*c (see the PackMulW macro).
; The label names spell out 16-bit Q14-style encodings and do NOT match
; the values actually stored here.
x5a825a825a825a82   DW 16ah,   0h, 16ah,   0h   ;  362 =  1.414 * 256
x539f539f539f539f   DW 0fd63h, 0h, 0fd63h, 0h   ; -669 = -2.613 * 256 (stored negated)
x4546454645464546   DW 115h,   0h, 115h,   0h   ;  277 =  1.082 * 256
x61f861f861f861f8   DW 1d9h,   0h, 1d9h,   0h   ;  473 =  1.847 * 256

const_mask          DQ 3ff03ff03ff03ffh         ; not referenced by the code visible here
const_zero          DQ 0                        ; all-zero qword interleaved in by PackMulW

; Scratch qwords shared by PackMulW and the register save/restore code
; around it. They are static, so the routine is not reentrant.
scratch1            DQ 0                        ; PackMulW operand in, result out
scratch3            DQ 0                        ; PackMulW multiplier {c, 0, c, 0}
scratch5            DQ 0                        ; temporary home for mm0
scratch7            DQ 0                        ; temporary home for mm1

; for debug only
x0                  DQ 0

; 64 words of 16384 (same bytes as eight rows of eight explicit values);
; not referenced by the code visible in this module.
preSC               DW 64 DUP (16384)

_DATA ENDS
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
;---------------------------------------------------------------------------
; PackMulW - multiply four packed signed words by one 8.8 fixed-point factor.
;
; In:    scratch1 = four signed words a0..a3
;        scratch3 = multiplier laid out as the words {c, 0, c, 0}
;        const_zero = 0
; Out:   scratch1 = four words, word i = low 16 bits of ((a_i * c) >> 8),
;        where the shift is arithmetic on the 32-bit product
; Clobb: mm0, mm1 (callers that need them must save them first)
;---------------------------------------------------------------------------
PackMulW MACRO
        ; low pair: {a0, 0, a1, 0} pmaddwd {c, 0, c, 0} gives dwords a0*c, a1*c
        movq        mm0, mmword ptr scratch1
        punpcklwd   mm0, mmword ptr const_zero
        pmaddwd     mm0, mmword ptr scratch3
        psrad       mm0, 8                      ; drop the 8 fraction bits

        ; high pair: same for a2, a3
        movq        mm1, mmword ptr scratch1
        punpckhwd   mm1, mmword ptr const_zero
        pmaddwd     mm1, mmword ptr scratch3
        psrad       mm1, 8

        ; repack the low word of each of the four dwords into one qword
        movq        mmword ptr scratch1, mm1    ; park the high-pair dwords
        movq        mm1, mm0
        punpcklwd   mm0, mmword ptr scratch1    ; {r0.lo, r2.lo, r0.hi, r2.hi}
        punpckhwd   mm1, mmword ptr scratch1    ; {r1.lo, r3.lo, r1.hi, r3.hi}
        punpcklwd   mm0, mm1                    ; {r0.lo, r1.lo, r2.lo, r3.lo}
        movq        mmword ptr scratch1, mm0
ENDM
;===========================================================================
; _idct8x8aan - dequantise one 8x8 coefficient block, run a two-pass
; fixed-point inverse DCT on it and store the result as 8x8 bytes.
;
; All six arguments are read from the stack relative to ESP. The offsets
; below already include the seven registers pushed by USES (28 bytes) and
; the return address (4 bytes). The routine returns with a plain RET, so
; the caller removes the arguments.
;
;   [esp+32]  coefficient block  : 64 words, 16 bytes per row, read only
;   [esp+36]  workspace          : 64 words, same layout, overwritten
;   [esp+40]  quantisation table : 64 words, multiplied element-wise (PMULLW)
;                                  into the coefficients
;   [esp+44]  output_buf         : array of 8 dword row pointers
;   [esp+48]  output_col         : byte offset added to every row pointer
;   [esp+52]  range              : byte table, indexed with a 10-bit value
;
; Block layout: qword 2*r holds columns 0..3 of row r and qword 2*r+1 holds
; columns 4..7, so [base+8*k] with even k walks the left half of the block
; and odd k the right half. The V-numbers in the comments are the
; intermediate names used by the original author; for loaded values Vk is
; simply qword k of the block.
;
; Processing:
;   1. 1-D pass over the dequantised input, four lanes at a time,
;      written to the workspace
;   2. 8x8 transpose of the workspace (done as four 4x4 quadrants)
;   3. the same 1-D pass over the workspace
;   4. transpose again, shift every word right by 5, mask to 10 bits,
;      look the value up in the range table and store one byte per sample
;
; Arithmetic is 16-bit and wraps (PADDW/PSUBW/PMULLW).
; Registers: every general register used is saved and restored by USES.
; mm0..mm7 are clobbered; EMMS is executed before returning.
; NOT reentrant: PackMulW and its callers go through the static scratch
; qwords scratch1, scratch3, scratch5 and scratch7.
;===========================================================================

IDCT_ARG_COEF           EQU 32          ; source coefficients
IDCT_ARG_WORKSPACE      EQU 36          ; temporary results
IDCT_ARG_QUANT          EQU 40          ; quantisation factors
IDCT_ARG_OUTPUT_BUF     EQU 44          ; array of output row pointers
IDCT_ARG_OUTPUT_COL     EQU 48          ; column offset inside each row
IDCT_ARG_RANGE          EQU 52          ; range-limit lookup table

;---------------------------------------------------------------------------
; IdctScaleW reg, factor
; reg = PackMulW(reg, factor). mm0 and mm1 are preserved around the call
; through scratch5 and scratch7, except for whichever of them is reg itself.
;---------------------------------------------------------------------------
IdctScaleW MACRO reg:REQ, factor:REQ
        movq    mmword ptr scratch1, reg
        movq    reg, mmword ptr factor
        movq    mmword ptr scratch3, reg
    IFDIFI <reg>, <mm0>
        movq    mmword ptr scratch5, mm0
    ENDIF
    IFDIFI <reg>, <mm1>
        movq    mmword ptr scratch7, mm1
    ENDIF
        PackMulW
        movq    reg, mmword ptr scratch1
    IFDIFI <reg>, <mm0>
        movq    mm0, mmword ptr scratch5
    ENDIF
    IFDIFI <reg>, <mm1>
        movq    mm1, mmword ptr scratch7
    ENDIF
ENDM

;---------------------------------------------------------------------------
; IdctLoad reg, idx, dequant
; Load qword idx of the current input block into reg.
; dequant = 1: read from the coefficient block (ebx) and multiply by the
;              matching quantisation factors (edi).
; dequant = 0: read from the workspace (esi) unchanged.
;---------------------------------------------------------------------------
IdctLoad MACRO reg:REQ, idx:REQ, dequant:REQ
    IF dequant
        movq    reg, mmword ptr [ebx+8*idx]
        pmullw  reg, mmword ptr [edi+8*idx]
    ELSE
        movq    reg, mmword ptr [esi+8*idx]
    ENDIF
ENDM

;---------------------------------------------------------------------------
; IdctRowPtrs first
; Point ebp, ebx, ecx, edx at four consecutive output rows. first is the
; byte offset of the first of the four pointers inside output_buf
; (0 for rows 0..3, 16 for rows 4..7). output_col is added to each.
;---------------------------------------------------------------------------
IdctRowPtrs MACRO first:REQ
        mov     ebp, [esp+IDCT_ARG_OUTPUT_BUF]
        mov     ebx, [ebp+first+4]
        mov     ecx, [ebp+first+8]
        mov     edx, [ebp+first+12]
        mov     ebp, [ebp+first]
        add     ebp, [esp+IDCT_ARG_OUTPUT_COL]
        add     ebx, [esp+IDCT_ARG_OUTPUT_COL]
        add     ecx, [esp+IDCT_ARG_OUTPUT_COL]
        add     edx, [esp+IDCT_ARG_OUTPUT_COL]
ENDM

;---------------------------------------------------------------------------
; IdctStore4 mmreg, rowptr, col
; Descale the four words of mmreg (logical shift right by 5), map each one
; through the range table at edi using its low 10 bits as index, and store
; the four bytes at rowptr+col .. rowptr+col+3.
; Clobbers eax and mmreg.
;---------------------------------------------------------------------------
IdctStore4 MACRO mmreg:REQ, rowptr:REQ, col:REQ
        psrlw   mmreg, 5
        movd    eax, mmreg
        and     eax, 03ffh
        mov     al, byte ptr [edi][eax]
        mov     byte ptr [rowptr+col], al

        psrlq   mmreg, 16
        movd    eax, mmreg
        and     eax, 03ffh
        mov     al, byte ptr [edi][eax]
        mov     byte ptr [rowptr+col+1], al

        psrlq   mmreg, 16
        movd    eax, mmreg
        and     eax, 03ffh
        mov     al, byte ptr [edi][eax]
        mov     byte ptr [rowptr+col+2], al

        psrlq   mmreg, 16
        movd    eax, mmreg
        and     eax, 03ffh
        mov     al, byte ptr [edi][eax]
        mov     byte ptr [rowptr+col+3], al
ENDM

;---------------------------------------------------------------------------
; Idct1DPass dequant
; One 1-D transform down the eight rows of the block, first for the left
; half (even qwords), then for the right half (odd qwords). Input comes
; from IdctLoad, results go to the workspace at esi.
; On exit rows 0..7 of the left half and rows 0..3 of the right half are
; stored. Rows 4..7 of the right half are NOT stored; they are left in
;   mm5 = row 4, mm6 = row 5, mm3 = row 6, mm1 = row 7
; for the transpose that follows.
;---------------------------------------------------------------------------
Idct1DPass MACRO dequant:REQ
        ; ---- left half, even part: V0, V4, V8, V12 give V22..V25 ----
        IdctLoad mm0, 12, dequant               ; V12 (row 6)
        IdctLoad mm1, 4, dequant                ; V4  (row 2)
        IdctLoad mm3, 0, dequant                ; V0  (row 0)
        movq    mm2, mm1                        ; duplicate V4
        IdctLoad mm5, 8, dequant                ; V8  (row 4)
        psubw   mm1, mm0                        ; V16 = V4 - V12
        IdctScaleW mm1, x5a825a825a825a82       ; V18 = V16 * 1.414
        paddw   mm2, mm0                        ; V17 = V4 + V12
        movq    mm0, mm2                        ; duplicate V17
        movq    mm4, mm3                        ; duplicate V0
        paddw   mm3, mm5                        ; V19 = V0 + V8
        psubw   mm4, mm5                        ; V20 = V0 - V8
        movq    mm6, mm3                        ; duplicate V19
        psubw   mm1, mm0                        ; V21 = V18 - V17
        paddw   mm3, mm2                        ; V22 = V19 + V17
        movq    mm5, mm1                        ; duplicate V21
        paddw   mm1, mm4                        ; V23 = V20 + V21
        movq    mmword ptr [esi+8*4], mm3       ; park V22
        psubw   mm4, mm5                        ; V24 = V20 - V21
        movq    mmword ptr [esi+8*12], mm1      ; park V23
        psubw   mm6, mm2                        ; V25 = V19 - V17, stays in mm6
        movq    mmword ptr [esi+8*0], mm4       ; park V24

        ; ---- left half, odd part: V2, V6, V10, V14 give V31, V39, V40, V41 ----
        IdctLoad mm7, 10, dequant               ; V10 (row 5)
        IdctLoad mm0, 6, dequant                ; V6  (row 3)
        movq    mm3, mm7                        ; duplicate V10
        IdctLoad mm5, 2, dequant                ; V2  (row 1)
        psubw   mm7, mm0                        ; V26 = V10 - V6
        IdctLoad mm4, 14, dequant               ; V14 (row 7)
        paddw   mm3, mm0                        ; V29 = V10 + V6
        movq    mm1, mm7                        ; duplicate V26
        IdctScaleW mm7, x539f539f539f539f       ; mm7 = V26 * -2.613
        movq    mm0, mm5                        ; duplicate V2
        paddw   mm5, mm4                        ; V27 = V2 + V14
        psubw   mm0, mm4                        ; V28 = V2 - V14
        movq    mm2, mm0                        ; duplicate V28
        IdctScaleW mm0, x4546454645464546       ; mm0 = V28 * 1.082
        movq    mm4, mm5                        ; duplicate V27
        paddw   mm1, mm2                        ; V32 = V26 + V28
        IdctScaleW mm1, x61f861f861f861f8       ; mm1 = V32 * 1.847
        paddw   mm5, mm3                        ; V31 = V27 + V29
        psubw   mm4, mm3                        ; V30 = V27 - V29
        IdctScaleW mm4, x5a825a825a825a82       ; mm4 = V30 * 1.414
        psubw   mm0, mm1                        ; V38 = 1.082*V28 - 1.847*V32
        paddw   mm1, mm7                        ; V37 = 1.847*V32 - 2.613*V26
        movq    mm3, mm6                        ; duplicate V25
        movq    mm7, mmword ptr [esi+8*4]       ; V22
        psubw   mm1, mm5                        ; V39 = V37 - V31
        movq    mm2, mmword ptr [esi+8*12]      ; V23
        psubw   mm4, mm1                        ; V40 = 1.414*V30 - V39
        paddw   mm0, mm4                        ; V41 = V38 + V40

        ; ---- left half, output butterfly ----
        psubw   mm6, mm0                        ; row 3 = V25 - V41
        paddw   mm3, mm0                        ; row 4 = V25 + V41
        movq    mm0, mm1                        ; mm0 = V39
        movq    mm1, mm7                        ; duplicate V22
        paddw   mm7, mm5                        ; row 0 = V22 + V31
        movq    mmword ptr [esi+8*8], mm3       ; store row 4
        psubw   mm1, mm5                        ; row 7 = V22 - V31
        movq    mmword ptr [esi+8*6], mm6       ; store row 3
        movq    mm3, mm2                        ; duplicate V23
        movq    mm6, mmword ptr [esi+8*0]       ; V24
        paddw   mm2, mm0                        ; row 1 = V23 + V39
        movq    mmword ptr [esi+8*0], mm7       ; store row 0
        psubw   mm3, mm0                        ; row 6 = V23 - V39
        movq    mmword ptr [esi+8*14], mm1      ; store row 7
        movq    mmword ptr [esi+8*2], mm2       ; store row 1
        movq    mm0, mm6                        ; duplicate V24
        movq    mmword ptr [esi+8*12], mm3      ; store row 6
        paddw   mm6, mm4                        ; row 2 = V24 + V40
        psubw   mm0, mm4                        ; row 5 = V24 - V40
        IdctLoad mm1, 5, dequant                ; V5 (row 2, right half), loaded early
        movq    mmword ptr [esi+8*4], mm6       ; store row 2
        movq    mmword ptr [esi+8*10], mm0      ; store row 5

        ; ---- right half, even part: V1, V5, V9, V13 give V56..V59 ----
        IdctLoad mm7, 13, dequant               ; V13 (row 6)
        movq    mm2, mm1                        ; duplicate V5
        IdctLoad mm3, 1, dequant                ; V1  (row 0)
        psubw   mm1, mm7                        ; V50 = V5 - V13
        IdctLoad mm5, 9, dequant                ; V9  (row 4)
        paddw   mm2, mm7                        ; V51 = V5 + V13
        IdctScaleW mm1, x5a825a825a825a82       ; mm1 = V50 * 1.414
        movq    mm6, mm2                        ; duplicate V51
        movq    mm4, mm3                        ; duplicate V1
        paddw   mm3, mm5                        ; V53 = V1 + V9
        psubw   mm4, mm5                        ; V54 = V1 - V9
        movq    mm7, mm3                        ; duplicate V53
        psubw   mm1, mm6                        ; V55 = 1.414*V50 - V51
        paddw   mm3, mm2                        ; V56 = V53 + V51
        movq    mm5, mm4                        ; duplicate V54
        paddw   mm4, mm1                        ; V57 = V54 + V55
        movq    mmword ptr [esi+8*5], mm3       ; park V56
        psubw   mm5, mm1                        ; V58 = V54 - V55
        movq    mmword ptr [esi+8*13], mm4      ; park V57
        psubw   mm7, mm2                        ; V59 = V53 - V51, stays in mm7
        movq    mmword ptr [esi+8*9], mm5       ; park V58

        ; ---- right half, odd part: V3, V7, V11, V15 give V65, V73, V74, V75 ----
        ; An earlier note here described a rebiasing correction for V15;
        ; no such step is present in this code.
        IdctLoad mm0, 11, dequant               ; V11 (row 5)
        IdctLoad mm6, 7, dequant                ; V7  (row 3)
        movq    mm3, mm0                        ; duplicate V11
        IdctLoad mm4, 15, dequant               ; V15 (row 7)
        IdctLoad mm5, 3, dequant                ; V3  (row 1)
        paddw   mm0, mm6                        ; V63 = V11 + V7
        psubw   mm3, mm6                        ; V60 = V11 - V7
        movq    mm1, mm3                        ; duplicate V60
        IdctScaleW mm1, x539f539f539f539f       ; mm1 = V60 * -2.613
        movq    mm6, mm5                        ; duplicate V3
        paddw   mm5, mm4                        ; V61 = V3 + V15
        psubw   mm6, mm4                        ; V62 = V3 - V15
        movq    mm4, mm5                        ; duplicate V61
        paddw   mm5, mm0                        ; V65 = V61 + V63
        psubw   mm4, mm0                        ; V64 = V61 - V63
        IdctScaleW mm4, x5a825a825a825a82       ; mm4 = V64 * 1.414
        paddw   mm3, mm6                        ; V66 = V60 + V62
        movq    mm2, mm5                        ; duplicate V65
        IdctScaleW mm3, x61f861f861f861f8       ; mm3 = V66 * 1.847
        IdctScaleW mm6, x4546454645464546       ; mm6 = V62 * 1.082
        movq    mm0, mmword ptr [esi+8*5]       ; V56
        psubw   mm6, mm3                        ; V72 = 1.082*V62 - 1.847*V66
        paddw   mm3, mm1                        ; V71 = 1.847*V66 - 2.613*V60
        psubw   mm3, mm2                        ; V73 = V71 - V65
        psubw   mm4, mm3                        ; V74 = 1.414*V64 - V73
        movq    mm1, mm0                        ; duplicate V56
        paddw   mm6, mm4                        ; V75 = V72 + V74

        ; ---- right half, output butterfly ----
        ; Held here: V59 in mm7, V65 in mm5, V73 in mm3, V74 in mm4, V75 in mm6.
        paddw   mm0, mm5                        ; row 0 = V56 + V65
        movq    mm2, mmword ptr [esi+8*13]      ; V57
        psubw   mm1, mm5                        ; row 7 = V56 - V65, kept in mm1
        movq    mmword ptr [esi+8*1], mm0       ; store row 0
        movq    mm5, mm7                        ; duplicate V59
        psubw   mm7, mm6                        ; row 3 = V59 - V75
        paddw   mm5, mm6                        ; row 4 = V59 + V75, kept in mm5
        movq    mm6, mm3                        ; mm6 = V73
        movq    mm0, mmword ptr [esi+8*9]       ; V58
        movq    mm3, mm2                        ; duplicate V57
        movq    mmword ptr [esi+8*7], mm7       ; store row 3
        psubw   mm3, mm6                        ; row 6 = V57 - V73, kept in mm3
        paddw   mm2, mm6                        ; row 1 = V57 + V73
        movq    mm6, mm0                        ; duplicate V58
        movq    mmword ptr [esi+8*3], mm2       ; store row 1
        paddw   mm0, mm4                        ; row 2 = V58 + V74
        psubw   mm6, mm4                        ; row 5 = V58 - V74, kept in mm6
        movq    mmword ptr [esi+8*5], mm0       ; store row 2
ENDM

public _idct8x8aan
_idct8x8aan proc USES eax ebx ecx edx esi edi ebp

        mov     ebx, DWORD PTR [esp+IDCT_ARG_COEF]          ; source coeff
        mov     esi, DWORD PTR [esp+IDCT_ARG_WORKSPACE]     ; temp results
        mov     edi, DWORD PTR [esp+IDCT_ARG_QUANT]         ; quant factors

        ;-------------------------------------------------------------------
        ; Pass 1: dequantise the input and transform it into the workspace.
        ;-------------------------------------------------------------------
        Idct1DPass 1

        ;-------------------------------------------------------------------
        ; Transpose the workspace, one 4x4 quadrant at a time:
        ;
        ;    ---------            ---------
        ;   | M1 | M2 |          | M1'| M3'|
        ;    ---------    -->     ---------
        ;   | M3 | M4 |          | M2'| M4'|
        ;    ---------            ---------
        ;-------------------------------------------------------------------

        ; bottom right quadrant, still in registers:
        ; mm5 = row 4, mm6 = row 5, mm3 = row 6, mm1 = row 7
        movq      mm0, mm5                      ; copy row 4
        punpcklwd mm5, mm6
        punpckhwd mm0, mm6
        movq      mm6, mmword ptr [esi+8*0]     ; w0 of top left quadrant
        movq      mm2, mm3
        punpcklwd mm3, mm1
        movq      mm7, mmword ptr [esi+8*2]     ; w1 of top left quadrant
        punpckhwd mm2, mm1
        movq      mm4, mm5
        punpckldq mm5, mm3                      ; transposed w4
        movq      mmword ptr [esi+8*9], mm5     ; store w4
        punpckhdq mm4, mm3                      ; transposed w5
        movq      mm3, mm0
        punpckldq mm0, mm2                      ; transposed w6
        movq      mmword ptr [esi+8*11], mm4    ; store w5
        punpckhdq mm3, mm2                      ; transposed w7
        movq      mmword ptr [esi+8*13], mm0    ; store w6
        movq      mm5, mm6                      ; copy w0
        movq      mmword ptr [esi+8*15], mm3    ; store w7
        punpcklwd mm6, mm7

        ; top left quadrant
        punpckhwd mm5, mm7
        movq      mm7, mmword ptr [esi+8*4]     ; w2 of top left quadrant
        movq      mm4, mmword ptr [esi+8*6]     ; w3 of top left quadrant
        movq      mm3, mm7                      ; copy w2
        movq      mm2, mm6
        punpcklwd mm7, mm4
        punpckhwd mm3, mm4
        movq      mm4, mm5
        movq      mm1, mm5
        punpckldq mm6, mm7
        movq      mmword ptr [esi+8*0], mm6     ; store w0 of top left quadrant
        punpckhdq mm2, mm7
        movq      mmword ptr [esi+8*2], mm2     ; store w1 of top left quadrant
        punpckldq mm5, mm3
        movq      mmword ptr [esi+8*4], mm5     ; store w2 of top left quadrant
        punpckhdq mm1, mm3
        movq      mmword ptr [esi+8*6], mm1     ; store w3 of top left quadrant

        ; top right quadrant; its transposed rows land in the bottom left
        ; slots (8*8, 8*10, 8*12, 8*14), each written only after the old
        ; bottom left row in that slot has been loaded
        movq      mm0, mmword ptr [esi+8*1]
        movq      mm1, mmword ptr [esi+8*3]
        movq      mm2, mm0
        movq      mm3, mmword ptr [esi+8*5]
        punpcklwd mm0, mm1
        punpckhwd mm2, mm1
        movq      mm1, mmword ptr [esi+8*7]
        movq      mm4, mm3
        punpcklwd mm3, mm1
        punpckhwd mm4, mm1
        movq      mm1, mm0
        movq      mm5, mm2
        punpckldq mm0, mm3
        punpckhdq mm1, mm3
        movq      mm3, mmword ptr [esi+8*8]     ; old bottom left w4
        movq      mmword ptr [esi+8*8], mm0
        punpckldq mm2, mm4
        punpckhdq mm5, mm4
        movq      mm4, mmword ptr [esi+8*10]    ; old bottom left w5

        ; bottom left quadrant; its transposed rows land in the top right
        ; slots (8*1, 8*3, 8*5, 8*7)
        movq      mmword ptr [esi+8*10], mm1
        movq      mm1, mm3
        movq      mm0, mmword ptr [esi+8*12]    ; old bottom left w6
        punpcklwd mm3, mm4
        punpckhwd mm1, mm4
        movq      mm4, mmword ptr [esi+8*14]    ; old bottom left w7
        movq      mmword ptr [esi+8*12], mm2
        movq      mm2, mm0
        movq      mmword ptr [esi+8*14], mm5
        punpcklwd mm0, mm4
        punpckhwd mm2, mm4
        movq      mm4, mm3
        movq      mm5, mm1
        punpckldq mm3, mm0
        movq      mmword ptr [esi+8*1], mm3
        punpckhdq mm4, mm0
        movq      mmword ptr [esi+8*3], mm4
        punpckldq mm1, mm2
        movq      mmword ptr [esi+8*5], mm1
        punpckhdq mm5, mm2
        movq      mmword ptr [esi+8*7], mm5

        ;-------------------------------------------------------------------
        ; Pass 2: same 1-D transform over the transposed workspace.
        ;-------------------------------------------------------------------
        mov     esi, DWORD PTR [esp+IDCT_ARG_WORKSPACE]     ; source

        Idct1DPass 0

        ;-------------------------------------------------------------------
        ; Final transpose, fused with descaling, range limiting and the
        ; byte stores. Each transposed quadrant row is written as soon as
        ; it has been formed.
        ;-------------------------------------------------------------------
        mov     edi, [esp+IDCT_ARG_RANGE]       ; pointer to array "range"

        ; bottom right quadrant (in mm5, mm6, mm3, mm1): rows 4..7, columns 4..7
        IdctRowPtrs 16
        movq      mm0, mm5                      ; copy row 4
        punpcklwd mm5, mm6
        punpckhwd mm0, mm6
        movq      mm6, mmword ptr [esi+8*0]     ; w0 of top left quadrant
        movq      mm2, mm3
        punpcklwd mm3, mm1
        movq      mm7, mmword ptr [esi+8*2]     ; w1 of top left quadrant
        punpckhwd mm2, mm1
        movq      mm4, mm5
        punpckldq mm5, mm3                      ; transposed w4
        IdctStore4 mm5, ebp, 4
        punpckhdq mm4, mm3                      ; transposed w5
        movq      mm3, mm0
        punpckldq mm0, mm2                      ; transposed w6
        IdctStore4 mm4, ebx, 4
        punpckhdq mm3, mm2                      ; transposed w7
        IdctStore4 mm0, ecx, 4
        movq      mm5, mm6                      ; copy w0
        IdctStore4 mm3, edx, 4
        punpcklwd mm6, mm7

        ; top left quadrant: rows 0..3, columns 0..3
        IdctRowPtrs 0
        punpckhwd mm5, mm7
        movq      mm7, mmword ptr [esi+8*4]     ; w2 of top left quadrant
        movq      mm4, mmword ptr [esi+8*6]     ; w3 of top left quadrant
        movq      mm3, mm7                      ; copy w2
        movq      mm2, mm6
        punpcklwd mm7, mm4
        punpckhwd mm3, mm4
        movq      mm4, mm5
        movq      mm1, mm5
        punpckldq mm6, mm7
        IdctStore4 mm6, ebp, 0
        punpckhdq mm2, mm7
        IdctStore4 mm2, ebx, 0
        punpckldq mm5, mm3
        IdctStore4 mm5, ecx, 0
        punpckhdq mm1, mm3
        IdctStore4 mm1, edx, 0

        ; top right quadrant of the workspace, transposed: rows 4..7, columns 0..3
        IdctRowPtrs 16
        movq      mm0, mmword ptr [esi+8*1]
        movq      mm1, mmword ptr [esi+8*3]
        movq      mm2, mm0
        movq      mm3, mmword ptr [esi+8*5]
        punpcklwd mm0, mm1
        punpckhwd mm2, mm1
        movq      mm1, mmword ptr [esi+8*7]
        movq      mm4, mm3
        punpcklwd mm3, mm1
        punpckhwd mm4, mm1
        movq      mm1, mm0
        movq      mm5, mm2
        punpckldq mm0, mm3
        punpckhdq mm1, mm3
        movq      mm3, mmword ptr [esi+8*8]     ; bottom left w4, used further down
        IdctStore4 mm0, ebp, 0
        punpckldq mm2, mm4
        punpckhdq mm5, mm4
        movq      mm4, mmword ptr [esi+8*10]    ; bottom left w5
        IdctStore4 mm1, ebx, 0

        ; start on the bottom left quadrant while the last two rows of the
        ; previous quadrant are still being stored
        movq      mm1, mm3
        movq      mm0, mmword ptr [esi+8*12]    ; bottom left w6
        punpcklwd mm3, mm4
        punpckhwd mm1, mm4
        movq      mm4, mmword ptr [esi+8*14]    ; bottom left w7
        IdctStore4 mm2, ecx, 0
        movq      mm2, mm0
        IdctStore4 mm5, edx, 0
        punpcklwd mm0, mm4
        punpckhwd mm2, mm4
        movq      mm4, mm3
        movq      mm5, mm1
        punpckldq mm3, mm0

        ; bottom left quadrant of the workspace, transposed: rows 0..3, columns 4..7
        IdctRowPtrs 0
        IdctStore4 mm3, ebp, 4
        punpckhdq mm4, mm0
        IdctStore4 mm4, ebx, 4
        punpckldq mm1, mm2
        IdctStore4 mm1, ecx, 4
        punpckhdq mm5, mm2
        IdctStore4 mm5, edx, 4

        emms                                    ; leave MMX state before returning
        ret

_idct8x8aan ENDP
_TEXT ENDS
END
|