|
|
#include "stdafx.h"
#pragma hdrstop
/***************************************************************************
*
*   INTEL Corporation Proprietary Information
*
*   Copyright (c) 1996 Intel Corporation.
*   All rights reserved.
*
***************************************************************************
    AUTHOR: Kumar Balasubramanian
***************************************************************************
** MMX version of the "integer LLM mode" within the IJG decompressor code.
** The following is an MMX implementation of the integer slow mode
** IDCT within the IJG code.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
#ifdef DCT_ISLOW_SUPPORTED
/*
* This module is specialized to the case DCTSIZE = 8. */
#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif

/*
 * Fixed-point scaling parameters.
 *
 * NOTE: the inline assembly in this file does not reference these macros.
 * Its shift counts are written as literals (13, 11 and 18), which equal
 * CONST_BITS, CONST_BITS-PASS1_BITS and CONST_BITS+PASS1_BITS+3 only for
 * the BITS_IN_JSAMPLE == 8 values below.
 */
#if BITS_IN_JSAMPLE == 8
#define CONST_BITS  13
#define PASS1_BITS  2
#else
#define CONST_BITS  13
#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
#endif
/* Define the constants for the case BITS_IN_JSAMPLE = 8 */
/*
 * Multiplier constants, 13-bit fixed point (value * 8192, rounded).
 * Each 64-bit constant holds the multiplier in 16-bit words 0 and 2 and
 * zero in words 1 and 3, so a single pmaddwd scales the even-numbered
 * words of a register and widens the two products to 32 bits.
 */
static const __int64 const_0_2986  = 0x0000098E0000098E ;   /* 0.298631336 */
static const __int64 const_0_3901  = 0x00000c7c00000c7c;    /* 0.390180644 */
static const __int64 const_0_54119 = 0x0000115100001151;    /* 0.541196100 */
static const __int64 const_0_7653  = 0x0000187E0000187E;    /* 0.765366865 */
static const __int64 const_0_899   = 0x00001ccd00001ccd;    /* 0.899976223 */
static const __int64 const_1_175   = 0x000025a1000025a1;    /* 1.175875602 */
static const __int64 const_1_501   = 0x0000300b0000300b;    /* 1.501321110 */
static const __int64 const_1_8477  = 0x00003b2100003b21;    /* 1.847759065 */
static const __int64 const_1_961   = 0x00003ec500003ec5 ;   /* 1.961570560 */
static const __int64 const_2_053   = 0x000041b3000041b3 ;   /* 2.053119869 */
static const __int64 const_2_562   = 0x0000520300005203 ;   /* 2.562915447 */
static const __int64 const_3_072   = 0x0000625400006254 ;   /* 3.072711026 */

/*
 * Helper bit patterns: negation (xor with all ones, then add 1 per dword),
 * rounding terms for the two descale shifts, and the range_limit index mask.
 */
static const __int64 const_all_ones  = 0x0ffffffffffffffff; /* every bit set            */
static const __int64 const_0_1_0_1   = 0x0000000100000001 ; /* 1 in each 32-bit lane    */
static const __int64 const_zero      = 0x0000000000000000;
static const __int64 const_1_0       = 0x0000000100000001 ; /* same bits as const_0_1_0_1 */
static const __int64 const_round     = 0x0000040000000400;  /* 1 << 10 per 32-bit lane  */
static const __int64 const_round_two = 0x0002000000020000;  /* 1 << 17 per 32-bit lane  */
static const __int64 const_mask      = 0x000003ff000003ff;  /* low 10 bits per lane     */

/*
 * Packed multiplier pairs used by the row pass, where one register holds
 * four elements of a single row and the two 32-bit halves need different
 * multipliers.
 */
static const __int64 const_00_1_84_00_0_765   = 0x00003b210000187E;
static const __int64 const_00_0_5411_00_00    = 0x0000115100000000;
static const __int64 const_3_072_00_1_501_00  = 0x62540000300b0000;
static const __int64 const_0_2986_00_2_053_00 = 0x098E000041b30000;
static const __int64 const_0_899_00_2_562_00  = 0x1ccd000052030000;
static const __int64 const_1_96_00_0_3901_00  = 0x3ec500000c7c0000;
static const __int64 const_1_175_00_00_00     = 0x25a1000000000000;
/*
* Perform dequantization and inverse DCT on one block of coefficients. */
GLOBAL(void) midct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit ) {
INT32 locdwinptr, locdwqptr, locdwwsptr, locdwcounter, locdwrowctr ; __int64 locqwtmp0e,locqwtmp0o, locqwtmp1e, locqwtmp1o, locqwtmp2e ;
__int64 locqwtmp10e , locqwtmp10o ,locqwtmp11e , locqwtmp11o , locqwtmp12e , locqwtmp12o , locqwtmp13e , locqwtmp13o ,locqwtmp0 , locqwtmp1 ,locqwtmp2 ,locqwtmp3 , locqwz5e ,locqwz5o ,locqwz1e ,locqwz1o , locqwz13e ,locqwz13o ,locqwz14e , locqwz14o ,locqwz23e ,locqwz23o , locqwz24e ,locqwz24o ;
// Inline assembly to do the IDCT and store the result */
__asm {
mov esi, inptr ; load the input pointer mov edi, quantptr ; load the quant table pointer
mov locdwinptr, esi ; to be used in the idct_column loop mov locdwqptr, edi ; to be used in the idct_column loop
mov esi, wsptr mov locdwcounter, 2 ; idct_column loop counter
mov locdwwsptr, esi
;; do the idct on all the columns. Do four columns per ;; iteration of the loop.
idct_column:
mov esi, locdwinptr ; get the source pointer mov edi, locdwqptr ; get the quantzn. pointer
;; fetch C2 and Q2 movq mm0, [esi+16*2] ; get C2
movq mm1, [edi+16*2] ; get Q2
movq mm2, [esi+16*6] ; get C6 pmullw mm0, mm1 ; dequantized C2 = z2
movq mm3, [edi+16*6] ; get Q6
movq mm6, const_0_7653 pmullw mm2, mm3 ; dequant. C6 = z3
movq mm7, const_1_8477 movq mm4, mm0 ; copy z2
pmaddwd mm4, mm6 ; tmp3 - z1 for columns 0 & 2 movq mm5, mm0 ; copy z2
movq mm3, mm2 ; z3 copy psrlq mm5, 16 ; move z2 columns 1 & 3 to 0 & 2
movq mm1, const_0_54119 pmaddwd mm5, mm6 ; tmp3 - z1 for columns 1 & 3
psrlq mm3, 16 ; move z3 columns 1 & 3 to 0 & 2 paddw mm0, mm2 ; z2 + z3
pmaddwd mm2, mm7 ; tmp2 - z1 for columns 0 & 2 movq mm6, mm0 ; z2 + z3 copy
psrlq mm6, 16 ; z2 + z3 columns 1 & 3 in 0 & 2 pmaddwd mm3, mm7 ; tmp2 - z1 for columns 1 & 3
movq mm7, const_all_ones pmaddwd mm0, mm1 ; z1 columns 0 & 2
pmaddwd mm6, mm1 ; z1 columns 1 & 3 pxor mm2, mm7 ; 1s complement of tmp2 - z1
movq mm1, const_0_1_0_1 pxor mm3, mm7 ; 1s complement of tmp2 - z1
paddd mm2, mm1 ; 2s complement of tmp2 - z1(col 0 &2) paddd mm3, mm1 ; 2s complement of tmp2 - z1(col 1 & 3)
paddd mm2, mm0 ; tmp2 (columns 0 & 2) paddd mm4, mm0 ; tmp2 (cols. 1 & 3)
;; get C0 and Q0 movq mm0, [esi+16*0] ; get C0 paddd mm3, mm6 ; tmp3
movq mm1, [edi+16*0] ; getQ0 paddd mm5, mm6 ; tmp3
movq mm6, [esi+16*4] ; get C4 pmullw mm0, mm1 ; dequant C0 = z2
movq mm7, [edi+16*4] ; get Q4 nop
movq locqwtmp2e, mm2 ; store tmp2 even part pmullw mm6, mm7 ; dequant C4 = z3
movq mm7, const_1_0 movq mm1, mm0 ; copy of z2
paddw mm0, mm6 ; z2+z3 nop
psubw mm1, mm6 ; z2-z3 movq mm6, mm0 ; z2+z3 copy
pmaddwd mm0, mm7 ; get 0 & 2 cols psrlq mm6, 16 ; get the other two cols.
pmaddwd mm6, mm7 ; movq mm2, mm1 ; copy of z2-z3
pmaddwd mm1, mm7 psrlq mm2, 16
pmaddwd mm2, mm7 pslld mm0, 13 ; tmp0 cols 0&2
movq mm7, mm4 pslld mm6, 13 ; tmp0 cols 1 & 3
paddd mm4, mm0 ; psubd mm0, mm7 ;
movq mm7, mm5 pslld mm2, 13
movq locqwtmp13e, mm0 ; store tmp13 cols 0&2 paddd mm5, mm6
movq mm0, locqwtmp2e psubd mm6, mm7
movq locqwtmp10o, mm5 ; store tmp10 cols 1&3 movq mm7, mm3
movq locqwtmp13o, mm6 ; store tmp13 cols 1&3 paddd mm3, mm2
movq locqwtmp10e, mm4 ; store tmp10 cols 0&2 pslld mm1, 13
movq locqwtmp11o, mm3 ; store tmp11 cols 1,3 psubd mm2, mm7
movq mm6, [esi+16*1] movq mm3, mm0
movq locqwtmp12o, mm2 ; store tmp12 cols. 1,3 paddd mm0, mm1
movq mm7, [edi+16*1]
movq locqwtmp11e, mm0 ; store tmp11 cols. 0,2 psubd mm1, mm3
movq mm0, [esi+16*7] pmullw mm6, mm7 ; dequant. C1 = tmp3
movq locqwtmp12e, mm1
;; completed the even part. ;; Now start the odd part
movq mm1, [edi+16*7] ; get C7
movq mm2, [esi+16*5] ; get C5 pmullw mm0, mm1 ; dequant. C7 = tmp0
movq mm3, [edi+16*5]
movq mm4, [esi+16*3] pmullw mm2, mm3 ; dequant. C5 = tmp1
movq mm5, [edi+16*3] movq mm1, mm0
movq locqwtmp3, mm6 pmullw mm4, mm5 ; dequant. C3 = tmp2
movq locqwtmp0, mm0 paddw mm0, mm6 ; z1
movq locqwtmp1, mm2 movq mm3, mm2
movq locqwtmp2, mm4 paddw mm2, mm4 ; z2
paddw mm1, mm4 ; z3
movq mm4, const_1_175 paddw mm3, mm6 ; z4
movq mm5, mm1 movq mm7, mm0
psrlq mm7, 16 ; other two cols. of z1 paddw mm5, mm3 ; z3 + z4
movq mm6, mm5 pmaddwd mm5, mm4 ; z5 cols 0 & 2
pmaddwd mm0, const_0_899 ; z1 even part psrlq mm6, 16
pmaddwd mm6, mm4 ; z5 cols 1 & 3 movq mm4, mm2 ; z2 copy
movq locqwz5e, mm5 psrlq mm4, 16 ; get z2 cols 1 & 3
pxor mm0, const_all_ones movq mm5, mm1
movq locqwz5o, mm6 psrlq mm5, 16
movq mm6, const_2_562 nop
paddd mm0, const_0_1_0_1 pmaddwd mm2, mm6 ; z2 cols 0 & 2
movq locqwz1e, mm0 pmaddwd mm4, mm6 ; z2 cols 1 & 3
pmaddwd mm7, const_0_899 ; z1 movq mm0, mm3
movq mm6, const_1_961 psrlq mm0, 16
pxor mm2, const_all_ones pmaddwd mm1, mm6 ; z3 cols 0 & 2
paddd mm2, const_0_1_0_1 pmaddwd mm5, mm6 ; z3 cols 1 & 3
movq mm6, const_0_3901 nop
pxor mm4, const_all_ones pmaddwd mm3, mm6 ; z4 cols 0 & 2
paddd mm4, const_0_1_0_1 pmaddwd mm0, mm6 ; z4 cols 1 & 3
movq mm6, const_all_ones nop
pxor mm1, mm6 pxor mm7, mm6
;; twos complement of z1, z2, z3, z4
paddd mm1, const_0_1_0_1 pxor mm5, mm6
paddd mm7, const_0_1_0_1 pxor mm3, mm6
paddd mm5, const_0_1_0_1 nop
movq locqwz1o, mm7 pxor mm0, mm6
paddd mm1, locqwz5e ; z3+z5 cols 0 & 2 nop
movq mm6, locqwz1e nop
paddd mm5, locqwz5o ; z3+z5 cols 1 & 3 paddd mm6, mm1
paddd mm3, const_0_1_0_1 paddd mm1, mm2
paddd mm0, const_0_1_0_1 paddd mm7, mm5
paddd mm3, locqwz5e ; z4+z5 cols 0 & 2 paddd mm5, mm4
paddd mm0, locqwz5o ; z4+z5 cols 0 & 2 paddd mm2, mm3
paddd mm3, locqwz1e paddd mm4, mm0
paddd mm0, locqwz1o
movq locqwz23e, mm1 nop
movq locqwz14o, mm0 nop
movq mm0, locqwtmp0 nop
movq locqwz24e, mm2 movq mm1, mm0
movq mm2, const_0_2986 psrlq mm1, 16
movq locqwz14e, mm3 pmaddwd mm0, mm2 ; tmp0 even
movq mm3, locqwtmp1 pmaddwd mm1, mm2 ; tmp0 odd
movq locqwz24o, mm4 movq mm2, mm3
movq mm4, const_2_053 psrlq mm2, 16
movq locqwz23o, mm5 pmaddwd mm3, mm4 ; tmp1 even
movq mm5, locqwtmp2 pmaddwd mm2, mm4 ; tmp1 odd
movq locqwz13e, mm6 movq mm4, mm5
movq mm6, const_3_072 psrlq mm4, 16
movq locqwz13o, mm7 pmaddwd mm5, mm6 ; tmp2 even ;;;;;;; now calculate tmp0..tmp3 ;; then calculate the pre-descaled values ;; this includes the right shift with rounding
movq mm7, locqwtmp3 pmaddwd mm4, mm6 ; tmp2 odd
paddd mm0, locqwz13e movq mm6, mm7
paddd mm1, locqwz13o psrlq mm6, 16
movq locqwtmp0e, mm0 ; tmp0 even nop
movq mm0, const_1_501 nop
movq locqwtmp0o, mm1 pmaddwd mm7, mm0
paddd mm3, locqwz24e pmaddwd mm6, mm0
movq mm0, locqwtmp10e nop
paddd mm7, locqwz14e nop
paddd mm6, locqwz14o psubd mm0, mm7
movq mm1, locqwtmp10o nop
movq locqwtmp1e, mm3 psubd mm1, mm6
movq mm3, const_round nop
paddd mm2, locqwz24o paddd mm0, mm3
paddd mm7, locqwtmp10e psrad mm0, 11
movq locqwtmp1o, mm2 paddd mm1, mm3
paddd mm6, locqwtmp10o psrad mm1, 11
paddd mm5, locqwz23e movq mm2, mm0
paddd mm4, locqwz23o punpcklwd mm0, mm1
paddd mm6, mm3 punpckhwd mm2, mm1
paddd mm7, mm3 punpckldq mm0, mm2
;; now do all the stores of the 1D-iDCT of the four columns
mov edi, locdwwsptr ; get pointer to scratch pad array
movq [edi+16*7], mm0 ; store wsptr[7] psrad mm6, 11
movq mm2, locqwtmp11e psrad mm7, 11
psubd mm2, mm5 movq mm0, mm7
movq mm1, locqwtmp11o punpcklwd mm7, mm6
psubd mm1, mm4 punpckhwd mm0, mm6
paddd mm5, locqwtmp11e punpckldq mm7, mm0
paddd mm4, locqwtmp11o paddd mm2, mm3
paddd mm1, mm3 paddd mm5, mm3
paddd mm4, mm3 psrad mm2, 11
movq [edi+16*0], mm7 ; store wsptr[0] psrad mm1, 11
movq mm0, mm2 psrad mm5, 11
movq mm6, locqwtmp12e punpcklwd mm2, mm1
punpckhwd mm0, mm1 movq mm1, mm5
movq mm7, locqwtmp12o punpckldq mm2, mm0
movq [edi+16*6], mm2 ; store wsptr[6] psrad mm4, 11
movq mm2, mm6 punpcklwd mm5, mm4
paddd mm6, locqwtmp1e punpckhwd mm1, mm4
psubd mm2, locqwtmp1e punpckldq mm5, mm1
movq [edi+16*1], mm5 ; store wsptr[1] movq mm0, mm7
paddd mm7, locqwtmp1o paddd mm6, mm3
psubd mm0, locqwtmp1o paddd mm7, mm3
paddd mm2, mm3 psrad mm7, 11
paddd mm0, mm3 psrad mm6, 11
movq mm1, mm6 psrad mm2, 11
movq mm4, locqwtmp13e punpcklwd mm6, mm7
movq mm5, mm4 punpckhwd mm1, mm7
paddd mm4, locqwtmp0e punpckldq mm6, mm1
psubd mm5, locqwtmp0e psrad mm0, 11
movq [edi+16*2], mm6 ; store wsptr[2] movq mm6, mm2
paddd mm4, mm3 punpcklwd mm2, mm0
paddd mm5, mm3 punpckhwd mm6, mm0
movq mm0, locqwtmp13o punpckldq mm2, mm6
movq mm1, mm0 psrad mm4, 11
paddd mm0, locqwtmp0o psrad mm5, 11
paddd mm0, mm3 movq mm6, mm4
psubd mm1, locqwtmp0o psrad mm0, 11
paddd mm1, mm3 punpcklwd mm4, mm0
movq mm3, mm5 punpckhwd mm6, mm0
movq [edi+16*5], mm2 ; store wsptr[5] punpckldq mm4, mm6
psrad mm1, 11
movq [edi+16*3], mm4 ; store wsptr[3] punpcklwd mm5, mm1
punpckhwd mm3, mm1
punpckldq mm5, mm3
add locdwinptr, 8 ; skip first four columns add locdwqptr, 8
movq [edi+16*4], mm5 ; store wsptr[4]
;;;;;;; done with 1D-idct of four columns ;;;;;;;
;; now update pointers for next four columns
add locdwwsptr, 8 mov eax, locdwcounter
dec eax
mov locdwcounter, eax jnz idct_column
;;;;;;;end of 1D-idct on the columns ;;;;;;;
mov esi, wsptr ; get start addr of temp array mov locdwcounter, 8
mov locdwwsptr, esi mov locdwrowctr, 0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;; start of 1D-idct on the rows ;;;;;;;
idct_row:
mov esi, locdwwsptr ; get next row start addr of temp array mov edi, output_buf
movq mm0, [esi+0] ; get first 4 elements of row
movq mm1, [esi+2*4] ; get next 4 elem. of row movq mm2, mm0
movq mm3, mm0 ; copy of e3|e2|e1|e0 paddw mm2, mm1 ; (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
movq mm4, mm2 ; copy of (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4) punpckhdq mm3, mm1 ; e7|e6|e3|e2
pmaddwd mm3, const_00_1_84_00_0_765 ; (tmp2 - z1)||(tmp3-z1) movq mm6, mm0 ; copy of e3|e2|e1|e0
pmaddwd mm2, const_00_0_5411_00_00 ; z1||xxx psubw mm6, mm1 ; (e3-e7)|(e2-e6)|(e1-e5)|(e0-e4)
punpckldq mm4, mm6 ; (e1-e5)|(e0-e4)|(e1+e5)|(e0+e4) movq mm6, mm0 ;
movq mm5, mm3 pslld mm4, 16 ; (e0-e4)|(e1+e5)||(e0+e4)|x0000
pxor mm3, const_all_ones punpckhdq mm2, mm2 ; z1||z1
paddd mm3, const_0_1_0_1 psrad mm4, 3 ; (e0-e4)<<13||(e0+e4)<<13
psrlq mm3, 32 movq mm7, mm4 ; copy of tmp1||tmp0
punpckldq mm5, mm3 movq mm3, mm0 ; e3|e2|e1|e0
paddd mm5, mm2 ; tmp2 || tmp3 paddw mm3, mm1 ; (e7+e3)|(e2+e6)|(e1+e5)|(e0+e4)
paddd mm4, mm5 psubd mm7, mm5
;; end of even part calculation ;; ;; mm0 => e3|e2|e1|e0 ;; mm1 => e7|e6|e5|e4 ;; mm4 => tmp11||tmp10 ;; mm7 => tmp12||tmp13
movq mm5, mm3 movq mm2, mm0
pmaddwd mm0, const_3_072_00_1_501_00 ; tmp2|tmp3 punpckldq mm5, mm5
paddw mm5, mm3 punpckldq mm2, mm2
pmaddwd mm5, const_1_175_00_00_00 ; z5|0 punpckhdq mm6, mm2
pmaddwd mm3, const_1_96_00_0_3901_00 ; z3|z4 paddw mm6, mm1
pmaddwd mm6, const_0_899_00_2_562_00 ; z1|z2 nop
pmaddwd mm1, const_0_2986_00_2_053_00 ; tmp0|tmp1 punpckhdq mm5, mm5
movq mm2, const_0_1_0_1 nop
pxor mm3, const_all_ones nop
pxor mm6, const_all_ones paddd mm3, mm2
paddd mm6, mm2 paddd mm3, mm5
movq mm5, mm6 paddd mm6, mm3
movq mm2, mm5 punpckldq mm5, mm5
punpckhdq mm2, mm5 paddd mm1, mm6
paddd mm2, mm3 movq mm5, mm1
movq mm3, mm4 paddd mm0, mm2
movq mm2, mm7 punpckldq mm5, mm5
punpckhdq mm1, mm5 psubd mm3, mm0
movq mm5, const_round_two paddd mm0, mm4
movq mm6, const_mask psubd mm2, mm1
paddd mm0, mm5 paddd mm1, mm7
;; descale the resulting coeff values paddd mm1, mm5 psrad mm0, 18
paddd mm3, mm5 psrad mm1, 18
paddd mm2, mm5 psrad mm3, 18
;; mask the result with RANGE_MASK (least 10 bits) pand mm1, mm6 ; w2|w3 psrad mm2, 18
movd ebx, mm1 ; w3 psrlq mm1, 32 ; 0|w2
;; using the results as index, get the corresponding ;; value from array range_limit and store the final result
mov ecx, range_limit ; get start addr of range_limit array add edi, locdwrowctr
movd edx, mm1 ; w2 pand mm0, mm6 ; w1|w0
mov ah, [ecx][ebx] ; w3 mov edi, [edi]
movd ebx, mm0 ; w0 psrlq mm0, 32 ; 0|w1
mov al, [ecx][edx] ; w2 add locdwrowctr, 4
movd edx, mm0 ; w1 pand mm3, mm6 ; w6|w7
add edi, output_col ; this is the dest start addr for this row shl eax, 16 ; w3|w2|0|0
mov al, [ecx][ebx] ; w0
mov ah, [ecx][edx] ; w1
movd mm4, eax ; w3|w2|w1|w0 pand mm2, mm6 ; w5|w4
movd ebx, mm3 ; w7 psrlq mm3, 32 ; 0|w6
movd edx, mm3 ; w6
mov ah, [ecx][ebx] ; w7
mov al, [ecx][edx] ; w6
movd ebx, mm2 ; w4 psrlq mm2, 32 ; 0|w5
shl eax, 16 ; w7|w6|0|0
movd edx, mm2 ; w5
mov al, [ecx][ebx] ; w4
mov ah, [ecx][edx] ; w5
movd mm5, eax ; w7|w6|w5|w4
punpckldq mm4, mm5 ; w7|w6|w5|w4|w3|w2|w1|w0
add locdwwsptr, 16 mov eax, locdwcounter
movq [edi], mm4
;; update address pointer and loop counter
dec eax
mov locdwcounter, eax jnz idct_row
;;;;;;; end of 1D-idct on all the rows ;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
emms
} //end of __asm
}
#endif /* DCT_ISLOW_SUPPORTED */
|