Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

884 lines
15 KiB

#include "stdafx.h"
#pragma hdrstop
/***************************************************************************
*
* INTEL Corporation Proprietary Information
*
*
* Copyright (c) 1996 Intel Corporation.
* All rights reserved.
*
***************************************************************************
AUTHOR: Kumar Balasubramanian
***************************************************************************
** MMX version of the "integer LLM mode" within IJG decompressor code.
** The following is an MMX implementation of the integer slow mode
** IDCT within the IJG code.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
#ifdef DCT_ISLOW_SUPPORTED
/*
* This module is specialized to the case DCTSIZE = 8.
*/
#if DCTSIZE != 8
Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif
/*
* Fixed-point scaling parameters.
*
* NOTE(review): in the visible part of this file the inline assembly of
* midct8x8llm never references CONST_BITS or PASS1_BITS. It uses literal
* shift counts instead: 13 (equal to CONST_BITS), 11 (equal to
* CONST_BITS - PASS1_BITS when PASS1_BITS is 2) and 18 (equal to
* CONST_BITS + PASS1_BITS + 3 when PASS1_BITS is 2). Changing these macros
* alone therefore does not change what the MMX code computes.
*/
#if BITS_IN_JSAMPLE == 8
#define CONST_BITS 13
#define PASS1_BITS 2
#else
#define CONST_BITS 13
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
#endif
/* Define the constants for the case BITS_IN_JSAMPLE = 8 */
static const __int64 const_0_2986 = 0x0000098E0000098E ;
static const __int64 const_0_3901 = 0x00000c7c00000c7c;
static const __int64 const_0_54119 = 0x0000115100001151;
static const __int64 const_0_7653 = 0x0000187E0000187E;
static const __int64 const_0_899 = 0x00001ccd00001ccd;
static const __int64 const_1_175 = 0x000025a1000025a1;
static const __int64 const_1_501 = 0x0000300b0000300b;
static const __int64 const_1_8477 = 0x00003b2100003b21;
static const __int64 const_1_961 = 0x00003ec500003ec5 ;
static const __int64 const_2_053 = 0x000041b3000041b3 ;
static const __int64 const_2_562 = 0x0000520300005203 ;
static const __int64 const_3_072 = 0x0000625400006254 ;
static const __int64 const_all_ones = 0x0ffffffffffffffff;
static const __int64 const_0_1_0_1 = 0x0000000100000001 ;
static const __int64 const_zero = 0x0000000000000000;
static const __int64 const_1_0 = 0x0000000100000001 ;
static const __int64 const_round = 0x0000040000000400;
static const __int64 const_round_two = 0x0002000000020000;
static const __int64 const_mask = 0x000003ff000003ff;
static const __int64 const_00_1_84_00_0_765 = 0x00003b210000187E;
static const __int64 const_00_0_5411_00_00 = 0x0000115100000000;
static const __int64 const_3_072_00_1_501_00 = 0x62540000300b0000;
static const __int64 const_0_2986_00_2_053_00 = 0x098E000041b30000;
static const __int64 const_0_899_00_2_562_00 = 0x1ccd000052030000;
static const __int64 const_1_96_00_0_3901_00 = 0x3ec500000c7c0000;
static const __int64 const_1_175_00_00_00 = 0x25a1000000000000;
/*
* Perform dequantization and inverse DCT on one block of coefficients.
*
* The whole routine is one inline-assembly block using MMX instructions.
*
* Parameters:
*   inptr       - coefficient block, read as eight rows of 16 bytes, four
*                 16-bit values per movq.
*   quantptr    - quantization multipliers in the same layout; each
*                 coefficient is multiplied by its entry with pmullw.
*   wsptr       - scratch array; pass 1 writes eight rows of eight 16-bit
*                 intermediate values to it and pass 2 reads them back.
*   output_buf  - table of row pointers; the pointer for result row k is read
*                 from byte offset 4*k.
*   output_col  - offset added to each row pointer before the 8-byte store.
*   range_limit - byte lookup table, indexed with the low 10 bits of every
*                 descaled result to produce the output sample.
*
* Pass 1 (label idct_column) runs twice and handles four columns per
* iteration: results are computed as 32-bit values, with columns 0 and 2
* ("even", suffix e) and columns 1 and 3 ("odd", suffix o) kept in separate
* registers, rounded with const_round, shifted right by 11 and stored as
* 16-bit words.
* Pass 2 (label idct_row) runs eight times, once per row of wsptr: results
* are rounded with const_round_two, shifted right by 18, masked with
* const_mask and translated through range_limit.
*
* Registers written: eax, ebx, ecx, edx, esi, edi, mm0-mm7. emms is executed
* before the block ends.
* NOTE(review): pointers are kept in 32-bit registers and INT32 locals, and
* the row-pointer table is stepped by 4 bytes, so this appears to assume a
* 32-bit x86 target -- confirm before reusing elsewhere.
*/
GLOBAL(void)
midct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
{
/* Loop state kept in memory: current source, quant and workspace positions, */
/* the loop counter shared by both passes, and the byte offset into output_buf. */
INT32 locdwinptr, locdwqptr, locdwwsptr, locdwcounter, locdwrowctr ;
/* 64-bit spill slots for MMX registers; each holds two 32-bit lanes or four */
/* 16-bit lanes. Suffix e = columns 0 and 2, suffix o = columns 1 and 3. */
__int64 locqwtmp0e,locqwtmp0o, locqwtmp1e, locqwtmp1o, locqwtmp2e ;
__int64 locqwtmp10e , locqwtmp10o ,locqwtmp11e ,
locqwtmp11o , locqwtmp12e , locqwtmp12o ,
locqwtmp13e , locqwtmp13o ,locqwtmp0 ,
locqwtmp1 ,locqwtmp2 ,locqwtmp3 ,
locqwz5e ,locqwz5o ,locqwz1e ,locqwz1o ,
locqwz13e ,locqwz13o ,locqwz14e ,
locqwz14o ,locqwz23e ,locqwz23o ,
locqwz24e ,locqwz24o ;
// Inline assembly to do the IDCT and store the result
__asm {
/* Copy the argument pointers into locals; esi and edi are reloaded from */
/* these locals at the top of every column iteration. */
mov esi, inptr ; load the input pointer
mov edi, quantptr ; load the quant table pointer
mov locdwinptr, esi ; to be used in the idct_column loop
mov locdwqptr, edi ; to be used in the idct_column loop
mov esi, wsptr
mov locdwcounter, 2 ; idct_column loop counter
mov locdwwsptr, esi
;; do the idct on all the columns. Do four columns per
;; iteration of the loop.
idct_column:
mov esi, locdwinptr ; get the source pointer
mov edi, locdwqptr ; get the quantzn. pointer
/* ---- Pass 1, even part: rows 2 and 6 ---- */
/* The 32-bit constants hold the multiplier in word 0 of each dword, so */
/* pmaddwd yields products for words 0 and 2 only (columns 0 and 2). */
/* Columns 1 and 3 are obtained by first shifting the source right 16 bits. */
;; fetch C2 and Q2
movq mm0, [esi+16*2] ; get C2
movq mm1, [edi+16*2] ; get Q2
movq mm2, [esi+16*6] ; get C6
pmullw mm0, mm1 ; dequantized C2 = z2
movq mm3, [edi+16*6] ; get Q6
movq mm6, const_0_7653
pmullw mm2, mm3 ; dequant. C6 = z3
movq mm7, const_1_8477
movq mm4, mm0 ; copy z2
pmaddwd mm4, mm6 ; tmp3 - z1 for columns 0 & 2
movq mm5, mm0 ; copy z2
movq mm3, mm2 ; z3 copy
psrlq mm5, 16 ; move z2 columns 1 & 3 to 0 & 2
movq mm1, const_0_54119
pmaddwd mm5, mm6 ; tmp3 - z1 for columns 1 & 3
psrlq mm3, 16 ; move z3 columns 1 & 3 to 0 & 2
paddw mm0, mm2 ; z2 + z3
pmaddwd mm2, mm7 ; tmp2 - z1 for columns 0 & 2
movq mm6, mm0 ; z2 + z3 copy
psrlq mm6, 16 ; z2 + z3 columns 1 & 3 in 0 & 2
pmaddwd mm3, mm7 ; tmp2 - z1 for columns 1 & 3
movq mm7, const_all_ones
pmaddwd mm0, mm1 ; z1 columns 0 & 2
pmaddwd mm6, mm1 ; z1 columns 1 & 3
/* The z3 products are negated per dword: xor with all ones, then add 1. */
pxor mm2, mm7 ; 1s complement of tmp2 - z1
movq mm1, const_0_1_0_1
pxor mm3, mm7 ; 1s complement of tmp2 - z1
paddd mm2, mm1 ; 2s complement of tmp2 - z1(col 0 &2)
paddd mm3, mm1 ; 2s complement of tmp2 - z1(col 1 & 3)
/* NOTE(review): following the data flow, after the four z1 additions below */
/* mm2 = tmp2 (cols 0 & 2), mm3 = tmp2 (cols 1 & 3), mm4 = tmp3 (cols 0 & 2) */
/* and mm5 = tmp3 (cols 1 & 3). The trailing labels on the mm4 and mm3 */
/* additions do not match this. */
paddd mm2, mm0 ; tmp2 (columns 0 & 2)
paddd mm4, mm0 ; tmp2 (cols. 1 & 3)
/* ---- Pass 1, even part: rows 0 and 4 ---- */
/* pmaddwd with const_1_0 (word pattern 1,0,1,0) widens words 0 and 2 to */
/* signed dwords; pslld 13 then scales the sum and difference. */
;; get C0 and Q0
movq mm0, [esi+16*0] ; get C0
paddd mm3, mm6 ; tmp3
movq mm1, [edi+16*0] ; getQ0
paddd mm5, mm6 ; tmp3
movq mm6, [esi+16*4] ; get C4
pmullw mm0, mm1 ; dequant C0 = z2
movq mm7, [edi+16*4] ; get Q4
nop
movq locqwtmp2e, mm2 ; store tmp2 even part
pmullw mm6, mm7 ; dequant C4 = z3
movq mm7, const_1_0
movq mm1, mm0 ; copy of z2
paddw mm0, mm6 ; z2+z3
nop
psubw mm1, mm6 ; z2-z3
movq mm6, mm0 ; z2+z3 copy
pmaddwd mm0, mm7 ; get 0 & 2 cols
psrlq mm6, 16 ; get the other two cols.
pmaddwd mm6, mm7 ;
movq mm2, mm1 ; copy of z2-z3
pmaddwd mm1, mm7
psrlq mm2, 16
pmaddwd mm2, mm7
pslld mm0, 13 ; tmp0 cols 0&2
movq mm7, mm4
pslld mm6, 13 ; tmp0 cols 1 & 3
/* Butterflies: tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, */
/* tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2; all spilled to locals. */
paddd mm4, mm0 ;
psubd mm0, mm7 ;
movq mm7, mm5
pslld mm2, 13
movq locqwtmp13e, mm0 ; store tmp13 cols 0&2
paddd mm5, mm6
movq mm0, locqwtmp2e
psubd mm6, mm7
movq locqwtmp10o, mm5 ; store tmp10 cols 1&3
movq mm7, mm3
movq locqwtmp13o, mm6 ; store tmp13 cols 1&3
paddd mm3, mm2
movq locqwtmp10e, mm4 ; store tmp10 cols 0&2
pslld mm1, 13
movq locqwtmp11o, mm3 ; store tmp11 cols 1,3
psubd mm2, mm7
movq mm6, [esi+16*1]
movq mm3, mm0
movq locqwtmp12o, mm2 ; store tmp12 cols. 1,3
paddd mm0, mm1
movq mm7, [edi+16*1]
movq locqwtmp11e, mm0 ; store tmp11 cols. 0,2
psubd mm1, mm3
movq mm0, [esi+16*7]
pmullw mm6, mm7 ; dequant. C1 = tmp3
movq locqwtmp12e, mm1
;; completed the even part.
;; Now start the odd part
/* ---- Pass 1, odd part: rows 1, 3, 5 and 7 ---- */
/* NOTE(review): the next load reads from edi, the quantization table, so */
/* the value fetched is Q7 rather than C7 as its trailing label says. */
movq mm1, [edi+16*7] ; get C7
movq mm2, [esi+16*5] ; get C5
pmullw mm0, mm1 ; dequant. C7 = tmp0
movq mm3, [edi+16*5]
movq mm4, [esi+16*3]
pmullw mm2, mm3 ; dequant. C5 = tmp1
movq mm5, [edi+16*3]
movq mm1, mm0
movq locqwtmp3, mm6
pmullw mm4, mm5 ; dequant. C3 = tmp2
movq locqwtmp0, mm0
/* 16-bit sums: z1 = tmp0 + tmp3, z2 = tmp1 + tmp2, z3 = tmp0 + tmp2, */
/* z4 = tmp1 + tmp3; z5 is (z3 + z4) multiplied by const_1_175. */
paddw mm0, mm6 ; z1
movq locqwtmp1, mm2
movq mm3, mm2
movq locqwtmp2, mm4
paddw mm2, mm4 ; z2
paddw mm1, mm4 ; z3
movq mm4, const_1_175
paddw mm3, mm6 ; z4
movq mm5, mm1
movq mm7, mm0
psrlq mm7, 16 ; other two cols. of z1
paddw mm5, mm3 ; z3 + z4
movq mm6, mm5
pmaddwd mm5, mm4 ; z5 cols 0 & 2
pmaddwd mm0, const_0_899 ; z1 even part
psrlq mm6, 16
pmaddwd mm6, mm4 ; z5 cols 1 & 3
movq mm4, mm2 ; z2 copy
movq locqwz5e, mm5
psrlq mm4, 16 ; get z2 cols 1 & 3
/* From here on each of the z1..z4 products is negated per dword by a pxor */
/* with const_all_ones followed by a paddd of const_0_1_0_1. */
pxor mm0, const_all_ones
movq mm5, mm1
movq locqwz5o, mm6
psrlq mm5, 16
movq mm6, const_2_562
nop
paddd mm0, const_0_1_0_1
pmaddwd mm2, mm6 ; z2 cols 0 & 2
movq locqwz1e, mm0
pmaddwd mm4, mm6 ; z2 cols 1 & 3
pmaddwd mm7, const_0_899 ; z1
movq mm0, mm3
movq mm6, const_1_961
psrlq mm0, 16
pxor mm2, const_all_ones
pmaddwd mm1, mm6 ; z3 cols 0 & 2
paddd mm2, const_0_1_0_1
pmaddwd mm5, mm6 ; z3 cols 1 & 3
movq mm6, const_0_3901
nop
pxor mm4, const_all_ones
pmaddwd mm3, mm6 ; z4 cols 0 & 2
paddd mm4, const_0_1_0_1
pmaddwd mm0, mm6 ; z4 cols 1 & 3
movq mm6, const_all_ones
nop
pxor mm1, mm6
pxor mm7, mm6
;; twos complement of z1, z2, z3, z4
paddd mm1, const_0_1_0_1
pxor mm5, mm6
paddd mm7, const_0_1_0_1
pxor mm3, mm6
paddd mm5, const_0_1_0_1
nop
movq locqwz1o, mm7
pxor mm0, mm6
/* Combine the negated terms pairwise; the sums are stored in locqwz13*, */
/* locqwz23*, locqwz24* and locqwz14* (z1+z3, z2+z3, z2+z4, z1+z4, where */
/* z3 and z4 already include z5). */
paddd mm1, locqwz5e ; z3+z5 cols 0 & 2
nop
movq mm6, locqwz1e
nop
paddd mm5, locqwz5o ; z3+z5 cols 1 & 3
paddd mm6, mm1
paddd mm3, const_0_1_0_1
paddd mm1, mm2
paddd mm0, const_0_1_0_1
paddd mm7, mm5
paddd mm3, locqwz5e ; z4+z5 cols 0 & 2
paddd mm5, mm4
/* NOTE(review): the next add uses locqwz5o, i.e. columns 1 & 3, although */
/* its trailing label says cols 0 & 2. */
paddd mm0, locqwz5o ; z4+z5 cols 0 & 2
paddd mm2, mm3
paddd mm3, locqwz1e
paddd mm4, mm0
paddd mm0, locqwz1o
movq locqwz23e, mm1
nop
movq locqwz14o, mm0
nop
/* Multiply the saved dequantized rows 7, 5, 3 and 1 (locqwtmp0..locqwtmp3) */
/* by const_0_2986, const_2_053, const_3_072 and const_1_501. */
movq mm0, locqwtmp0
nop
movq locqwz24e, mm2
movq mm1, mm0
movq mm2, const_0_2986
psrlq mm1, 16
movq locqwz14e, mm3
pmaddwd mm0, mm2 ; tmp0 even
movq mm3, locqwtmp1
pmaddwd mm1, mm2 ; tmp0 odd
movq locqwz24o, mm4
movq mm2, mm3
movq mm4, const_2_053
psrlq mm2, 16
movq locqwz23o, mm5
pmaddwd mm3, mm4 ; tmp1 even
movq mm5, locqwtmp2
pmaddwd mm2, mm4 ; tmp1 odd
movq locqwz13e, mm6
movq mm4, mm5
movq mm6, const_3_072
psrlq mm4, 16
movq locqwz13o, mm7
pmaddwd mm5, mm6 ; tmp2 even
;;;;;;; now calculate tmp0..tmp3
;; then calculate the pre-descaled values
;; this includes the right shift with rounding
movq mm7, locqwtmp3
pmaddwd mm4, mm6 ; tmp2 odd
paddd mm0, locqwz13e
movq mm6, mm7
paddd mm1, locqwz13o
psrlq mm6, 16
movq locqwtmp0e, mm0 ; tmp0 even
nop
movq mm0, const_1_501
nop
movq locqwtmp0o, mm1
pmaddwd mm7, mm0
paddd mm3, locqwz24e
pmaddwd mm6, mm0
movq mm0, locqwtmp10e
nop
paddd mm7, locqwz14e
nop
paddd mm6, locqwz14o
/* ---- Pass 1, outputs ---- */
/* Each output row is (tmp1x +/- tmpN) + const_round, shifted right */
/* arithmetically by 11. mm3 holds const_round until the last row. */
psubd mm0, mm7
movq mm1, locqwtmp10o
nop
movq locqwtmp1e, mm3
psubd mm1, mm6
movq mm3, const_round
nop
paddd mm2, locqwz24o
paddd mm0, mm3
paddd mm7, locqwtmp10e
psrad mm0, 11
movq locqwtmp1o, mm2
paddd mm1, mm3
paddd mm6, locqwtmp10o
psrad mm1, 11
paddd mm5, locqwz23e
movq mm2, mm0
paddd mm4, locqwz23o
/* Packing idiom used for every row: punpcklwd/punpckhwd interleave the */
/* even-column and odd-column dwords, then punpckldq keeps the low 16 bits */
/* of each, giving four words in column order 0,1,2,3. */
punpcklwd mm0, mm1
paddd mm6, mm3
punpckhwd mm2, mm1
paddd mm7, mm3
punpckldq mm0, mm2
;; now do all the stores of the 1D-iDCT of the four columns
mov edi, locdwwsptr ; get pointer to scratch pad array
movq [edi+16*7], mm0 ; store wsptr[7]
psrad mm6, 11
movq mm2, locqwtmp11e
psrad mm7, 11
psubd mm2, mm5
movq mm0, mm7
movq mm1, locqwtmp11o
punpcklwd mm7, mm6
psubd mm1, mm4
punpckhwd mm0, mm6
paddd mm5, locqwtmp11e
punpckldq mm7, mm0
paddd mm4, locqwtmp11o
paddd mm2, mm3
paddd mm1, mm3
paddd mm5, mm3
paddd mm4, mm3
psrad mm2, 11
movq [edi+16*0], mm7 ; store wsptr[0]
psrad mm1, 11
movq mm0, mm2
psrad mm5, 11
movq mm6, locqwtmp12e
punpcklwd mm2, mm1
punpckhwd mm0, mm1
movq mm1, mm5
movq mm7, locqwtmp12o
punpckldq mm2, mm0
movq [edi+16*6], mm2 ; store wsptr[6]
psrad mm4, 11
movq mm2, mm6
punpcklwd mm5, mm4
paddd mm6, locqwtmp1e
punpckhwd mm1, mm4
psubd mm2, locqwtmp1e
punpckldq mm5, mm1
movq [edi+16*1], mm5 ; store wsptr[1]
movq mm0, mm7
paddd mm7, locqwtmp1o
paddd mm6, mm3
psubd mm0, locqwtmp1o
paddd mm7, mm3
paddd mm2, mm3
psrad mm7, 11
paddd mm0, mm3
psrad mm6, 11
movq mm1, mm6
psrad mm2, 11
movq mm4, locqwtmp13e
punpcklwd mm6, mm7
movq mm5, mm4
punpckhwd mm1, mm7
paddd mm4, locqwtmp0e
punpckldq mm6, mm1
psubd mm5, locqwtmp0e
psrad mm0, 11
movq [edi+16*2], mm6 ; store wsptr[2]
movq mm6, mm2
paddd mm4, mm3
punpcklwd mm2, mm0
paddd mm5, mm3
punpckhwd mm6, mm0
movq mm0, locqwtmp13o
punpckldq mm2, mm6
movq mm1, mm0
psrad mm4, 11
paddd mm0, locqwtmp0o
psrad mm5, 11
paddd mm0, mm3
movq mm6, mm4
psubd mm1, locqwtmp0o
psrad mm0, 11
paddd mm1, mm3
punpcklwd mm4, mm0
/* mm3 (const_round) is overwritten below; it is reloaded at the same point */
/* of the next column iteration. */
movq mm3, mm5
punpckhwd mm6, mm0
movq [edi+16*5], mm2 ; store wsptr[5]
punpckldq mm4, mm6
psrad mm1, 11
movq [edi+16*3], mm4 ; store wsptr[3]
punpcklwd mm5, mm1
punpckhwd mm3, mm1
punpckldq mm5, mm3
/* Advance source, quant and workspace positions by 8 bytes = four 16-bit */
/* columns. */
add locdwinptr, 8 ; skip first four columns
add locdwqptr, 8
movq [edi+16*4], mm5 ; store wsptr[4]
;;;;;;; done with 1D-idct of four columns ;;;;;;;
;; now update pointers for next four columns
add locdwwsptr, 8
/* dec sets the zero flag; the mov that follows does not alter flags, so */
/* jnz tests the decremented counter. */
mov eax, locdwcounter
dec eax
mov locdwcounter, eax
jnz idct_column
;;;;;;;end of 1D-idct on the columns ;;;;;;;
/* ---- Pass 2 setup: eight rows, starting at the beginning of wsptr ---- */
mov esi, wsptr ; get start addr of temp array
mov locdwcounter, 8
mov locdwwsptr, esi
mov locdwrowctr, 0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;; start of 1D-idct on the rows ;;;;;;;
idct_row:
mov esi, locdwwsptr ; get next row start addr of temp array
mov edi, output_buf
/* One row = eight 16-bit elements e0..e7 in mm0 (e0..e3) and mm1 (e4..e7). */
/* Here both dwords of a register carry different quantities, so the */
/* constants hold a different multiplier in each dword. */
movq mm0, [esi+0] ; get first 4 elements of row
movq mm1, [esi+2*4] ; get next 4 elem. of row
movq mm2, mm0
movq mm3, mm0 ; copy of e3|e2|e1|e0
paddw mm2, mm1 ; (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
movq mm4, mm2 ; copy of (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
punpckhdq mm3, mm1 ; e7|e6|e3|e2
pmaddwd mm3, const_00_1_84_00_0_765 ; (tmp2 - z1)||(tmp3-z1)
movq mm6, mm0 ; copy of e3|e2|e1|e0
pmaddwd mm2, const_00_0_5411_00_00 ; z1||xxx
psubw mm6, mm1 ; (e3-e7)|(e2-e6)|(e1-e5)|(e0-e4)
punpckldq mm4, mm6 ; (e1-e5)|(e0-e4)|(e1+e5)|(e0+e4)
movq mm6, mm0 ;
movq mm5, mm3
/* pslld 16 followed by psrad 3 sign-extends the low word of each dword and */
/* leaves it shifted left by 13. */
pslld mm4, 16 ; (e0-e4)|(e1+e5)||(e0+e4)|x0000
pxor mm3, const_all_ones
punpckhdq mm2, mm2 ; z1||z1
paddd mm3, const_0_1_0_1
psrad mm4, 3 ; (e0-e4)<<13||(e0+e4)<<13
psrlq mm3, 32
movq mm7, mm4 ; copy of tmp1||tmp0
punpckldq mm5, mm3
movq mm3, mm0 ; e3|e2|e1|e0
paddd mm5, mm2 ; tmp2 || tmp3
paddw mm3, mm1 ; (e7+e3)|(e2+e6)|(e1+e5)|(e0+e4)
paddd mm4, mm5
psubd mm7, mm5
;; end of even part calculation ;;
;; mm0 => e3|e2|e1|e0
;; mm1 => e7|e6|e5|e4
;; mm4 => tmp11||tmp10
;; mm7 => tmp12||tmp13
/* ---- Pass 2, odd part ---- */
/* These constants carry their multipliers in words 1 and 3, so pmaddwd */
/* picks the odd-indexed words (e1, e3, e5, e7 and sums built from them). */
movq mm5, mm3
movq mm2, mm0
pmaddwd mm0, const_3_072_00_1_501_00 ; tmp2|tmp3
punpckldq mm5, mm5
paddw mm5, mm3
punpckldq mm2, mm2
pmaddwd mm5, const_1_175_00_00_00 ; z5|0
punpckhdq mm6, mm2
pmaddwd mm3, const_1_96_00_0_3901_00 ; z3|z4
paddw mm6, mm1
pmaddwd mm6, const_0_899_00_2_562_00 ; z1|z2
nop
pmaddwd mm1, const_0_2986_00_2_053_00 ; tmp0|tmp1
punpckhdq mm5, mm5
movq mm2, const_0_1_0_1
nop
/* Negate the z3|z4 and z1|z2 products (xor with all ones, then add 1). */
pxor mm3, const_all_ones
nop
pxor mm6, const_all_ones
paddd mm3, mm2
paddd mm6, mm2
paddd mm3, mm5
movq mm5, mm6
paddd mm6, mm3
movq mm2, mm5
punpckldq mm5, mm5
punpckhdq mm2, mm5
paddd mm1, mm6
paddd mm2, mm3
movq mm5, mm1
movq mm3, mm4
paddd mm0, mm2
movq mm2, mm7
punpckldq mm5, mm5
punpckhdq mm1, mm5
/* Final butterflies: sums and differences of the even-part values (mm4, */
/* mm7) with the odd-part values (mm0, mm1), each plus const_round_two. */
psubd mm3, mm0
movq mm5, const_round_two
paddd mm0, mm4
movq mm6, const_mask
psubd mm2, mm1
paddd mm0, mm5
paddd mm1, mm7
;; descale the resulting coeff values
paddd mm1, mm5
psrad mm0, 18
paddd mm3, mm5
psrad mm1, 18
paddd mm2, mm5
psrad mm3, 18
;; mask the result with RANGE_MASK (least 10 bits)
pand mm1, mm6 ; w2|w3
psrad mm2, 18
/* ---- Pass 2, output ---- */
/* Each masked dword is moved to ebx or edx and used as a byte index into */
/* range_limit (ecx). The eight looked-up bytes are assembled in eax, four */
/* at a time, through al/ah and a 16-bit left shift. */
movd ebx, mm1 ; w3
psrlq mm1, 32 ; 0|w2
;; using the results as index, get the corresponding
;; value from array range_limit and store the final result
mov ecx, range_limit ; get start addr of range_limit array
/* edi = output_buf + locdwrowctr; the row pointer is then loaded from */
/* that table slot. */
add edi, locdwrowctr
movd edx, mm1 ; w2
pand mm0, mm6 ; w1|w0
mov ah, [ecx][ebx] ; w3
mov edi, [edi]
movd ebx, mm0 ; w0
psrlq mm0, 32 ; 0|w1
mov al, [ecx][edx] ; w2
/* Step to the next row-pointer slot, 4 bytes further on. */
add locdwrowctr, 4
movd edx, mm0 ; w1
pand mm3, mm6 ; w6|w7
add edi, output_col ; this is the dest start addr for this row
shl eax, 16 ; w3|w2|0|0
mov al, [ecx][ebx] ; w0
mov ah, [ecx][edx] ; w1
movd mm4, eax ; w3|w2|w1|w0
pand mm2, mm6 ; w5|w4
movd ebx, mm3 ; w7
psrlq mm3, 32 ; 0|w6
movd edx, mm3 ; w6
mov ah, [ecx][ebx] ; w7
mov al, [ecx][edx] ; w6
movd ebx, mm2 ; w4
psrlq mm2, 32 ; 0|w5
shl eax, 16 ; w7|w6|0|0
movd edx, mm2 ; w5
mov al, [ecx][ebx] ; w4
mov ah, [ecx][edx] ; w5
movd mm5, eax ; w7|w6|w5|w4
punpckldq mm4, mm5 ; w7|w6|w5|w4|w3|w2|w1|w0
/* Next workspace row is 16 bytes (eight 16-bit values) further on. */
add locdwwsptr, 16
mov eax, locdwcounter
/* Store the eight output bytes of this row with one 64-bit write. */
movq [edi], mm4
;; update address pointer and loop counter
dec eax
mov locdwcounter, eax
jnz idct_row
;;;;;;; end of 1D-idct on all the rows ;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
/* Clear the MMX state before returning to code that may use the x87 FPU. */
emms
} //end of __asm
}
#endif /* DCT_ISLOW_SUPPORTED */