Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

604 lines
14 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /***************************************************************************
  4. *
  5. * INTEL Corporation Proprietary Information
  6. *
  7. *
  8. * Copyright (c) 1996 Intel Corporation.
  9. * All rights reserved.
  10. *
  11. ***************************************************************************
  12. */
  13. /*
  14. * jidctfst.c
  15. *
  16. * Copyright (C) 1994-1996, Thomas G. Lane.
  17. * This file is part of the Independent JPEG Group's software.
  18. * For conditions of distribution and use, see the accompanying README file.
  19. *
  20. * This file contains a fast, not so accurate integer implementation of the
  21. * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
  22. * must also perform dequantization of the input coefficients.
  23. *
  24. * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
  25. * on each row (or vice versa, but it's more convenient to emit a row at
  26. * a time). Direct algorithms are also available, but they are much more
  27. * complex and seem not to be any faster when reduced to code.
  28. *
  29. * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  30. * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
  31. * Japanese, but the algorithm is described in the Pennebaker & Mitchell
  32. * JPEG textbook (see REFERENCES section in file README). The following code
  33. * is based directly on figure 4-8 in P&M.
  34. * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  35. * possible to arrange the computation so that many of the multiplies are
  36. * simple scalings of the final outputs. These multiplies can then be
  37. * folded into the multiplications or divisions by the JPEG quantization
  38. * table entries. The AA&N method leaves only 5 multiplies and 29 adds
  39. * to be done in the DCT itself.
  40. * The primary disadvantage of this method is that with fixed-point math,
  41. * accuracy is lost due to imprecise representation of the scaled
  42. * quantization values. The smaller the quantization table entry, the less
  43. * precise the scaled value, so this implementation does worse with high-
  44. * quality-setting files than with low-quality ones.
  45. */
  46. #define JPEG_INTERNALS
  47. #include "jinclude.h"
  48. #include "jpeglib.h"
  49. #include "jdct.h" /* Private declarations for DCT subsystem */
  50. #ifdef DCT_IFAST_SUPPORTED
  51. /*
  52. * This module is specialized to the case DCTSIZE = 8.
  53. */
  54. #if DCTSIZE != 8
  55. Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  56. #endif
  57. /* Scaling decisions are generally the same as in the LL&M algorithm;
  58. * see jidctint.c for more details. However, we choose to descale
  59. * (right shift) multiplication products as soon as they are formed,
  60. * rather than carrying additional fractional bits into subsequent additions.
  61. * This compromises accuracy slightly, but it lets us save a few shifts.
  62. * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
  63. * everywhere except in the multiplications proper; this saves a good deal
  64. * of work on 16-bit-int machines.
  65. *
  66. * The dequantized coefficients are not integers because the AA&N scaling
  67. * factors have been incorporated. We represent them scaled up by PASS1_BITS,
  68. * so that the first and second IDCT rounds have the same input scaling.
  69. * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
  70. * avoid a descaling shift; this compromises accuracy rather drastically
  71. * for small quantization table entries, but it saves a lot of shifts.
  72. * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
  73. * so we use a much larger scaling factor to preserve accuracy.
  74. *
  75. * A final compromise is to represent the multiplicative constants to only
  76. * 8 fractional bits, rather than 13. This saves some shifting work on some
  77. * machines, and may also reduce the cost of multiplication (since there
  78. * are fewer one-bits in the constants).
  79. */
  80. #if BITS_IN_JSAMPLE == 8
  81. #define CONST_BITS 8 /* fractional bits in the FIX_* multipliers below */
  82. #define PASS1_BITS 2 /* scaling carried between the two IDCT passes (see comment above) */
  83. #else
  84. #define CONST_BITS 8 /* same value as the 8-bit-sample case */
  85. #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
  86. #endif
  87. /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
  88. * causing a lot of useless floating-point operations at run time.
  89. * To get around this we use the following pre-calculated constants.
  90. * If you change CONST_BITS you may want to add appropriate values.
  91. * (With a reasonable C compiler, you can just rely on the FIX() macro...)
       *
       * Each pre-calculated value is the constant multiplied by
       * 2^CONST_BITS (= 256) and rounded to the nearest integer,
       * e.g. 1.414213562 * 256 = 362.04 -> 362.
  92. */
  93. #if CONST_BITS == 8
  94. #define FIX_1_082392200 ((INT32) 277) /* FIX(1.082392200) */
  95. #define FIX_1_414213562 ((INT32) 362) /* FIX(1.414213562) */
  96. #define FIX_1_847759065 ((INT32) 473) /* FIX(1.847759065) */
  97. #define FIX_2_613125930 ((INT32) 669) /* FIX(2.613125930) */
  98. #else /* FIX() is not defined in this listing; presumably supplied by an included header -- TODO confirm */
  99. #define FIX_1_082392200 FIX(1.082392200)
  100. #define FIX_1_414213562 FIX(1.414213562)
  101. #define FIX_1_847759065 FIX(1.847759065)
  102. #define FIX_2_613125930 FIX(2.613125930)
  103. #endif
  104. /* We can gain a little more speed, with a further compromise in accuracy,
  105. * by omitting the addition in a descaling shift. This yields an incorrectly
  106. * rounded result half the time...
       *
       * Unless USE_ACCURATE_ROUNDING is defined, any existing DESCALE is
       * replaced below by a plain RIGHT_SHIFT with no rounding term.
  107. */
  108. #ifndef USE_ACCURATE_ROUNDING
  109. #undef DESCALE
  110. #define DESCALE(x,n) RIGHT_SHIFT(x, n)
  111. #endif
       /* Rounding form of DESCALE; disabled, kept for reference. */
  112. //#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
  113. /* Multiply a DCTELEM variable by an INT32 constant, and immediately
  114. * descale to yield a DCTELEM result.
       *
       * NOTE: the description above matches the commented-out variant only.
       * The active definition multiplies and casts to DCTELEM but applies
       * no DESCALE, so no right shift by CONST_BITS is performed.
  115. */
  116. //#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
  117. #define MULTIPLY(var,const) ((DCTELEM) ((var) * (const)))
  118. /* Dequantize a coefficient by multiplying it by the multiplier-table
  119. * entry; produce a DCTELEM result. For 8-bit data a 16x16->16
  120. * multiplication will do. For 12-bit data, the multiplier table is
  121. * declared INT32, so a 32-bit multiply will be used.
       *
       * NOTE: the active 8-bit definition omits the IFAST_MULT_TYPE cast of
       * the commented-out variant, so the product has whatever type the
       * operands promote to.  The non-8-bit definition descales the product
       * by IFAST_SCALE_BITS-PASS1_BITS.
  122. */
  123. #if BITS_IN_JSAMPLE == 8
  124. //#define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval))
  125. #define DEQUANTIZE(coef,quantval) (((coef)) * (quantval))
  126. #else
  127. #define DEQUANTIZE(coef,quantval) \
  128. DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
  129. #endif
  130. /* Like DESCALE, but applies to a DCTELEM and produces an int.
  131. * We assume that int right shift is unsigned if INT32 right shift is.
       *
       * When RIGHT_SHIFT_IS_UNSIGNED is defined, IRIGHT_SHIFT emulates a
       * sign-propagating shift: for a negative x it shifts and then ORs
       * one-bits into the top `shft` bit positions of a DCTELEMBITS-wide
       * value.  ISHIFT_TEMPS declares the temporary that form needs.
       * Otherwise ISHIFT_TEMPS expands to nothing and IRIGHT_SHIFT is a
       * plain `>>`.
  132. */
  133. #ifdef RIGHT_SHIFT_IS_UNSIGNED
  134. #define ISHIFT_TEMPS DCTELEM ishift_temp;
  135. #if BITS_IN_JSAMPLE == 8
  136. #define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */
  137. #else
  138. #define DCTELEMBITS 32 /* DCTELEM must be 32 bits */
  139. #endif
  140. #define IRIGHT_SHIFT(x,shft) \
  141. ((ishift_temp = (x)) < 0 ? \
  142. (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
  143. (ishift_temp >> (shft)))
  144. #else
  145. #define ISHIFT_TEMPS
  146. #define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
  147. #endif
       /* IDESCALE: shift right by n and cast to int.  With
        * USE_ACCURATE_ROUNDING, 1 << (n-1) is added before the shift so the
        * result is rounded rather than having the low bits simply dropped.
        */
  148. #ifdef USE_ACCURATE_ROUNDING
  149. #define IDESCALE(x,n) ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
  150. #else
  151. #define IDESCALE(x,n) ((int) IRIGHT_SHIFT(x, n))
  152. #endif
  /* Multiplier constants read by the inline assembly in pidct8x8aan via
   * "imul reg, dword ptr <name>"; each product is then shifted right by 8.
   * The stored values equal the CONST_BITS == 8 FIX_* constants:
   *   0x0000016a =  362  ( FIX_1_414213562)
   *   0xfffffd63 = -669 as a 32-bit two's-complement value (-FIX_2_613125930)
   *   0x00000115 =  277  ( FIX_1_082392200)
   *   0x000001d9 =  473  ( FIX_1_847759065)
   * NOTE(review): the identifiers read like packed 16-bit patterns
   * (0x5a82 repeated four times, etc.) and do not match the stored values;
   * presumably carried over from another variant of this routine -- confirm.
   * NOTE(review): the dword accesses assume `long` is 32 bits on the target.
   */
  153. static const long x5a825a825a825a82 = 0x0000016a ;
  154. static const long x539f539f539f539f = 0xfffffd63 ;
  155. static const long x4546454645464546 = 0x00000115 ;
  156. static const long x61f861f861f861f8 = 0x000001d9 ;
  157. /*
  158. * Perform dequantization and inverse DCT on one block of coefficients.
       *
       * The whole body is a Microsoft-style __asm block using 32-bit x86
       * registers; pointers are kept in INT32 locals, so this only works
       * where a pointer fits in 32 bits.
       *
       * Parameters, as the assembly actually uses them:
       *   coef_block   read as 16-bit words; the word for row r, column c
       *                is at byte offset 16*r + 2*c.
       *   quantptr     16-bit multipliers, same layout as coef_block.
       *   wsptr        caller-supplied workspace; 64 16-bit words are
       *                written in pass 1 and read back in pass 2.
       *   output_buf   array of 4-byte row pointers; entries 0..7 are used.
       *   output_col   byte offset added to each row pointer.
       *   range_limit  byte lookup table, indexed with a value masked to
       *                10 bits (0..0x3ff).
       *
       * Pass 1 (idct_column): for each of the 8 columns, multiply each
       *   coefficient by its quantptr entry (16-bit imul), run the 1-D
       *   IDCT, and store the 8 results down the same column of wsptr.
       * Pass 2 (idct_row): for each of the 8 rows of wsptr, run the same
       *   1-D IDCT without dequantization, shift each result right by 5
       *   (arithmetic), mask to 10 bits, look it up in range_limit and
       *   store the byte to output_buf[row][output_col + 0..7].
       *
       * Additions and subtractions are done in 16-bit registers/locals.
       * The four constant multiplies sign-extend to 32 bits, imul by a
       * 32-bit constant, then "sar 8"; only the low 16 bits are used after.
       *
       * NOTE(review): ebx, esi and edi are modified and not saved here;
       * presumably the compiler preserves them around the __asm block --
       * confirm for the toolchain in use.
  159. */
  160. GLOBAL(void)
  161. pidct8x8aan (JCOEFPTR coef_block, short * wsptr, short * quantptr,
  162. JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
  163. {
       /* locdw*: per-iteration cursors into coef_block, quantptr and wsptr.
        * locwctr: byte offset into output_buf.  locwcounter: loop count.
        * locwtmp0/1/3 and scratch1: even-part results tmp0, tmp1, tmp3, tmp2.
        * scratch2: z12.  scratch3: tmp4.
        */
  164. INT32 locdwinptr, locdwqptr, locdwwsptr, locwctr ;
  165. short locwcounter, locwtmp0, locwtmp1 ;
  166. short locwtmp3, scratch1, scratch2, scratch3 ;
  167. // do the 2-D idct and store the corresponding results
  168. // from the range_limit array
  169. // pidct(coef_block, quantptr, wsptr, output_buf, output_col, range_limit) ;
  170. __asm {
       // Copy the pointer arguments into locals.  Both loops reload esi/edi
       // from these locals every iteration because the registers are also
       // used as scratch further down.
  171. mov esi, coef_block ; source coeff
  172. mov edi, quantptr ; quant pointer
  173. mov locdwinptr, esi
  174. mov eax, wsptr ; temp storage pointer
  175. mov locdwqptr, edi
  176. mov locdwwsptr, eax
  177. mov locwcounter, 8
  178. ;; perform the 1D-idct on each of the eight columns
  179. idct_column:
  180. mov esi, locdwinptr
  181. mov edi, locdwqptr
       // Even part: rows 0, 4, 2, 6 of this column (row stride is 16 bytes).
       // The 16-bit imul keeps only the low 16 bits of each product.
  182. mov ax, word ptr [esi+16*0]
  183. mov bx, word ptr [esi+16*4]
  184. imul ax, word ptr [edi+16*0]
  185. mov cx, word ptr [esi+16*2]
  186. imul bx, word ptr [edi+16*4]
  187. mov dx, word ptr [esi+16*6]
  188. imul cx, word ptr [edi+16*2]
  189. imul dx, word ptr [edi+16*6]
  190. ;;;; at this point C0, C2, C4 and C6 have been dequantized
  191. mov scratch1, ax
  192. add ax, bx ; tmp10 in ax
  193. sub scratch1, bx ; tmp11
  194. mov bx, cx
  195. add cx, dx ; tmp13 in cx
  196. sub bx, dx ; tmp1 - tmp3 in bx
  197. mov dx, ax
       // Multiply by 362 in 32 bits, then sar 8 below: x * 362/256.
  198. movsx ebx, bx ; sign extend bx: get ready to do imul
  199. add ax, cx ; tmp0 in ax
  200. imul ebx, dword ptr x5a825a825a825a82
  201. sub dx, cx ; tmp3 in dx
  202. mov locwtmp0, ax
  203. mov locwtmp3, dx
  204. sar ebx, 8 ; bx now has (tmp1-tmp3)*1.414
  205. mov ax, scratch1 ; copy of tmp11
  206. sub bx, cx ; tmp12 in bx
  207. add ax, bx ; tmp1 in ax
  208. sub scratch1, bx ; tmp2
  209. mov locwtmp1, ax
  210. ;;;;;completed computing/storing the even part;;;;;;;;;;
       // Odd part: rows 1, 7, 3, 5, dequantized the same way.
  211. mov ax, [esi+16*1] ; get C1
  212. imul ax, [edi+16*1]
  213. mov bx, [esi+16*7] ; get C7
  214. mov cx, [esi+16*3]
  215. imul bx, [edi+16*7]
  216. mov dx, [esi+16*5]
  217. imul cx, [edi+16*3]
  218. imul dx, [edi+16*5]
  219. mov scratch2, ax
  220. add ax, bx ; z11 in ax
  221. sub scratch2, bx ; z12
  222. mov bx, dx ; copy of deQ C5
  223. add dx, cx ; z13 in dx
  224. sub bx, cx ; z10 in bx
  225. mov cx, ax ; copy of z11
  226. add ax, dx ; tmp7 in ax
  227. sub cx, dx ; partial tmp11
  228. movsx ecx, cx
  229. mov dx, bx ; copy of z10
  230. add bx, scratch2 ; partial z5
  231. imul ecx, dword ptr x5a825a825a825a82
  232. movsx edx, dx ; sign extend z10: get ready for imul
  233. movsx ebx, bx ; sign extend partial z5 for imul
  234. imul edx, dword ptr x539f539f539f539f ; partial tmp12
  235. imul ebx, dword ptr x61f861f861f861f8 ; partial z5 product
       // edi (quant pointer) is reused as a fourth multiply register from
       // here on; it is reloaded from locdwqptr at the top of the loop.
  236. mov di, scratch2
  237. movsx edi, di ; sign extend z12: get ready for imul
  238. sar ecx, 8 ; tmp11 in cx
  239. sar ebx, 8 ; z5 in bx
  240. imul edi, dword ptr x4546454645464546
  241. sar edx, 8
  242. sar edi, 8
  243. sub di, bx ; tmp10
  244. add dx, bx ; tmp12 in dx
  245. sub dx, ax ; tmp6 in dx
  246. sub cx, dx ; tmp5 in cx
  247. add di, cx ; tmp4
  248. mov scratch3, di
  249. ;;; completed calculating the odd part ;;;;;;;;;;;
       // Combine even and odd halves and store the column to the workspace.
       // si is used as scratch; esi is reloaded at the top of the loop.
  250. mov edi, dword ptr locdwwsptr ; get address of temp. destn
  251. mov si, ax ; copy of tmp7
  252. mov bx, locwtmp0 ; get tmp0
  253. add ax, locwtmp0 ; wsptr[0]
  254. sub bx, si ; wsptr[7]
  255. mov word ptr [edi+16*0], ax
  256. mov word ptr [edi+16*7], bx
  257. mov ax, dx ; copy of tmp6
  258. mov bx, locwtmp1
  259. add dx, bx ; wsptr[1]
  260. sub bx, ax ; wsptr[6]
  261. mov word ptr [edi+16*1], dx
  262. mov word ptr [edi+16*6], bx
  263. mov dx, cx ; copy of tmp5
  264. mov bx, scratch1
  265. add cx, bx ; wsptr[2]
  266. sub bx, dx ; wsptr[5]
  267. mov word ptr [edi+16*2], cx
  268. mov word ptr [edi+16*5], bx
  269. mov cx, scratch3 ; copy of tmp4
  270. mov ax, locwtmp3
  271. add scratch3, ax ; wsptr[4]
  272. sub ax, cx ; wsptr[3]
  273. mov bx, scratch3
  274. mov word ptr [edi+16*4], bx
  275. mov word ptr [edi+16*3], ax
  276. ;;;;; completed storing 1D idct of one column ;;;;;;;;
  277. ;; update inptr, qptr and wsptr for next column
       // Advance all three cursors by one 16-bit word (next column).
  278. add locdwinptr, 2
  279. add locdwqptr, 2
  280. add locdwwsptr, 2
       // mov does not modify flags, so the jnz below tests the zero flag
       // left by "dec ax".
  281. mov ax, locwcounter ; get loop count
  282. dec ax ; another loop done
  283. mov locwcounter, ax
  284. jnz idct_column
  285. ;;;;;;; end of 1D idct on all columns ;;;;;;;
  286. ;;;;;;; temp result is stored in wsptr ;;;;;;;
  287. ;;;;;;; perform 1D-idct on each row and store final result
  288. mov esi, wsptr ; initialize source ptr to original wsptr
  289. mov locwctr, 0
  290. mov locwcounter, 8
  291. mov locdwwsptr, esi
  292. idct_row:
       // locwctr is a byte offset into output_buf; it advances by 4 per
       // row, i.e. one 4-byte row pointer.
  293. mov edi, output_buf
  294. mov esi, locdwwsptr
  295. add edi, locwctr
  296. mov edi, [edi] ; get output_buf[ctr]
  297. add edi, output_col ; now edi is pointing to the resp. row
  298. add locwctr, 4
  299. ;; get even coeffs. and do the even part
       // No dequantization in this pass: inputs are the pass-1 results,
       // 8 consecutive 16-bit words per row.
  300. mov ax, word ptr [esi+2*0]
  301. mov bx, word ptr [esi+2*4]
  302. mov cx, word ptr [esi+2*2]
  303. mov dx, word ptr [esi+2*6]
  304. mov scratch1, ax
  305. add ax, bx ; tmp10 in ax
  306. sub scratch1, bx ; tmp11
  307. mov bx, cx
  308. add cx, dx ; tmp13 in cx
  309. sub bx, dx ; tmp1 - tmp3 in bx
  310. mov dx, ax
  311. movsx ebx, bx ; sign extend bx: get ready to do imul
  312. add ax, cx ; tmp0 in ax
  313. imul ebx, dword ptr x5a825a825a825a82
  314. sub dx, cx ; tmp3 in dx
  315. mov locwtmp0, ax
  316. mov locwtmp3, dx
  317. sar ebx, 8 ; bx now has (tmp1-tmp3)*1.414
  318. mov ax, scratch1 ; copy of tmp11
  319. sub bx, cx ; tmp12 in bx
  320. add ax, bx ; tmp1 in ax
  321. sub scratch1, bx ; tmp2
  322. mov locwtmp1, ax
  323. ;;;;;completed computing/storing the even part;;;;;;;;;;
  324. mov ax, [esi+2*1] ; get C1
  325. mov bx, [esi+2*7] ; get C7
  326. mov cx, [esi+2*3]
  327. mov dx, [esi+2*5]
  328. mov scratch2, ax
  329. add ax, bx ; z11 in ax
  330. sub scratch2, bx ; z12
  331. mov bx, dx ; copy of deQ C5
  332. add dx, cx ; z13 in dx
  333. sub bx, cx ; z10 in bx
  334. mov cx, ax ; copy of z11
  335. add ax, dx ; tmp7 in ax
  336. sub cx, dx ; partial tmp11
  337. movsx ecx, cx
  338. mov dx, bx ; copy of z10
  339. add bx, scratch2 ; partial z5
  340. imul ecx, dword ptr x5a825a825a825a82
  341. movsx edx, dx ; sign extend z10: get ready for imul
  342. movsx ebx, bx ; sign extend partial z5 for imul
  343. imul edx, dword ptr x539f539f539f539f ; partial tmp12
  344. imul ebx, dword ptr x61f861f861f861f8 ; partial z5 product
       // All workspace loads for this row are done, so esi is reused for
       // the z12 multiply; it is later loaded with range_limit.
  345. mov si, scratch2
  346. movsx esi, si ; sign extend z12: get ready for imul
  347. sar ecx, 8 ; tmp11 in cx
  348. sar ebx, 8 ; z5 in bx
  349. imul esi, dword ptr x4546454645464546
  350. sar edx, 8
  351. sar esi, 8
  352. sub si, bx ; tmp10
  353. add dx, bx ; tmp12 in dx
  354. sub dx, ax ; tmp6 in dx
  355. sub cx, dx ; tmp5 in cx
  356. add si, cx ; tmp4
  357. mov scratch3, si
  358. ;;; completed calculating the odd part ;;;;;;;;;;;
  359. mov si, ax ; copy of tmp7
  360. mov bx, locwtmp0 ; get tmp0
  361. add ax, locwtmp0 ; wsptr[0]
  362. sub bx, si ; wsptr[7]
  363. mov esi, range_limit ; initialize esi to range_limit pointer
       // Final descale: arithmetic shift of the 16-bit value by 5, then a
       // 32-bit AND with 3ffh.  The AND also clears whatever is in the
       // upper half of the register, giving a clean 10-bit table index.
       // The same shift/mask/lookup pattern repeats for each output pair.
  364. sar ax, 5
  365. sar bx, 5
  366. and eax, 3ffh
  367. and ebx, 3ffh
  368. mov al, byte ptr [esi][eax]
  369. mov bl, byte ptr [esi][ebx]
  370. mov byte ptr [edi+0], al
  371. mov byte ptr [edi+7], bl
  372. mov ax, dx ; copy of tmp6
  373. mov bx, locwtmp1
  374. add dx, bx ; wsptr[1]
  375. sub bx, ax ; wsptr[6]
  376. sar dx, 5
  377. sar bx, 5
  378. and edx, 3ffh
  379. and ebx, 3ffh
  380. mov dl, byte ptr [esi][edx]
  381. mov bl, byte ptr [esi][ebx]
  382. mov byte ptr [edi+1], dl
  383. mov byte ptr [edi+6], bl
  384. mov dx, cx ; copy of tmp5
  385. mov bx, scratch1
  386. add cx, bx ; wsptr[2]
  387. sub bx, dx ; wsptr[5]
  388. sar cx, 5
  389. sar bx, 5
  390. and ecx, 3ffh
  391. and ebx, 3ffh
  392. mov cl, byte ptr [esi][ecx]
  393. mov bl, byte ptr [esi][ebx]
  394. mov byte ptr [edi+2], cl
  395. mov byte ptr [edi+5], bl
  396. mov cx, scratch3 ; copy of tmp4
  397. mov ax, locwtmp3
  398. add scratch3, ax ; wsptr[4]
  399. sub ax, cx ; wsptr[3]
  400. sar scratch3, 5
  401. sar ax, 5
  402. mov cx, scratch3
  403. and ecx, 3ffh
  404. and eax, 3ffh
  405. mov bl, byte ptr [esi][ecx]
  406. mov al, byte ptr [esi][eax]
  407. mov byte ptr [edi+4], bl
  408. mov byte ptr [edi+3], al
  409. ;;;;; completed storing 1D idct of one row ;;;;;;;;
  410. ;; update the source pointer (wsptr) for next row
       // 16 bytes = 8 words = one workspace row.
  411. add locdwwsptr, 16
       // As in the column loop, jnz consumes the flags set by "dec ax".
  412. mov ax, locwcounter ; get loop count
  413. dec ax ; another loop done
  414. mov locwcounter, ax
  415. jnz idct_row
  416. ;; end of 1D idct on all rows
  417. ;; final result is stored in outptr
  418. } /* end of __asm */
  419. }
  420. #endif /* DCT_IFAST_SUPPORTED */