Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

637 lines
11 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /***************************************************************************
  4. *
  5. * INTEL Corporation Proprietary Information
  6. *
  7. *
  8. * Copyright (c) 1996 Intel Corporation.
  9. * All rights reserved.
  10. *
  11. ***************************************************************************
  12. AUTHOR: Kumar Balasubramanian
  13. ***************************************************************************
  14. ** Pentium version of the "integer LLM mode" within IJG decompressor code.
  15. ** The following is a non-MMX Pentium implementation of the integer slow mode
  16. ** IDCT within the IJG code.
  17. */
  18. #define JPEG_INTERNALS
  19. #include "jinclude.h"
  20. #include "jpeglib.h"
  21. #include "jdct.h" /* Private declarations for DCT subsystem */
  22. #ifdef DCT_ISLOW_SUPPORTED
  23. /*
  24. * This module is specialized to the case DCTSIZE = 8.
  25. */
  26. #if DCTSIZE != 8
  27. Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  28. #endif
  29. #if BITS_IN_JSAMPLE == 8
  30. #define CONST_BITS 13
  31. #define PASS1_BITS 2
  32. #else
  33. #define CONST_BITS 13
  34. #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
  35. #endif
       /*
        * NOTE(review): in the code visible in this file CONST_BITS and PASS1_BITS
        * are never referenced.  The assembly in pidct8x8llm uses the literal shift
        * counts 13, 11 and 18, which equal CONST_BITS, CONST_BITS - PASS1_BITS and
        * CONST_BITS + PASS1_BITS + 3 for PASS1_BITS = 2, so the #else branch
        * (PASS1_BITS 1) does not appear to be honored by the assembly -- confirm
        * before building with BITS_IN_JSAMPLE != 8.
        */
  36. /* Define the constants for the case BITS_IN_JSAMPLE = 8 */
       /*
        * Multipliers used by the imul instructions in pidct8x8llm.  Each one is the
        * decimal value in its name scaled by 2^13 (8192) and rounded, e.g.
        * 0x098E = 2446 ~= 0.2986 * 8192.
        *
        * The names do not show the sign: the five constants written as 0x0ffff....
        * are negative when read as signed 32-bit values (assumes INT32 is 32 bits
        * wide -- TODO confirm), i.e. they hold the negated multiplier.
        */
  37. static const INT32 const_0_2986 = 0x0000098E ; /* 2446 */
  38. static const INT32 const_0_3901 = 0x0fffff384; /* -3196 as a signed 32-bit value */
  39. static const INT32 const_0_54119 = 0x00001151; /* 4433 */
  40. static const INT32 const_0_7653 = 0x0000187E; /* 6270 */
  41. static const INT32 const_0_899 = 0x0ffffe333; /* -7373 as a signed 32-bit value */
  42. static const INT32 const_1_175 = 0x000025a1; /* 9633 */
  43. static const INT32 const_1_501 = 0x0000300b; /* 12299 */
  44. static const INT32 const_1_8477 = 0x0ffffc4df; /* -15137 as a signed 32-bit value */
  45. static const INT32 const_1_961 = 0x0ffffc13b; /* -16069 as a signed 32-bit value */
  46. static const INT32 const_2_053 = 0x000041b3; /* 16819 */
  47. static const INT32 const_2_562 = 0x0ffffadfd; /* -20995 as a signed 32-bit value */
  48. static const INT32 const_3_072 = 0x00006254; /* 25172 */
  49. static const INT32 const_round = 0x00000400; /* 1 << 10: added before the column-pass shift right by 11 */
  50. static const INT32 const_round_row = 0x00020000; /* 1 << 17: added before the row-pass shift right by 18 */
  51. static const INT32 const_mask = 0x000003ff; /* low 10 bits: keeps the range_limit index within 0..1023 */
  52. /*
  53. * Perform dequantization and inverse DCT on one block of coefficients.
       *
       * The work is done by the inline assembly below in two passes:
       *   1. a column pass that multiplies each coefficient by its quantization
       *      factor, runs a 1-D IDCT down each of the 8 columns and stores the
       *      eight results of every column in the workspace;
       *   2. a row pass that runs a 1-D IDCT along each of the 8 workspace rows,
       *      looks every result up in range_limit and writes 8 bytes to the
       *      corresponding output row.
       *
       * inptr        coefficient block, read as 16-bit values; rows are 16 bytes
       *              apart.
       * quantptr     quantization factors, read as 16-bit values with the same
       *              layout as inptr.
       * wsptr        scratch workspace.  Although declared as short *, it is
       *              written and read as 8 rows of 8 32-bit values (32 bytes per
       *              row), so 256 bytes starting at wsptr are touched.
       * output_buf   array of row pointers; entries 0..7 are read, 4 bytes apart.
       * output_col   offset added to each row pointer to locate the first output
       *              byte of the row.
       * range_limit  byte lookup table, indexed with
       *              ((value + const_round_row) >> 18) & const_mask.
       *
       * Each output row is read once with a 32-bit load and written with two
       * 32-bit stores at [row + output_col] and [row + output_col + 4].
       *
       * NOTE(review): the block is 32-bit x86 only (32-bit registers, 4-byte
       * pointers).  ebx, esi and edi are overwritten without an explicit
       * save/restore; presumably the compiler preserves them around the __asm
       * block -- confirm for the toolchain in use.
  54. */
  55. GLOBAL(void)
  56. pidct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
  57. JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
  58. {
       /* Scratch variables; all of them are referenced by name from the __asm block. */
  59. INT32 locdwinptr, locdwqptr, locdwwsptr, locdwtmp0, locdwtmp1 ;
  60. INT32 locdwtmp2, locdwtmp3, locdwtmp00, locdwtmp01, locdwtmp02 ;
  61. INT32 locdwtmp03, locdwtmp10, locdwtmp11, locdwtmp12 ;
  62. INT32 locdwtmp13, locdwcounter, locdwrowctr ;
  63. // Inline assembly to do the IDCT and store the result
  64. __asm {
       // Keep working copies of the three pointers; the column loop advances
       // the copies once per column.  locdwcounter counts the 8 columns.
  65. mov esi, inptr ; point to start of source
  66. mov edi, quantptr ;
  67. mov eax, wsptr
  68. mov locdwinptr, esi ; point to start of source
  69. mov locdwqptr, edi ;
  70. mov locdwwsptr, eax
  71. mov locdwcounter, 8
  72. mov eax, [esi] ; warm up the cache
  73. mov ebx, [esi+32]
  74. mov ecx, [esi+64]
  75. mov edx, [esi+96]
       // touch the quantization table the same way
  76. mov eax, [edi]
  77. mov ebx, [edi+32]
  78. mov ecx, [edi+64]
  79. mov edx, [edi+96]
  80. ;; 1D-IDCT of all the eight columns
  81. idct_column:
       // esi -> top of the current input column, edi -> matching column of
       // quantization factors; row r of the column is at byte offset 16*r.
  82. mov esi, locdwinptr ; point to start of source
  83. mov edi, locdwqptr ;
  84. ;; do the even part
       // 16-bit values are loaded into the low half of a register and then
       // sign-extended with a shl 16 / sar 16 pair before the 32-bit multiply.
  85. mov ax, [esi+16*2]
  86. mov bx, [edi+16*2]
  87. shl eax, 16 ; sign extend the i/p
  88. mov cx, [esi+16*6]
  89. sar eax, 16
  90. mov dx, [edi+16*6]
  91. shl ebx, 16 ; sign extend the quant factor
  92. sar ebx, 16
  93. imul eax, ebx ; dequantized C2 = z2
  94. shl ecx, 16
  95. sar ecx, 16
  96. shl edx, 16
  97. sar edx, 16
  98. imul ecx, edx ; dequantized C6 = z3
  99. mov ebx, eax ; copy of z2
  100. imul eax, const_0_7653
  101. add ebx, ecx ; z2 + z3
  102. imul ecx, const_1_8477
  103. imul ebx, const_0_54119 ; z1
  104. mov dx, [edi+16*4] ; quant factor for C4
  105. add ecx, ebx ; tmp2
  106. add eax, ebx ; tmp3
  107. mov locdwtmp2, ecx
  108. mov locdwtmp3, eax
  109. mov cx, [esi+16*4] ; C4
  110. mov ax, [esi+16*0] ; C0
  111. mov bx, [edi+16*0] ; quant factor for C0
       // here the sign extension is done with movsx instead of shl/sar
  112. movsx edx, dx
  113. movsx ecx, cx
  114. movsx eax, ax
  115. movsx ebx, bx
  116. imul ecx, edx ; dequantize C4 = z3
  117. imul eax, ebx ; dequantize C0 = z2
  118. mov edx, ecx ; copy of z3
  119. add ecx, eax ; z2 + z3
       // tmp0/tmp1 = (z2 +/- z3) << 13 (13 is the value CONST_BITS is defined to)
  120. shl ecx, 13 ; tmp0
  121. sub eax, edx ; z2 - z3
  122. shl eax, 13 ; tmp1
  123. mov ebx, ecx ; copy of tmp0
  124. add ecx, locdwtmp3 ; tmp10
  125. mov edx, eax ; copy of tmp1
  126. add eax, locdwtmp2 ; tmp11
       // the four even-part sums tmp10..tmp13 are parked in locdwtmp00..locdwtmp03
  127. mov locdwtmp00, ecx
  128. sub ebx, locdwtmp3 ; tmp13
  129. mov locdwtmp01, eax
  130. sub edx, locdwtmp2 ; tmp12
  131. mov locdwtmp03, ebx
  132. mov ax, [esi+16*7] ; C7 for the odd part
  133. mov locdwtmp02, edx
  134. mov bx, [edi+16*7] ; quant factor for C7
  135. ;; now do the odd part
  136. shl eax, 16
  137. mov cx, [esi+16*3]
  138. sar eax, 16
  139. mov dx, [edi+16*3]
  140. shl ebx, 16
  141. sar ebx, 16
  142. imul eax, ebx ; dequantized C7 = tmp0
  143. shl ecx, 16
  144. sar ecx, 16
  145. shl edx, 16
  146. sar edx, 16
  147. mov bx, [esi+16*1]
  148. imul ecx, edx ; dequantized C3 = tmp2
  149. shl ebx, 16
  150. mov dx, [edi+16*1]
  151. sar ebx, 16
  152. shl edx, 16
  153. sar edx, 16
  154. imul ebx, edx ; dequantized C1 = tmp3
       // locdwtmp0..locdwtmp3 keep the unscaled dequantized odd inputs
       // (C7, C5, C3, C1); locdwtmp10..locdwtmp13 receive their scaled copies.
  155. mov locdwtmp0, eax
  156. mov locdwtmp2, ecx
  157. mov ax, [esi+16*5]
  158. mov dx, [edi+16*5]
  159. shl eax, 16
  160. sar eax, 16
  161. shl edx, 16
  162. sar edx, 16
  163. imul eax, edx ; dequantized C5 = tmp1
  164. imul ecx, const_3_072 ; tmp2
  165. mov locdwtmp3, ebx
  166. mov edx, locdwtmp0
  167. imul ebx, const_1_501 ; tmp3
  168. imul edx, const_0_2986 ; tmp0
  169. mov locdwtmp1, eax ; store tmp1
  170. mov locdwtmp10, edx
  171. imul eax, const_2_053 ; tmp1
  172. mov locdwtmp11, eax
  173. mov locdwtmp12, ecx
  174. mov locdwtmp13, ebx
  175. mov eax, locdwtmp0
  176. mov ebx, locdwtmp1
  177. mov ecx, eax
  178. mov edx, ebx
  179. add eax, locdwtmp3 ; z1
  180. add ebx, locdwtmp3 ; z4
  181. add ecx, locdwtmp2 ; z3
  182. add edx, locdwtmp2 ; z2
       // esi is reused as a data register from here on; it is reloaded from
       // locdwinptr at the top of the next iteration.
  183. mov esi, ecx ; copy of z3
  184. imul eax, const_0_899 ; z1
  185. imul edx, const_2_562 ; z2
  186. add esi, ebx ; z3 + z4
  187. imul esi, const_1_175 ; z5
  188. imul ecx, const_1_961 ; z3
  189. imul ebx, const_0_3901 ; z4
  190. add ecx, esi ; z3
  191. add ebx, esi ; z4
  192. mov esi, eax ; copy of z1
  193. add eax, ecx ; z1 + z3
  194. add esi, ebx ; z1 + z4
  195. add ecx, edx ; z3 + z2
  196. add edx, ebx ; z2 + z4
  197. add eax, locdwtmp10 ; tmp0
  198. add edx, locdwtmp11 ; tmp1
  199. add ecx, locdwtmp12 ; tmp2
  200. add esi, locdwtmp13 ; tmp3
       // Combine the even and odd parts.  Every result gets const_round
       // (1 << 10) added and is shifted right arithmetically by 11 before it
       // is stored; result r of this column goes to byte offset 32*r of the
       // workspace column addressed by edi.
  201. mov ebx, locdwtmp03
  202. sub ebx, eax ; w4
  203. add eax, locdwtmp03 ; w3
  204. add ebx, const_round
  205. mov edi, locdwwsptr ; keep in mind that wsptr stores 32 bit values
  206. sar ebx, 11 ; So store/update the pointer accordingly
  207. add eax, const_round
  208. sar eax, 11
  209. mov [edi+32*4], ebx
  210. mov [edi+32*3], eax
  211. mov ebx, locdwtmp02
  212. mov eax, locdwtmp01
  213. sub ebx, edx ; w5
  214. add edx, locdwtmp02 ; w2
  215. sub eax, ecx ; w6
  216. add ecx, locdwtmp01 ; w1
  217. add ebx, const_round
  218. sar ebx, 11
  219. add eax, const_round
  220. sar eax, 11
  221. add edx, const_round
  222. add ecx, const_round
  223. mov [edi+32*5], ebx
  224. sar edx, 11
  225. mov [edi+32*6], eax
  226. sar ecx, 11
  227. mov [edi+32*2], edx
  228. mov eax, locdwtmp00
  229. mov [edi+32*1], ecx
  230. mov ebx, eax
  231. sub eax, esi ; w7
  232. add ebx, esi ; w0
  233. add eax, const_round
  234. sar eax, 11
  235. add ebx, const_round
  236. sar ebx, 11
  237. mov [edi+32*7], eax
  238. mov [edi+32*0], ebx
       // Step to the next column: 2 bytes in the input and quantization
       // tables, 4 bytes in the workspace.  The add instructions come before
       // dec, and mov does not change the flags, so jnz tests the result of
       // dec eax.
  239. mov eax, locdwcounter
  240. add locdwinptr, 2
  241. add locdwwsptr, 4 ; wsptr stores 32 bit quantities
  242. add locdwqptr, 2
  243. dec eax
  244. mov locdwcounter, eax
  245. jnz idct_column
  246. ;; End of 1D-idct of all the columns
  247. ;; get ready for the 1D-idct of the rows
       // Restart at the beginning of the workspace.  locdwrowctr is the byte
       // offset of the current row pointer inside output_buf.
  248. mov esi, wsptr
  249. mov locdwcounter, 8
  250. mov locdwrowctr, 0
  251. mov locdwwsptr, esi
  252. ;; 1D-IDCT of all the eight rows
  253. idct_row:
       // esi -> current workspace row (element c at byte offset 4*c);
       // edi = output_buf[row] + output_col.
  254. mov esi, locdwwsptr ; point to start of source
  255. mov edi, output_buf
  256. add edi, locdwrowctr
  257. mov edi, [edi]
  258. add locdwrowctr, 4
  259. add edi, output_col ; this is the dest start addr for this row
  260. ;; do the even part
       // workspace values are loaded as full 32-bit quantities, so this pass
       // has no dequantization multiply and no sign extension
  261. mov eax, [esi+4*2]
  262. mov ecx, [esi+4*6]
  263. mov ebx, eax ; copy of z2
  264. mov edx, [edi] ; warm up the cache for writing this output row
  265. imul eax, const_0_7653
  266. add ebx, ecx ; z2 + z3
  267. imul ecx, const_1_8477
  268. imul ebx, const_0_54119 ; z1
  269. add ecx, ebx ; tmp2
  270. add eax, ebx ; tmp3
  271. mov locdwtmp2, ecx
  272. mov locdwtmp3, eax
  273. mov ecx, [esi+4*4] ; C4
  274. mov eax, [esi+4*0] ; C0
  275. mov edx, ecx ; copy of z3
  276. add ecx, eax ; z2 + z3
  277. sub eax, edx ; z2 - z3
  278. shl ecx, 13 ; tmp0
  279. shl eax, 13 ; tmp1
  280. mov ebx, ecx ; copy of tmp0
  281. add ecx, locdwtmp3 ; tmp10
  282. mov edx, eax ; copy of tmp1
  283. add eax, locdwtmp2 ; tmp11
  284. mov locdwtmp00, ecx
  285. sub ebx, locdwtmp3 ; tmp13
  286. mov locdwtmp01, eax
  287. sub edx, locdwtmp2 ; tmp12
  288. mov locdwtmp03, ebx
  289. mov eax, [esi+4*7] ; C7 for the odd part
  290. mov locdwtmp02, edx
  291. ;; now do the odd part
  292. mov ecx, [esi+4*3]
  293. mov ebx, [esi+4*1]
  294. mov locdwtmp0, eax
  295. mov locdwtmp2, ecx
  296. mov eax, [esi+4*5]
  297. mov locdwtmp3, ebx
  298. imul ecx, const_3_072 ; tmp2
  299. mov edx, locdwtmp0
  300. imul ebx, const_1_501 ; tmp3
  301. imul edx, const_0_2986 ; tmp0
  302. mov locdwtmp1, eax ; store tmp1
  303. imul eax, const_2_053 ; tmp1
  304. mov locdwtmp10, edx
  305. mov locdwtmp11, eax
  306. mov locdwtmp12, ecx
  307. mov locdwtmp13, ebx
  308. mov eax, locdwtmp0
  309. mov ebx, locdwtmp1
  310. mov ecx, eax
  311. mov edx, ebx
  312. add eax, locdwtmp3 ; z1
  313. add edx, locdwtmp2 ; z2
  314. add ebx, locdwtmp3 ; z4
  315. add ecx, locdwtmp2 ; z3
  316. mov esi, ecx ; copy of z3
  317. imul eax, const_0_899 ; z1
  318. imul edx, const_2_562 ; z2
  319. add esi, ebx ; z3 + z4
  320. imul esi, const_1_175 ; z5
  321. imul ecx, const_1_961 ; z3
  322. imul ebx, const_0_3901 ; z4
  323. add ecx, esi ; z3
  324. add ebx, esi ; z4
  325. mov esi, eax ; copy of z1
  326. add eax, ecx ; z1 + z3
  327. add esi, ebx ; z1 + z4
  328. add ecx, edx ; z3 + z2
  329. add edx, ebx ; z2 + z4
  330. add eax, locdwtmp10 ; tmp0
  331. add edx, locdwtmp11 ; tmp1
  332. add ecx, locdwtmp12 ; tmp2
  333. add esi, locdwtmp13 ; tmp3
       // Spill the four odd-part results: esi is needed for range_limit and
       // the other registers are reused to build the output bytes.
  334. mov locdwtmp0, eax
  335. mov locdwtmp1, edx
  336. mov locdwtmp2, ecx
  337. mov locdwtmp3, esi
       // Each output is (even +/- odd + const_round_row) >> 18, masked with
       // const_mask and then looked up in range_limit.  eax collects out3,
       // out2, out1, out0 and edx collects out7, out6, out5, out4: a byte is
       // loaded into al/dl and the register is shifted left by 8 before the
       // next one, so out0 and out4 end up in the lowest byte.  The stale
       // upper bits of eax/edx are shifted out by the three shl 8.
  338. mov ebx, locdwtmp03
  339. add ebx, locdwtmp0 ; out3
  340. mov ecx, locdwtmp00
  341. sub ecx, locdwtmp3 ; out7
  342. add ebx, const_round_row
  343. sar ebx, 18
  344. add ecx, const_round_row
  345. sar ecx, 18
  346. mov esi, range_limit
  347. and ebx, const_mask
  348. and ecx, const_mask
  349. mov al, [esi][ebx]
  350. mov dl, [esi][ecx]
  351. mov ebx, locdwtmp02
  352. mov ecx, locdwtmp01
  353. add ebx, locdwtmp1 ; out2
  354. sub ecx, locdwtmp2 ; out6
  355. shl eax, 8 ; get ready to receive next output byte
  356. add ebx, const_round_row
  357. shl edx, 8 ; get ready to receive next output byte
  358. add ecx, const_round_row
  359. sar ebx, 18
  360. sar ecx, 18
  361. and ebx, const_mask
  362. and ecx, const_mask
  363. mov al, [esi][ebx]
  364. mov dl, [esi][ecx]
  365. mov ebx, locdwtmp01
  366. mov ecx, locdwtmp02
  367. add ebx, locdwtmp2 ; out1
  368. shl eax, 8 ; get ready to receive next output byte
  369. sub ecx, locdwtmp1 ; out5
  370. shl edx, 8 ; get ready to receive next output byte
  371. add ebx, const_round_row
  372. sar ebx, 18
  373. add ecx, const_round_row
  374. sar ecx, 18
  375. and ebx, const_mask
  376. and ecx, const_mask
  377. mov al, [esi][ebx] ; out1
  378. mov dl, [esi][ecx] ; out5
  379. mov ebx, locdwtmp00
  380. mov ecx, locdwtmp03
  381. add ebx, locdwtmp3 ; out0
  382. shl eax, 8 ; get ready to receive next output byte
  383. sub ecx, locdwtmp0 ; out4
  384. shl edx, 8 ; get ready to receive next output byte
  385. add ebx, const_round_row
  386. sar ebx, 18
  387. add ecx, const_round_row
  388. sar ecx, 18
  389. and ebx, const_mask
  390. and ecx, const_mask
  391. mov al, [esi][ebx] ; out0
  392. mov dl, [esi][ecx] ; out4
       // x86 stores the lowest byte first, so memory receives out0..out3
       // followed by out4..out7
  393. mov [edi], eax ; store the first four bytes
  394. mov [edi+4], edx ; store the next four bytes of this row
       // advance to the next workspace row (8 values * 4 bytes); as in the
       // column loop, jnz tests the result of dec eax
  395. mov eax, locdwcounter
  396. add locdwwsptr, 32 ; wsptr stores 32 bit quantities
  397. dec eax
  398. mov locdwcounter, eax
  399. jnz idct_row
  400. } //end of __asm
  401. }
  402. #endif /* DCT_ISLOW_SUPPORTED */