Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

884 lines
15 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /***************************************************************************
  4. *
  5. * INTEL Corporation Proprietary Information
  6. *
  7. *
  8. * Copyright (c) 1996 Intel Corporation.
  9. * All rights reserved.
  10. *
  11. ***************************************************************************
  12. AUTHOR: Kumar Balasubramanian
  13. ***************************************************************************
  14. ** MMX version of the "integer LLM mode" within IJG decompressor code.
  15. ** The following is an MMX implementation of the integer slow mode
  16. ** IDCT within the IJG code.
  17. */
  18. #define JPEG_INTERNALS
  19. #include "jinclude.h"
  20. #include "jpeglib.h"
  21. #include "jdct.h" /* Private declarations for DCT subsystem */
  22. #ifdef DCT_ISLOW_SUPPORTED
  23. /*
  24. * This module is specialized to the case DCTSIZE = 8.
  25. */
  26. #if DCTSIZE != 8
  27. Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  28. #endif
  /*
   * Fixed-point configuration.
   * NOTE(review): the inline assembly in midct8x8llm does not reference
   * CONST_BITS or PASS1_BITS; it uses the literal shift counts 13, 11 and 18
   * and the rounding constants below, which correspond to CONST_BITS = 13 and
   * PASS1_BITS = 2.  The PASS1_BITS = 1 branch therefore has no effect on the
   * assembly -- confirm this file is only built with BITS_IN_JSAMPLE == 8.
   */
  29. #if BITS_IN_JSAMPLE == 8
  30. #define CONST_BITS 13
  31. #define PASS1_BITS 2
  32. #else
  33. #define CONST_BITS 13
  34. #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
  35. #endif
  36. /* Define the constants for the case BITS_IN_JSAMPLE = 8 */
  /*
   * Column-pass multipliers, each scaled by 2^13 and rounded
   * (e.g. 0x098E = 2446 ~= 0.2986 * 8192).  Every quadword carries the
   * multiplier in 16-bit words 0 and 2 and zero in words 1 and 3, so a
   * pmaddwd against four packed 16-bit inputs produces 32-bit products for
   * inputs 0 and 2 only.  The code obtains inputs 1 and 3 by first shifting
   * a copy of the input right by 16 bits.
   */
  37. static const __int64 const_0_2986 = 0x0000098E0000098E ;
  38. static const __int64 const_0_3901 = 0x00000c7c00000c7c;
  39. static const __int64 const_0_54119 = 0x0000115100001151;
  40. static const __int64 const_0_7653 = 0x0000187E0000187E;
  41. static const __int64 const_0_899 = 0x00001ccd00001ccd;
  42. static const __int64 const_1_175 = 0x000025a1000025a1;
  43. static const __int64 const_1_501 = 0x0000300b0000300b;
  44. static const __int64 const_1_8477 = 0x00003b2100003b21;
  45. static const __int64 const_1_961 = 0x00003ec500003ec5 ;
  46. static const __int64 const_2_053 = 0x000041b3000041b3 ;
  47. static const __int64 const_2_562 = 0x0000520300005203 ;
  48. static const __int64 const_3_072 = 0x0000625400006254 ;
  /*
   * Negation helpers: a value is XORed with const_all_ones and then
   * const_0_1_0_1 is added to each 32-bit half (two's complement).
   * The const_all_ones literal is written with a leading 0 digit; its value
   * is 64 one-bits.
   */
  49. static const __int64 const_all_ones = 0x0ffffffffffffffff;
  50. static const __int64 const_0_1_0_1 = 0x0000000100000001 ;
  51. static const __int64 const_zero = 0x0000000000000000;
  /*
   * const_1_0 has the same bit pattern as const_0_1_0_1.  It is used as a
   * pmaddwd operand, which multiplies words 0 and 2 by 1 and so widens them
   * to sign-extended 32-bit values.
   */
  52. static const __int64 const_1_0 = 0x0000000100000001 ;
  /*
   * const_round     = 1 << 10 per 32-bit half, added before "psrad 11".
   * const_round_two = 1 << 17 per 32-bit half, added before "psrad 18".
   * const_mask      = 0x3ff per 32-bit half, ANDed with each final value
   *                   before it is used as an index into range_limit.
   */
  53. static const __int64 const_round = 0x0000040000000400;
  54. static const __int64 const_round_two = 0x0002000000020000;
  55. static const __int64 const_mask = 0x000003ff000003ff;
  /*
   * Row-pass multipliers (used only after the idct_row label).  These pack
   * two different 2^13-scaled multipliers, or one multiplier and zero, into
   * chosen 16-bit word positions of one quadword.  The "_00" parts of each
   * name mark the zero words, listed from the most significant word down.
   */
  56. static const __int64 const_00_1_84_00_0_765 = 0x00003b210000187E;
  57. static const __int64 const_00_0_5411_00_00 = 0x0000115100000000;
  58. static const __int64 const_3_072_00_1_501_00 = 0x62540000300b0000;
  59. static const __int64 const_0_2986_00_2_053_00 = 0x098E000041b30000;
  60. static const __int64 const_0_899_00_2_562_00 = 0x1ccd000052030000;
  61. static const __int64 const_1_96_00_0_3901_00 = 0x3ec500000c7c0000;
  62. static const __int64 const_1_175_00_00_00 = 0x25a1000000000000;
  63. /*
  64. * Perform dequantization and inverse DCT on one block of coefficients.
  65. */
  /*
   * midct8x8llm -- dequantize one 8x8 block of coefficients, run a two-pass
   * (columns, then rows) inverse DCT with MMX instructions, and store the
   * eight resulting rows of samples through output_buf.
   *
   * inptr        coefficient block, read as 8 rows of 8 16-bit words
   *              (row k starts at byte offset 16*k)
   * quantptr     multiplier table with the same layout; each coefficient is
   *              multiplied (pmullw) by the entry at the same position
   * wsptr        scratch area, 8 rows of 8 16-bit words; the column pass
   *              writes all of it and the row pass reads it back
   * output_buf   array of row pointers; entries 0..7 are read, stepping
   *              4 bytes per entry
   * output_col   byte offset added to each row pointer
   * range_limit  byte table indexed with (descaled result & 0x3ff); the byte
   *              found there is the output sample
   *
   * Each output row is written as 8 bytes with a single movq.
   *
   * This is 32-bit MSVC inline assembly.  It uses eax, ebx, ecx, edx, esi,
   * edi and mm0..mm7, and ends with emms.
   *
   * NOTE(review): shift counts (13, 11, 18) and rounding constants are
   * literals here; the CONST_BITS / PASS1_BITS macros are not referenced.
   */
  66. GLOBAL(void)
  67. midct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
  68. JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
  69. {
  /* Loop state kept in memory: current input / quant / workspace addresses,
     the loop counter, and the byte offset of the current row pointer. */
  70. INT32 locdwinptr, locdwqptr, locdwwsptr, locdwcounter, locdwrowctr ;
  /* 64-bit spill slots for the column pass.  A trailing "e" holds the values
     for columns 0 & 2, a trailing "o" the values for columns 1 & 3. */
  71. __int64 locqwtmp0e,locqwtmp0o, locqwtmp1e, locqwtmp1o, locqwtmp2e ;
  72. __int64 locqwtmp10e , locqwtmp10o ,locqwtmp11e ,
  73. locqwtmp11o , locqwtmp12e , locqwtmp12o ,
  74. locqwtmp13e , locqwtmp13o ,locqwtmp0 ,
  75. locqwtmp1 ,locqwtmp2 ,locqwtmp3 ,
  76. locqwz5e ,locqwz5o ,locqwz1e ,locqwz1o ,
  77. locqwz13e ,locqwz13o ,locqwz14e ,
  78. locqwz14o ,locqwz23e ,locqwz23o ,
  79. locqwz24e ,locqwz24o ;
  80. // Inline assembly to do the IDCT and store the result
  81. __asm {
  82. mov esi, inptr ; load the input pointer
  83. mov edi, quantptr ; load the quant table pointer
  84. mov locdwinptr, esi ; to be used in the idct_column loop
  85. mov locdwqptr, edi ; to be used in the idct_column loop
  86. mov esi, wsptr
  87. mov locdwcounter, 2 ; idct_column loop counter
  88. mov locdwwsptr, esi
  89. ;; do the idct on all the columns. Do four columns per
  90. ;; iteration of the loop.
  91. idct_column:
  92. mov esi, locdwinptr ; get the source pointer
  93. mov edi, locdwqptr ; get the quantzn. pointer
  /* Even part: rows 2 and 6 first, then rows 0 and 4.  Each movq loads the
     four 16-bit entries of one row that belong to the current four columns. */
  94. ;; fetch C2 and Q2
  95. movq mm0, [esi+16*2] ; get C2
  96. movq mm1, [edi+16*2] ; get Q2
  97. movq mm2, [esi+16*6] ; get C6
  98. pmullw mm0, mm1 ; dequantized C2 = z2
  99. movq mm3, [edi+16*6] ; get Q6
  100. movq mm6, const_0_7653
  101. pmullw mm2, mm3 ; dequant. C6 = z3
  102. movq mm7, const_1_8477
  103. movq mm4, mm0 ; copy z2
  104. pmaddwd mm4, mm6 ; tmp3 - z1 for columns 0 & 2
  105. movq mm5, mm0 ; copy z2
  106. movq mm3, mm2 ; z3 copy
  107. psrlq mm5, 16 ; move z2 columns 1 & 3 to 0 & 2
  108. movq mm1, const_0_54119
  109. pmaddwd mm5, mm6 ; tmp3 - z1 for columns 1 & 3
  110. psrlq mm3, 16 ; move z3 columns 1 & 3 to 0 & 2
  111. paddw mm0, mm2 ; z2 + z3
  112. pmaddwd mm2, mm7 ; tmp2 - z1 for columns 0 & 2
  113. movq mm6, mm0 ; z2 + z3 copy
  114. psrlq mm6, 16 ; z2 + z3 columns 1 & 3 in 0 & 2
  115. pmaddwd mm3, mm7 ; tmp2 - z1 for columns 1 & 3
  116. movq mm7, const_all_ones
  117. pmaddwd mm0, mm1 ; z1 columns 0 & 2
  118. pmaddwd mm6, mm1 ; z1 columns 1 & 3
  119. pxor mm2, mm7 ; 1s complement of tmp2 - z1
  120. movq mm1, const_0_1_0_1
  121. pxor mm3, mm7 ; 1s complement of tmp2 - z1
  122. paddd mm2, mm1 ; 2s complement of tmp2 - z1(col 0 &2)
  123. paddd mm3, mm1 ; 2s complement of tmp2 - z1(col 1 & 3)
  124. paddd mm2, mm0 ; tmp2 (columns 0 & 2)
  /* NOTE: the trailing comment on the next instruction is misleading.  mm4
     holds "tmp3 - z1" for columns 0 & 2 and mm0 holds z1 for columns 0 & 2,
     so the sum is tmp3 for columns 0 & 2. */
  125. paddd mm4, mm0 ; tmp2 (cols. 1 & 3)
  126. ;; get C0 and Q0
  127. movq mm0, [esi+16*0] ; get C0
  /* NOTE: mm3 holds the negated z3 product for columns 1 & 3 and mm6 holds
     z1 for columns 1 & 3, so the next sum is tmp2 for columns 1 & 3 (the
     trailing comment says tmp3). */
  128. paddd mm3, mm6 ; tmp3
  129. movq mm1, [edi+16*0] ; getQ0
  130. paddd mm5, mm6 ; tmp3
  131. movq mm6, [esi+16*4] ; get C4
  132. pmullw mm0, mm1 ; dequant C0 = z2
  133. movq mm7, [edi+16*4] ; get Q4
  134. nop
  135. movq locqwtmp2e, mm2 ; store tmp2 even part
  136. pmullw mm6, mm7 ; dequant C4 = z3
  137. movq mm7, const_1_0
  138. movq mm1, mm0 ; copy of z2
  139. paddw mm0, mm6 ; z2+z3
  140. nop
  141. psubw mm1, mm6 ; z2-z3
  142. movq mm6, mm0 ; z2+z3 copy
  /* pmaddwd with const_1_0 multiplies words 0 and 2 by 1, widening them to
     sign-extended 32-bit values; the following pslld 13 applies the scale. */
  143. pmaddwd mm0, mm7 ; get 0 & 2 cols
  144. psrlq mm6, 16 ; get the other two cols.
  145. pmaddwd mm6, mm7 ;
  146. movq mm2, mm1 ; copy of z2-z3
  147. pmaddwd mm1, mm7
  148. psrlq mm2, 16
  149. pmaddwd mm2, mm7
  150. pslld mm0, 13 ; tmp0 cols 0&2
  151. movq mm7, mm4
  152. pslld mm6, 13 ; tmp0 cols 1 & 3
  153. paddd mm4, mm0 ;
  154. psubd mm0, mm7 ;
  155. movq mm7, mm5
  156. pslld mm2, 13
  157. movq locqwtmp13e, mm0 ; store tmp13 cols 0&2
  158. paddd mm5, mm6
  159. movq mm0, locqwtmp2e
  160. psubd mm6, mm7
  161. movq locqwtmp10o, mm5 ; store tmp10 cols 1&3
  162. movq mm7, mm3
  163. movq locqwtmp13o, mm6 ; store tmp13 cols 1&3
  164. paddd mm3, mm2
  165. movq locqwtmp10e, mm4 ; store tmp10 cols 0&2
  166. pslld mm1, 13
  167. movq locqwtmp11o, mm3 ; store tmp11 cols 1,3
  168. psubd mm2, mm7
  169. movq mm6, [esi+16*1]
  170. movq mm3, mm0
  171. movq locqwtmp12o, mm2 ; store tmp12 cols. 1,3
  172. paddd mm0, mm1
  173. movq mm7, [edi+16*1]
  174. movq locqwtmp11e, mm0 ; store tmp11 cols. 0,2
  175. psubd mm1, mm3
  176. movq mm0, [esi+16*7]
  177. pmullw mm6, mm7 ; dequant. C1 = tmp3
  178. movq locqwtmp12e, mm1
  179. ;; completed the even part.
  180. ;; Now start the odd part
  /* NOTE: edi is the quant pointer, so the next load fetches Q7 (the trailing
     comment says C7).  C7 was loaded from [esi+16*7] a few lines earlier. */
  181. movq mm1, [edi+16*7] ; get C7
  182. movq mm2, [esi+16*5] ; get C5
  183. pmullw mm0, mm1 ; dequant. C7 = tmp0
  184. movq mm3, [edi+16*5]
  185. movq mm4, [esi+16*3]
  186. pmullw mm2, mm3 ; dequant. C5 = tmp1
  187. movq mm5, [edi+16*3]
  188. movq mm1, mm0
  189. movq locqwtmp3, mm6
  190. pmullw mm4, mm5 ; dequant. C3 = tmp2
  191. movq locqwtmp0, mm0
  192. paddw mm0, mm6 ; z1
  193. movq locqwtmp1, mm2
  194. movq mm3, mm2
  195. movq locqwtmp2, mm4
  196. paddw mm2, mm4 ; z2
  197. paddw mm1, mm4 ; z3
  198. movq mm4, const_1_175
  199. paddw mm3, mm6 ; z4
  200. movq mm5, mm1
  201. movq mm7, mm0
  202. psrlq mm7, 16 ; other two cols. of z1
  203. paddw mm5, mm3 ; z3 + z4
  204. movq mm6, mm5
  205. pmaddwd mm5, mm4 ; z5 cols 0 & 2
  206. pmaddwd mm0, const_0_899 ; z1 even part
  207. psrlq mm6, 16
  208. pmaddwd mm6, mm4 ; z5 cols 1 & 3
  209. movq mm4, mm2 ; z2 copy
  210. movq locqwz5e, mm5
  211. psrlq mm4, 16 ; get z2 cols 1 & 3
  212. pxor mm0, const_all_ones
  213. movq mm5, mm1
  214. movq locqwz5o, mm6
  215. psrlq mm5, 16
  216. movq mm6, const_2_562
  217. nop
  218. paddd mm0, const_0_1_0_1
  219. pmaddwd mm2, mm6 ; z2 cols 0 & 2
  220. movq locqwz1e, mm0
  221. pmaddwd mm4, mm6 ; z2 cols 1 & 3
  222. pmaddwd mm7, const_0_899 ; z1
  223. movq mm0, mm3
  224. movq mm6, const_1_961
  225. psrlq mm0, 16
  226. pxor mm2, const_all_ones
  227. pmaddwd mm1, mm6 ; z3 cols 0 & 2
  228. paddd mm2, const_0_1_0_1
  229. pmaddwd mm5, mm6 ; z3 cols 1 & 3
  230. movq mm6, const_0_3901
  231. nop
  232. pxor mm4, const_all_ones
  233. pmaddwd mm3, mm6 ; z4 cols 0 & 2
  234. paddd mm4, const_0_1_0_1
  235. pmaddwd mm0, mm6 ; z4 cols 1 & 3
  236. movq mm6, const_all_ones
  237. nop
  238. pxor mm1, mm6
  239. pxor mm7, mm6
  240. ;; twos complement of z1, z2, z3, z4
  241. paddd mm1, const_0_1_0_1
  242. pxor mm5, mm6
  243. paddd mm7, const_0_1_0_1
  244. pxor mm3, mm6
  245. paddd mm5, const_0_1_0_1
  246. nop
  247. movq locqwz1o, mm7
  248. pxor mm0, mm6
  249. paddd mm1, locqwz5e ; z3+z5 cols 0 & 2
  250. nop
  251. movq mm6, locqwz1e
  252. nop
  253. paddd mm5, locqwz5o ; z3+z5 cols 1 & 3
  254. paddd mm6, mm1
  255. paddd mm3, const_0_1_0_1
  256. paddd mm1, mm2
  257. paddd mm0, const_0_1_0_1
  258. paddd mm7, mm5
  259. paddd mm3, locqwz5e ; z4+z5 cols 0 & 2
  260. paddd mm5, mm4
  /* NOTE: locqwz5o is the columns 1 & 3 value of z5, so the next sum is
     z4+z5 for columns 1 & 3 (the trailing comment says 0 & 2). */
  261. paddd mm0, locqwz5o ; z4+z5 cols 0 & 2
  262. paddd mm2, mm3
  263. paddd mm3, locqwz1e
  264. paddd mm4, mm0
  265. paddd mm0, locqwz1o
  266. movq locqwz23e, mm1
  267. nop
  268. movq locqwz14o, mm0
  269. nop
  270. movq mm0, locqwtmp0
  271. nop
  272. movq locqwz24e, mm2
  273. movq mm1, mm0
  274. movq mm2, const_0_2986
  275. psrlq mm1, 16
  276. movq locqwz14e, mm3
  277. pmaddwd mm0, mm2 ; tmp0 even
  278. movq mm3, locqwtmp1
  279. pmaddwd mm1, mm2 ; tmp0 odd
  280. movq locqwz24o, mm4
  281. movq mm2, mm3
  282. movq mm4, const_2_053
  283. psrlq mm2, 16
  284. movq locqwz23o, mm5
  285. pmaddwd mm3, mm4 ; tmp1 even
  286. movq mm5, locqwtmp2
  287. pmaddwd mm2, mm4 ; tmp1 odd
  288. movq locqwz13e, mm6
  289. movq mm4, mm5
  290. movq mm6, const_3_072
  291. psrlq mm4, 16
  292. movq locqwz13o, mm7
  293. pmaddwd mm5, mm6 ; tmp2 even
  294. ;;;;;;; now calculate tmp0..tmp3
  295. ;; then calculate the pre-descaled values
  296. ;; this includes the right shift with rounding
  297. movq mm7, locqwtmp3
  298. pmaddwd mm4, mm6 ; tmp2 odd
  299. paddd mm0, locqwz13e
  300. movq mm6, mm7
  301. paddd mm1, locqwz13o
  302. psrlq mm6, 16
  303. movq locqwtmp0e, mm0 ; tmp0 even
  304. nop
  305. movq mm0, const_1_501
  306. nop
  307. movq locqwtmp0o, mm1
  308. pmaddwd mm7, mm0
  309. paddd mm3, locqwz24e
  310. pmaddwd mm6, mm0
  311. movq mm0, locqwtmp10e
  312. nop
  313. paddd mm7, locqwz14e
  314. nop
  315. paddd mm6, locqwz14o
  316. psubd mm0, mm7
  317. movq mm1, locqwtmp10o
  318. nop
  319. movq locqwtmp1e, mm3
  320. psubd mm1, mm6
  /* mm3 now holds the rounding constant (1 << 10 in each 32-bit half) and
     keeps it until the last "paddd ..., mm3" of this iteration.  Every
     result below is rounded (paddd mm3), shifted right arithmetically by 11,
     and then packed: the punpcklwd / punpckhwd / punpckldq triple keeps the
     low 16 bits of each 32-bit value and orders them column 0, 1, 2, 3. */
  321. movq mm3, const_round
  322. nop
  323. paddd mm2, locqwz24o
  324. paddd mm0, mm3
  325. paddd mm7, locqwtmp10e
  326. psrad mm0, 11
  327. movq locqwtmp1o, mm2
  328. paddd mm1, mm3
  329. paddd mm6, locqwtmp10o
  330. psrad mm1, 11
  331. paddd mm5, locqwz23e
  332. movq mm2, mm0
  333. paddd mm4, locqwz23o
  334. punpcklwd mm0, mm1
  335. paddd mm6, mm3
  336. punpckhwd mm2, mm1
  337. paddd mm7, mm3
  338. punpckldq mm0, mm2
  339. ;; now do all the stores of the 1D-iDCT of the four columns
  340. mov edi, locdwwsptr ; get pointer to scratch pad array
  341. movq [edi+16*7], mm0 ; store wsptr[7]
  342. psrad mm6, 11
  343. movq mm2, locqwtmp11e
  344. psrad mm7, 11
  345. psubd mm2, mm5
  346. movq mm0, mm7
  347. movq mm1, locqwtmp11o
  348. punpcklwd mm7, mm6
  349. psubd mm1, mm4
  350. punpckhwd mm0, mm6
  351. paddd mm5, locqwtmp11e
  352. punpckldq mm7, mm0
  353. paddd mm4, locqwtmp11o
  354. paddd mm2, mm3
  355. paddd mm1, mm3
  356. paddd mm5, mm3
  357. paddd mm4, mm3
  358. psrad mm2, 11
  359. movq [edi+16*0], mm7 ; store wsptr[0]
  360. psrad mm1, 11
  361. movq mm0, mm2
  362. psrad mm5, 11
  363. movq mm6, locqwtmp12e
  364. punpcklwd mm2, mm1
  365. punpckhwd mm0, mm1
  366. movq mm1, mm5
  367. movq mm7, locqwtmp12o
  368. punpckldq mm2, mm0
  369. movq [edi+16*6], mm2 ; store wsptr[6]
  370. psrad mm4, 11
  371. movq mm2, mm6
  372. punpcklwd mm5, mm4
  373. paddd mm6, locqwtmp1e
  374. punpckhwd mm1, mm4
  375. psubd mm2, locqwtmp1e
  376. punpckldq mm5, mm1
  377. movq [edi+16*1], mm5 ; store wsptr[1]
  378. movq mm0, mm7
  379. paddd mm7, locqwtmp1o
  380. paddd mm6, mm3
  381. psubd mm0, locqwtmp1o
  382. paddd mm7, mm3
  383. paddd mm2, mm3
  384. psrad mm7, 11
  385. paddd mm0, mm3
  386. psrad mm6, 11
  387. movq mm1, mm6
  388. psrad mm2, 11
  389. movq mm4, locqwtmp13e
  390. punpcklwd mm6, mm7
  391. movq mm5, mm4
  392. punpckhwd mm1, mm7
  393. paddd mm4, locqwtmp0e
  394. punpckldq mm6, mm1
  395. psubd mm5, locqwtmp0e
  396. psrad mm0, 11
  397. movq [edi+16*2], mm6 ; store wsptr[2]
  398. movq mm6, mm2
  399. paddd mm4, mm3
  400. punpcklwd mm2, mm0
  401. paddd mm5, mm3
  402. punpckhwd mm6, mm0
  403. movq mm0, locqwtmp13o
  404. punpckldq mm2, mm6
  405. movq mm1, mm0
  406. psrad mm4, 11
  407. paddd mm0, locqwtmp0o
  408. psrad mm5, 11
  409. paddd mm0, mm3
  410. movq mm6, mm4
  411. psubd mm1, locqwtmp0o
  412. psrad mm0, 11
  413. paddd mm1, mm3
  414. punpcklwd mm4, mm0
  415. movq mm3, mm5
  416. punpckhwd mm6, mm0
  417. movq [edi+16*5], mm2 ; store wsptr[5]
  418. punpckldq mm4, mm6
  419. psrad mm1, 11
  420. movq [edi+16*3], mm4 ; store wsptr[3]
  421. punpcklwd mm5, mm1
  422. punpckhwd mm3, mm1
  423. punpckldq mm5, mm3
  /* Advance all three pointers by 8 bytes, i.e. four 16-bit columns. */
  424. add locdwinptr, 8 ; skip first four columns
  425. add locdwqptr, 8
  426. movq [edi+16*4], mm5 ; store wsptr[4]
  427. ;;;;;;; done with 1D-idct of four columns ;;;;;;;
  428. ;; now update pointers for next four columns
  429. add locdwwsptr, 8
  /* The mov between dec and jnz does not modify flags, so jnz tests the
     result of the dec. */
  430. mov eax, locdwcounter
  431. dec eax
  432. mov locdwcounter, eax
  433. jnz idct_column
  434. ;;;;;;;end of 1D-idct on the columns ;;;;;;;
  /* Row pass setup: eight iterations, one workspace row (16 bytes) each.
     locdwrowctr is the byte offset of the current entry in output_buf. */
  435. mov esi, wsptr ; get start addr of temp array
  436. mov locdwcounter, 8
  437. mov locdwwsptr, esi
  438. mov locdwrowctr, 0
  439. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  440. ;;;;;;; start of 1D-idct on the rows ;;;;;;;
  441. idct_row:
  442. mov esi, locdwwsptr ; get next row start addr of temp array
  443. mov edi, output_buf
  444. movq mm0, [esi+0] ; get first 4 elements of row
  445. movq mm1, [esi+2*4] ; get next 4 elem. of row
  446. movq mm2, mm0
  447. movq mm3, mm0 ; copy of e3|e2|e1|e0
  448. paddw mm2, mm1 ; (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
  449. movq mm4, mm2 ; copy of (e3+e7)|(e2+e6)|(e1+e5)|(e0+e4)
  450. punpckhdq mm3, mm1 ; e7|e6|e3|e2
  451. pmaddwd mm3, const_00_1_84_00_0_765 ; (tmp2 - z1)||(tmp3-z1)
  452. movq mm6, mm0 ; copy of e3|e2|e1|e0
  453. pmaddwd mm2, const_00_0_5411_00_00 ; z1||xxx
  454. psubw mm6, mm1 ; (e3-e7)|(e2-e6)|(e1-e5)|(e0-e4)
  455. punpckldq mm4, mm6 ; (e1-e5)|(e0-e4)|(e1+e5)|(e0+e4)
  456. movq mm6, mm0 ;
  457. movq mm5, mm3
  /* pslld 16 followed by psrad 3 below leaves the low word of each 32-bit
     half sign-extended and shifted left by 13. */
  458. pslld mm4, 16 ; (e0-e4)|(e1+e5)||(e0+e4)|x0000
  459. pxor mm3, const_all_ones
  460. punpckhdq mm2, mm2 ; z1||z1
  461. paddd mm3, const_0_1_0_1
  462. psrad mm4, 3 ; (e0-e4)<<13||(e0+e4)<<13
  463. psrlq mm3, 32
  464. movq mm7, mm4 ; copy of tmp1||tmp0
  465. punpckldq mm5, mm3
  466. movq mm3, mm0 ; e3|e2|e1|e0
  467. paddd mm5, mm2 ; tmp2 || tmp3
  468. paddw mm3, mm1 ; (e7+e3)|(e2+e6)|(e1+e5)|(e0+e4)
  469. paddd mm4, mm5
  470. psubd mm7, mm5
  471. ;; end of even part calculation ;;
  472. ;; mm0 => e3|e2|e1|e0
  473. ;; mm1 => e7|e6|e5|e4
  474. ;; mm4 => tmp11||tmp10
  475. ;; mm7 => tmp12||tmp13
  476. movq mm5, mm3
  477. movq mm2, mm0
  478. pmaddwd mm0, const_3_072_00_1_501_00 ; tmp2|tmp3
  479. punpckldq mm5, mm5
  480. paddw mm5, mm3
  481. punpckldq mm2, mm2
  482. pmaddwd mm5, const_1_175_00_00_00 ; z5|0
  483. punpckhdq mm6, mm2
  484. pmaddwd mm3, const_1_96_00_0_3901_00 ; z3|z4
  485. paddw mm6, mm1
  486. pmaddwd mm6, const_0_899_00_2_562_00 ; z1|z2
  487. nop
  488. pmaddwd mm1, const_0_2986_00_2_053_00 ; tmp0|tmp1
  489. punpckhdq mm5, mm5
  490. movq mm2, const_0_1_0_1
  491. nop
  492. pxor mm3, const_all_ones
  493. nop
  494. pxor mm6, const_all_ones
  495. paddd mm3, mm2
  496. paddd mm6, mm2
  497. paddd mm3, mm5
  498. movq mm5, mm6
  499. paddd mm6, mm3
  500. movq mm2, mm5
  501. punpckldq mm5, mm5
  502. punpckhdq mm2, mm5
  503. paddd mm1, mm6
  504. paddd mm2, mm3
  505. movq mm5, mm1
  506. movq mm3, mm4
  507. paddd mm0, mm2
  508. movq mm2, mm7
  509. punpckldq mm5, mm5
  510. punpckhdq mm1, mm5
  511. psubd mm3, mm0
  /* mm5 = rounding constant (1 << 17 per 32-bit half) for the final
     "psrad 18"; mm6 = 0x3ff mask applied to each descaled value. */
  512. movq mm5, const_round_two
  513. paddd mm0, mm4
  514. movq mm6, const_mask
  515. psubd mm2, mm1
  516. paddd mm0, mm5
  517. paddd mm1, mm7
  518. ;; descale the resulting coeff values
  519. paddd mm1, mm5
  520. psrad mm0, 18
  521. paddd mm3, mm5
  522. psrad mm1, 18
  523. paddd mm2, mm5
  524. psrad mm3, 18
  525. ;; mask the result with RANGE_MASK (least 10 bits)
  526. pand mm1, mm6 ; w2|w3
  527. psrad mm2, 18
  528. movd ebx, mm1 ; w3
  529. psrlq mm1, 32 ; 0|w2
  530. ;; using the results as index, get the corresponding
  531. ;; value from array range_limit and store the final result
  /* Each masked value indexes range_limit as a byte table.  Bytes are
     gathered two at a time into al/ah, moved up with shl eax, 16, and two
     more are placed in al/ah to form four output bytes per 32-bit register.
     Meanwhile edi is turned into the destination address:
     edi = *(output_buf + locdwrowctr) + output_col. */
  532. mov ecx, range_limit ; get start addr of range_limit array
  533. add edi, locdwrowctr
  534. movd edx, mm1 ; w2
  535. pand mm0, mm6 ; w1|w0
  536. mov ah, [ecx][ebx] ; w3
  537. mov edi, [edi]
  538. movd ebx, mm0 ; w0
  539. psrlq mm0, 32 ; 0|w1
  540. mov al, [ecx][edx] ; w2
  /* Step to the next row pointer; the stride is 4 bytes per entry. */
  541. add locdwrowctr, 4
  542. movd edx, mm0 ; w1
  543. pand mm3, mm6 ; w6|w7
  544. add edi, output_col ; this is the dest start addr for this row
  545. shl eax, 16 ; w3|w2|0|0
  546. mov al, [ecx][ebx] ; w0
  547. mov ah, [ecx][edx] ; w1
  548. movd mm4, eax ; w3|w2|w1|w0
  549. pand mm2, mm6 ; w5|w4
  550. movd ebx, mm3 ; w7
  551. psrlq mm3, 32 ; 0|w6
  552. movd edx, mm3 ; w6
  553. mov ah, [ecx][ebx] ; w7
  554. mov al, [ecx][edx] ; w6
  555. movd ebx, mm2 ; w4
  556. psrlq mm2, 32 ; 0|w5
  557. shl eax, 16 ; w7|w6|0|0
  558. movd edx, mm2 ; w5
  559. mov al, [ecx][ebx] ; w4
  560. mov ah, [ecx][edx] ; w5
  561. movd mm5, eax ; w7|w6|w5|w4
  562. punpckldq mm4, mm5 ; w7|w6|w5|w4|w3|w2|w1|w0
  /* Advance to the next workspace row (8 words = 16 bytes). */
  563. add locdwwsptr, 16
  564. mov eax, locdwcounter
  /* Write the eight output bytes of this row in one store. */
  565. movq [edi], mm4
  566. ;; update address pointer and loop counter
  /* movq and mov leave the flags alone, so jnz tests the dec result. */
  567. dec eax
  568. mov locdwcounter, eax
  569. jnz idct_row
  570. ;;;;;;; end of 1D-idct on all the rows ;;;;;;;
  571. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  /* Clear the MMX state before returning so that later x87 floating-point
     code can use the shared register file. */
  572. emms
  573. } //end of __asm
  574. }
  575. #endif /* DCT_ISLOW_SUPPORTED */