Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1755 lines
38 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /***************************************************************************
  4. *
  5. * INTEL Corporation Proprietary Information
  6. *
  7. *
  8. * Copyright (c) 1996 Intel Corporation.
  9. * All rights reserved.
  10. *
  11. ***************************************************************************
  12. */
  13. /*
  14. * jidctfst.c
  15. *
  16. * Copyright (C) 1994-1996, Thomas G. Lane.
  17. * This file is part of the Independent JPEG Group's software.
  18. * For conditions of distribution and use, see the accompanying README file.
  19. *
  20. * This file contains a fast, not so accurate integer implementation of the
  21. * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
  22. * must also perform dequantization of the input coefficients.
  23. *
  24. * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
  25. * on each row (or vice versa, but it's more convenient to emit a row at
  26. * a time). Direct algorithms are also available, but they are much more
  27. * complex and seem not to be any faster when reduced to code.
  28. *
  29. * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  30. * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
  31. * Japanese, but the algorithm is described in the Pennebaker & Mitchell
  32. * JPEG textbook (see REFERENCES section in file README). The following code
  33. * is based directly on figure 4-8 in P&M.
  34. * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  35. * possible to arrange the computation so that many of the multiplies are
  36. * simple scalings of the final outputs. These multiplies can then be
  37. * folded into the multiplications or divisions by the JPEG quantization
  38. * table entries. The AA&N method leaves only 5 multiplies and 29 adds
  39. * to be done in the DCT itself.
  40. * The primary disadvantage of this method is that with fixed-point math,
  41. * accuracy is lost due to imprecise representation of the scaled
  42. * quantization values. The smaller the quantization table entry, the less
  43. * precise the scaled value, so this implementation does worse with high-
  44. * quality-setting files than with low-quality ones.
  45. */
  46. #define JPEG_INTERNALS
  47. #include "jinclude.h"
  48. #include "jpeglib.h"
  49. #include "jdct.h" /* Private declarations for DCT subsystem */
  50. #ifdef DCT_IFAST_SUPPORTED
  51. /*
  52. * This module is specialized to the case DCTSIZE = 8.
  53. */
  54. #if DCTSIZE != 8
  55. Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  56. #endif
  57. /* Scaling decisions are generally the same as in the LL&M algorithm;
  58. * see jidctint.c for more details. However, we choose to descale
  59. * (right shift) multiplication products as soon as they are formed,
  60. * rather than carrying additional fractional bits into subsequent additions.
  61. * This compromises accuracy slightly, but it lets us save a few shifts.
  62. * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
  63. * everywhere except in the multiplications proper; this saves a good deal
  64. * of work on 16-bit-int machines.
  65. *
  66. * The dequantized coefficients are not integers because the AA&N scaling
  67. * factors have been incorporated. We represent them scaled up by PASS1_BITS,
  68. * so that the first and second IDCT rounds have the same input scaling.
  69. * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
  70. * avoid a descaling shift; this compromises accuracy rather drastically
  71. * for small quantization table entries, but it saves a lot of shifts.
  72. * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
  73. * so we use a much larger scaling factor to preserve accuracy.
  74. *
  75. * A final compromise is to represent the multiplicative constants to only
  76. * 8 fractional bits, rather than 13. This saves some shifting work on some
  77. * machines, and may also reduce the cost of multiplication (since there
  78. * are fewer one-bits in the constants).
  79. */
  80. #if BITS_IN_JSAMPLE == 8
  81. #define CONST_BITS 8 /* fractional bits kept in the FIX_* multiplier constants */
  82. #define PASS1_BITS 2 /* scale-up carried by the dequantized coefficients (8-bit samples) */
  83. #else /* BITS_IN_JSAMPLE != 8 */
  84. #define CONST_BITS 8 /* same value as in the 8-bit branch */
  85. #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
  86. #endif /* BITS_IN_JSAMPLE == 8 */
  87. /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
  88. * causing a lot of useless floating-point operations at run time.
  89. * To get around this we use the following pre-calculated constants.
  90. * If you change CONST_BITS you may want to add appropriate values.
  91. * (With a reasonable C compiler, you can just rely on the FIX() macro...)
  92. */
  93. #if CONST_BITS == 8 /* literals below are round(value * 2^8), e.g. 1.414213562 * 256 = 362.04 -> 362 */
  94. #define FIX_1_082392200 ((INT32) 277) /* FIX(1.082392200) */
  95. #define FIX_1_414213562 ((INT32) 362) /* FIX(1.414213562) */
  96. #define FIX_1_847759065 ((INT32) 473) /* FIX(1.847759065) */
  97. #define FIX_2_613125930 ((INT32) 669) /* FIX(2.613125930) */
  98. #else /* CONST_BITS != 8: fall back to the FIX() macro, which is not defined in this file */
  99. #define FIX_1_082392200 FIX(1.082392200)
  100. #define FIX_1_414213562 FIX(1.414213562)
  101. #define FIX_1_847759065 FIX(1.847759065)
  102. #define FIX_2_613125930 FIX(2.613125930)
  103. #endif /* CONST_BITS == 8 */
  104. /* We can gain a little more speed, with a further compromise in accuracy,
  105. * by omitting the addition in a descaling shift. This yields an incorrectly
  106. * rounded result half the time...
  107. */
  108. #ifndef USE_ACCURATE_ROUNDING /* fast path: replace DESCALE with a shift that has no rounding term */
  109. #undef DESCALE
  110. #define DESCALE(x,n) RIGHT_SHIFT(x, n) /* RIGHT_SHIFT itself is not defined in this file */
  111. #endif /* !USE_ACCURATE_ROUNDING */
  112. //#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
  113. /* Multiply a DCTELEM variable by an INT32 constant, and immediately
  114. * descale to yield a DCTELEM result.
  115. */
  116. //#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
  117. #define MULTIPLY(var,const) ((DCTELEM) ((var) * (const))) /* NOTE: active form only casts the raw product; unlike the commented-out variant above it applies no DESCALE by CONST_BITS */
  118. /* Dequantize a coefficient by multiplying it by the multiplier-table
  119. * entry; produce a DCTELEM result. For 8-bit data a 16x16->16
  120. * multiplication will do. For 12-bit data, the multiplier table is
  121. * declared INT32, so a 32-bit multiply will be used.
  122. */
  123. #if BITS_IN_JSAMPLE == 8
  124. //#define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval))
  125. #define DEQUANTIZE(coef,quantval) (((coef)) * (quantval)) /* active form: plain product, without the IFAST_MULT_TYPE cast of the commented-out variant above */
  126. #else /* BITS_IN_JSAMPLE != 8: product is descaled by IFAST_SCALE_BITS-PASS1_BITS */
  127. #define DEQUANTIZE(coef,quantval) \
  128. DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
  129. #endif /* BITS_IN_JSAMPLE == 8 */
  130. /* Like DESCALE, but applies to a DCTELEM and produces an int.
  131. * We assume that int right shift is unsigned if INT32 right shift is.
  132. */
  133. #ifdef RIGHT_SHIFT_IS_UNSIGNED /* >> fills with zeros here, so IRIGHT_SHIFT ORs ones into the vacated high bits when the value is negative */
  134. #define ISHIFT_TEMPS DCTELEM ishift_temp; /* declares the temporary that IRIGHT_SHIFT assigns its argument to */
  135. #if BITS_IN_JSAMPLE == 8
  136. #define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */
  137. #else
  138. #define DCTELEMBITS 32 /* DCTELEM must be 32 bits */
  139. #endif /* DCTELEMBITS selection; used to position the sign-fill mask below */
  140. #define IRIGHT_SHIFT(x,shft) \
  141. ((ishift_temp = (x)) < 0 ? \
  142. (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
  143. (ishift_temp >> (shft)))
  144. #else /* !RIGHT_SHIFT_IS_UNSIGNED: use the native >> directly */
  145. #define ISHIFT_TEMPS /* expands to nothing: no temporary is needed */
  146. #define IRIGHT_SHIFT(x,shft) ((x) >> (shft))
  147. #endif /* RIGHT_SHIFT_IS_UNSIGNED */
  148. #ifdef USE_ACCURATE_ROUNDING /* add 2^(n-1) before shifting so the result is rounded */
  149. #define IDESCALE(x,n) ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
  150. #else /* truncating shift, no rounding term */
  151. #define IDESCALE(x,n) ((int) IRIGHT_SHIFT(x, n))
  152. #endif /* USE_ACCURATE_ROUNDING */
  153. static const __int64 x5a825a825a825a82 = 0x0000016a0000016a ; /* each 32-bit lane holds 0x016a = 362, the same value as FIX_1_414213562; NOTE(review): the identifier and the "23170" asm comments do not match the stored value and look like leftovers from a different encoding -- confirm */
  154. static const __int64 x539f539f539f539f = 0x0000fd630000fd63 ; /* each 32-bit lane holds 0xfd63 in its low word, i.e. -669 as the signed 16-bit operand pmaddwd consumes (negated FIX_2_613125930) */
  155. static const __int64 x4546454645464546 = 0x0000011500000115 ; /* each 32-bit lane holds 0x0115 = 277, the same value as FIX_1_082392200 */
  156. static const __int64 x61f861f861f861f8 = 0x000001d9000001d9 ; /* each 32-bit lane holds 0x01d9 = 473, the same value as FIX_1_847759065 */
  157. static const __int64 const_mask = 0x03ff03ff03ff03ff ; /* 0x03ff in every 16-bit word; not referenced in the visible part of midct8x8aan, which masks with an immediate 03ffh instead */
  158. static const __int64 const_zero = 0x0000000000000000 ; /* all-zero operand interleaved by punpcklwd/punpckhwd to widen 16-bit words to 32-bit lanes before pmaddwd */
  159. /*
  160. * Perform dequantization and inverse DCT on one block of coefficients.
  161. */
  162. GLOBAL(void)
  163. midct8x8aan (JCOEFPTR coef_block, short * wsptr, short * quantptr,
  164. JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
  165. {
  166. __int64 scratch3, scratch5, scratch7 ;
  167. // do the 2-Dal idct and store the corresponding results
  168. // from the range_limit array
  169. __asm {
  170. mov ebx, coef_block ; source coeff
  171. mov esi, wsptr ; temp results
  172. mov edi, quantptr ; quant factors
  173. movq mm0, [ebx+8*12] ; V12
  174. pmullw mm0, [edi+8*12]
  175. movq mm1, [ebx+8*4] ; V4
  176. pmullw mm1, [edi+8*4]
  177. movq mm3, [ebx+8*0] ; V0
  178. pmullw mm3, [edi+8*0]
  179. movq mm5, [ebx+8*8] ; V8
  180. movq mm2, mm1 ; duplicate V4
  181. pmullw mm5, [edi+8*8]
  182. psubw mm1, mm0 ; V16 (s1)
  183. movq mm7, x5a825a825a825a82 ; 23170 ->V18 (s3)
  184. ;***************************************************PackMulW
  185. movq mm6, mm1
  186. punpcklwd mm1, const_zero
  187. paddw mm2, mm0 ; V17
  188. pmaddwd mm1, mm7
  189. movq mm0, mm2 ; duplicate V17
  190. punpckhwd mm6, const_zero
  191. movq mm4, mm3 ; duplicate V0
  192. pmaddwd mm6, mm7
  193. paddw mm3, mm5 ; V19
  194. psrad mm1, 8
  195. psubw mm4, mm5 ; V20 ;mm5 free
  196. psrad mm6, 8 ; mm6 = (s1)
  197. packssdw mm1, mm6
  198. ;**********************************************************
  199. movq mm6, mm3 ; duplicate t74=t81
  200. psubw mm1, mm0 ; V21 ; mm0 free
  201. paddw mm3, mm2 ; V22
  202. movq mm5, mm1 ; duplicate V21
  203. paddw mm1, mm4 ; V23
  204. movq [esi+8*4], mm3 ; V22
  205. psubw mm4, mm5 ; V24; mm5 free
  206. movq [esi+8*12], mm1 ; V23
  207. psubw mm6, mm2 ; V25; mm2 free
  208. movq [esi+8*0], mm4 ; V24
  209. ; keep mm6 alive all along the next block
  210. movq mm7, [ebx+8*10] ; V10
  211. pmullw mm7, [edi+8*10]
  212. movq mm0, [ebx+8*6] ; V6
  213. pmullw mm0, [edi+8*6]
  214. movq mm3, mm7 ; duplicate V10
  215. movq mm5, [ebx+8*2] ; V2
  216. pmullw mm5, [edi+8*2]
  217. psubw mm7, mm0 ; V26 (s1/7)
  218. movq mm4, [ebx+8*14] ; V14
  219. pmullw mm4, [edi+8*14]
  220. paddw mm3, mm0 ; V29 ; free mm0
  221. movq mm1, x539f539f539f539f ;23170 ->V18 (scratch3)
  222. ;mm0 = s5,
  223. ;***************************************************PackMulW
  224. movq scratch7, mm7
  225. movq mm2, mm7
  226. punpcklwd mm7, const_zero
  227. movq mm0, mm5 ; duplicate V2
  228. pmaddwd mm7, mm1
  229. paddw mm5, mm4 ; V27
  230. punpckhwd mm2, const_zero
  231. psubw mm0, mm4 ;(s1) for next ; V28 ; free mm4
  232. pmaddwd mm2, mm1
  233. movq mm4, mm0
  234. punpcklwd mm0, const_zero
  235. psrad mm7, 8
  236. psrad mm2, 8 ; mm2 = scratch1
  237. movq mm1, mm4 ; duplicate V28
  238. punpckhwd mm4, const_zero
  239. packssdw mm7, mm2
  240. movq mm2, x4546454645464546 ; 23170 ->V18
  241. ;**********************************************************
  242. ;***************************************************PackMulW
  243. pmaddwd mm0, mm2
  244. pmaddwd mm4, mm2
  245. psrad mm0, 8
  246. movq mm2, x61f861f861f861f8 ; 23170 ->V18
  247. psrad mm4, 8
  248. packssdw mm0, mm4
  249. movq mm4, mm1
  250. movq mm1, scratch7
  251. ;**********************************************************
  252. movq scratch5, mm0
  253. paddw mm1, mm4 ; V32 ; free mm4
  254. ;***************************************************PackMulW
  255. movq mm0, mm1
  256. punpcklwd mm1, const_zero
  257. movq mm4, mm5 ; duplicate t90=t93
  258. pmaddwd mm1, mm2
  259. paddw mm5, mm3 ; V31
  260. punpckhwd mm0, const_zero
  261. psubw mm4, mm3 ; V30 ; free mm3
  262. movq mm3, x5a825a825a825a82 ; 23170 ->V18
  263. pmaddwd mm0, mm2
  264. psrad mm1, 8
  265. movq mm2, mm4 ; make a copy of mm4
  266. punpcklwd mm4, const_zero
  267. psrad mm0, 8
  268. pmaddwd mm4, mm3
  269. packssdw mm1, mm0
  270. ;**********************************************************
  271. ;***************************************************PackMulW
  272. punpckhwd mm2, const_zero
  273. movq mm0, scratch5
  274. pmaddwd mm2, mm3
  275. psubw mm0, mm1 ; V38
  276. paddw mm1, mm7 ; V37 ; free mm7
  277. movq mm7, [esi+8*4] ; V22
  278. psrad mm4, 8
  279. psrad mm2, 8
  280. movq mm3, mm6 ; duplicate V25
  281. packssdw mm4, mm2
  282. psubw mm1, mm5 ; V39 (mm5 still needed for next block)
  283. ;**********************************************************
  284. movq mm2, [esi+8*12] ; V23
  285. psubw mm4, mm1 ; V40
  286. paddw mm0, mm4 ; V41; free mm0
  287. psubw mm6, mm0 ; tm6
  288. paddw mm3, mm0 ; tm8; free mm1
  289. movq mm0, mm1 ; line added by Kumar
  290. movq mm1, mm7 ; duplicate V22
  291. movq [esi+8*8], mm3 ; tm8; free mm3
  292. paddw mm7, mm5 ; tm0
  293. movq [esi+8*6], mm6 ; tm6; free mm6
  294. psubw mm1, mm5 ; tm14; free mm5
  295. movq mm6, [esi+8*0] ; V24
  296. movq mm3, mm2 ; duplicate t117=t125
  297. movq [esi+8*0], mm7 ; tm0; free mm7
  298. paddw mm2, mm0 ; tm2
  299. movq [esi+8*14], mm1 ; tm14; free mm1
  300. psubw mm3, mm0 ; tm12; free mm0
  301. movq [esi+8*2], mm2 ; tm2; free mm2
  302. movq mm0, mm6 ; duplicate t119=t123
  303. movq [esi+8*12], mm3 ; tm12; free mm3
  304. paddw mm6, mm4 ; tm4
  305. movq mm1, [ebx+8*5] ; V5
  306. psubw mm0, mm4 ; tm10; free mm4
  307. pmullw mm1, [edi+8*5]
  308. movq [esi+8*4], mm6 ; tm4; free mm6
  309. movq [esi+8*10], mm0 ; tm10; free mm0
  310. ; column 1: even part
  311. ; use V5, V13, V1, V9 to produce V56..V59
  312. movq mm7, [ebx+8*13] ; V13
  313. movq mm2, mm1 ; duplicate t128=t130
  314. pmullw mm7, [edi+8*13]
  315. movq mm3, [ebx+8*1] ; V1
  316. pmullw mm3, [edi+8*1]
  317. movq mm5, [ebx+8*9] ; V9
  318. psubw mm1, mm7 ; V50
  319. pmullw mm5, [edi+8*9]
  320. paddw mm2, mm7 ; V51
  321. movq mm7, x5a825a825a825a82 ; 23170 ->V18
  322. ;***************************************************PackMulW
  323. movq mm4, mm1
  324. punpcklwd mm1, const_zero
  325. movq mm6, mm2 ; duplicate V51
  326. pmaddwd mm1, mm7
  327. punpckhwd mm4, const_zero
  328. movq mm0, [ebx+8*11] ; V11
  329. pmaddwd mm4, mm7
  330. pmullw mm0, [edi+8*11]
  331. psrad mm1, 8
  332. psrad mm4, 8
  333. packssdw mm1, mm4
  334. movq mm4, mm3 ; duplicate V1
  335. ;**********************************************************
  336. paddw mm3, mm5 ; V53
  337. psubw mm4, mm5 ; V54 ;mm5 free
  338. movq mm7, mm3 ; duplicate V53
  339. psubw mm1, mm6 ; V55 ; mm6 free
  340. movq mm6, [ebx+8*7] ; V7
  341. paddw mm3, mm2 ; V56
  342. movq mm5, mm4 ; duplicate t140=t142
  343. paddw mm4, mm1 ; V57
  344. movq [esi+8*5], mm3 ; V56
  345. psubw mm5, mm1 ; V58; mm1 free
  346. pmullw mm6, [edi+8*7]
  347. psubw mm7, mm2 ; V59; mm2 free
  348. movq [esi+8*13], mm4 ; V57
  349. movq mm3, mm0 ; duplicate V11
  350. ; keep mm7 alive all along the next block
  351. movq [esi+8*9], mm5 ; V58
  352. paddw mm0, mm6 ; V63
  353. movq mm4, [ebx+8*15] ; V15
  354. psubw mm3, mm6 ; V60 ; free mm6
  355. pmullw mm4, [edi+8*15]
  356. ; note that V15 computation has a correction step:
  357. ; this is a 'magic' constant that rebiases the results to be closer to the expected result
  358. ; this magic constant can be refined to reduce the error even more
  359. ; by doing the correction step in a later stage when the number is actually multiplied by 16
  360. movq mm1, mm3 ; duplicate V60
  361. movq mm5, [ebx+8*3] ; V3
  362. movq mm2, mm1
  363. pmullw mm5, [edi+8*3]
  364. movq scratch7, mm7
  365. movq mm6, mm5 ; duplicate V3
  366. movq mm7, x539f539f539f539f ; 23170 ->V18
  367. paddw mm5, mm4 ; V61
  368. ;***************************************************PackMulW
  369. punpcklwd mm1, const_zero
  370. psubw mm6, mm4 ; V62 ; free mm4
  371. pmaddwd mm1, mm7
  372. movq mm4, mm5 ; duplicate V61
  373. punpckhwd mm2, const_zero
  374. paddw mm5, mm0 ; V65 -> result
  375. pmaddwd mm2, mm7
  376. psubw mm4, mm0 ; V64 ; free mm0
  377. movq scratch3, mm3
  378. psrad mm1, 8
  379. movq mm3, x5a825a825a825a82 ; 23170 ->V18
  380. psrad mm2, 8
  381. packssdw mm1, mm2
  382. movq mm2, mm4
  383. ;**********************************************************
  384. ;***************************************************PackMulW
  385. punpcklwd mm4, const_zero
  386. pmaddwd mm4, mm3
  387. punpckhwd mm2, const_zero
  388. pmaddwd mm2, mm3
  389. psrad mm4, 8
  390. movq mm3, scratch3
  391. movq mm0, x61f861f861f861f8 ; 23170 ->V18
  392. paddw mm3, mm6 ; V66
  393. psrad mm2, 8
  394. movq mm7, mm3
  395. packssdw mm4, mm2
  396. movq mm2, mm5 ; duplicate V65
  397. ;**********************************************************
  398. ;***************************************************PackMulW
  399. punpcklwd mm3, const_zero
  400. pmaddwd mm3, mm0
  401. punpckhwd mm7, const_zero
  402. pmaddwd mm7, mm0
  403. movq mm0, mm6
  404. psrad mm3, 8
  405. punpcklwd mm6, const_zero
  406. psrad mm7, 8
  407. packssdw mm3, mm7
  408. ;**********************************************************
  409. movq mm7, x4546454645464546 ; 23170 ->V18
  410. ;***************************************************PackMulW
  411. punpckhwd mm0, const_zero
  412. pmaddwd mm6, mm7
  413. pmaddwd mm0, mm7
  414. psrad mm6, 8
  415. psrad mm0, 8
  416. packssdw mm6, mm0
  417. ;**********************************************************
  418. movq mm0, [esi+8*5] ; V56
  419. psubw mm6, mm3 ; V72
  420. paddw mm3, mm1 ; V71 ; free mm1
  421. psubw mm3, mm2 ; V73 ; free mm2
  422. movq mm1, mm0 ; duplicate t177=t188
  423. psubw mm4, mm3 ; V74
  424. paddw mm0, mm5 ; tm1
  425. movq mm2, [esi+8*13] ; V57
  426. paddw mm6, mm4 ; V75
  427. ;location
  428. ; 5 - V56
  429. ; 13 - V57
  430. ; 9 - V58
  431. ; X - V59, mm7
  432. ; X - V65, mm5
  433. ; X - V73, mm6
  434. ; X - V74, mm4
  435. ; X - V75, mm3
  436. ; free mm0, mm1 & mm2
  437. movq mm7, scratch7 ; tm1; free mm0
  438. psubw mm1, mm5 ; tm15; free mm5
  439. ;save the store as used directly in the transpose
  440. movq [esi+8*1], mm0 ; tm1; free mm0
  441. movq mm5, mm7 ; duplicate t182=t184
  442. movq mm0, [esi+8*9] ; V58
  443. psubw mm7, mm6 ; tm7
  444. paddw mm5, mm6 ; tm9; free mm6
  445. movq mm6, mm3
  446. movq [esi+8*7], mm7 ; tm7; free mm7
  447. movq mm3, mm2 ; duplicate V57
  448. psubw mm3, mm6 ; tm13
  449. paddw mm2, mm6 ; tm3 ; free mm6
  450. movq [esi+8*3], mm2 ; tm3; free mm2
  451. movq mm6, mm0 ; duplicate V58
  452. paddw mm0, mm4 ; tm5
  453. psubw mm6, mm4 ; tm11; free mm4
  454. movq [esi+8*5], mm0 ; tm5; free mm0
  455. movq mm0, mm5 ; copy w4---0,1,3,5,6
  456. ; transpose the bottom right quadrant(4X4) of the matrix
  457. ; --------- ---------
  458. ; | M1 | M2 | | M1'| M3'|
  459. ; --------- --> ---------
  460. ; | M3 | M4 | | M2'| M4'|
  461. ; --------- ---------
  462. punpcklwd mm5, mm6 ;
  463. punpckhwd mm0, mm6 ;---0,1,3,5,6
  464. movq mm6, [esi+8*0] ;get w0 of top left quadrant
  465. movq mm2, mm3 ;---0,1,2,3,5,6
  466. punpcklwd mm3, mm1 ;
  467. movq mm7, [esi+8*2] ;get w1 of top left quadrant
  468. punpckhwd mm2, mm1 ;---0,2,3,5,6,7
  469. movq mm4, mm5 ;---0,2,3,4,5,6,7
  470. punpckldq mm5, mm3 ; transposed w4
  471. movq [esi+8*9], mm5 ; store w4
  472. punpckhdq mm4, mm3 ; transposed w5---0,2,4,6,7
  473. movq mm3, mm0 ;---0,2,3,4,6,7
  474. punpckldq mm0, mm2 ; transposed w6
  475. movq [esi+8*11], mm4 ; store w5
  476. punpckhdq mm3, mm2 ; transposed w7---0,3,6,7
  477. movq [esi+8*13], mm0 ; store w6---3,5,6,7
  478. movq mm5, mm6 ; copy w0
  479. movq [esi+8*15], mm3 ; store w7---5,6,7
  480. punpcklwd mm6, mm7
  481. ; transpose the top left quadrant(4X4) of the matrix
  482. punpckhwd mm5, mm7 ;---5,6,7
  483. movq mm7, [esi+8*4] ; get w2 of TL quadrant
  484. movq mm4, [esi+8*6] ; get w3 of TL quadrant
  485. movq mm3, mm7 ; copy w2---3,4,5,6,7
  486. movq mm2, mm6
  487. punpcklwd mm7, mm4 ;---2,3,4,5,6,7
  488. punpckhwd mm3, mm4 ;---2,3,4,5,6,7
  489. movq mm4, mm5 ;
  490. movq mm1, mm5
  491. punpckldq mm6, mm7 ;---1,2,3,4,5,6,7
  492. movq [esi+8*0], mm6 ; store w0 of TL quadrant
  493. punpckhdq mm2, mm7 ;---1,2,3,4,5,6,7
  494. movq [esi+8*2], mm2 ; store w1 of TL quadrant
  495. punpckldq mm5, mm3 ;---1,2,3,4,5,6,7
  496. movq [esi+8*4], mm5 ; store w2 of TL quadrant
  497. punpckhdq mm1, mm3 ;---1,2,3,4,5,6,7
  498. movq [esi+8*6], mm1 ; store w3 of TL quadrant
  499. ; transpose the top right quadrant(4X4) of the matrix
  500. movq mm0, [esi+8*1] ;---0
  501. movq mm1, [esi+8*3] ;---0,1,2
  502. movq mm2, mm0
  503. movq mm3, [esi+8*5]
  504. punpcklwd mm0, mm1 ;---0,1,2,3
  505. punpckhwd mm2, mm1
  506. movq mm1, [esi+8*7] ;---0,1,2,3
  507. movq mm4, mm3
  508. punpcklwd mm3, mm1 ;---0,1,2,3,4
  509. punpckhwd mm4, mm1 ;---0,1,2,3,4
  510. movq mm1, mm0
  511. movq mm5, mm2
  512. punpckldq mm0, mm3 ;---0,1,2,3,4,5
  513. punpckhdq mm1, mm3 ;---0,1,2,3,4,5
  514. movq mm3, [esi+8*8]
  515. movq [esi+8*8], mm0
  516. punpckldq mm2, mm4 ;---1,2,3,4,5
  517. punpckhdq mm5, mm4 ;---1,2,3,4,5
  518. movq mm4, [esi+8*10]
  519. ; transpose the bottom left quadrant(4X4) of the matrix
  520. ; Also store w1,w2,w3 of top right quadrant into
  521. ; w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
  522. ; of BL is already done.
  523. movq [esi+8*10], mm1
  524. movq mm1, mm3 ;---1,2,3,4,5
  525. movq mm0, [esi+8*12]
  526. punpcklwd mm3, mm4 ;---0,1,2,3,4,5
  527. punpckhwd mm1, mm4 ;---0,1,2,3,4,5
  528. movq mm4, [esi+8*14]
  529. movq [esi+8*12], mm2
  530. movq mm2, mm0
  531. movq [esi+8*14], mm5
  532. punpcklwd mm0, mm4 ;---0,1,2,3,4
  533. punpckhwd mm2, mm4 ;---0,1,2,3,4
  534. movq mm4, mm3
  535. movq mm5, mm1
  536. punpckldq mm3, mm0 ;---0,1,2,3,4,5
  537. movq [esi+8*1], mm3
  538. punpckhdq mm4, mm0 ;---1,2,4,5
  539. movq [esi+8*3], mm4
  540. punpckldq mm1, mm2 ;---1,2,5
  541. movq [esi+8*5], mm1
  542. punpckhdq mm5, mm2 ;---5
  543. movq [esi+8*7], mm5
  544. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  545. ;;;;;;;;; 1D DCT of the rows ;;;;;;;;;;;
  546. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  547. mov esi, wsptr ; source
  548. ; column 0: even part
  549. ; use V4, V12, V0, V8 to produce V22..V25
  550. movq mm0, [esi+8*12] ; V12
  551. movq mm1, [esi+8*4] ; V4
  552. movq mm3, [esi+8*0] ; V0
  553. movq mm2, mm1 ; duplicate V4
  554. movq mm5, [esi+8*8] ; V8
  555. psubw mm1, mm0 ; V16
  556. movq mm6, x5a825a825a825a82 ; 23170 ->V18
  557. ;***************************************************PackMulW
  558. movq mm4, mm1
  559. punpcklwd mm1, const_zero
  560. paddw mm2, mm0 ; V17
  561. pmaddwd mm1, mm6
  562. movq mm0, mm2 ; duplicate V17
  563. punpckhwd mm4, const_zero
  564. pmaddwd mm4, mm6
  565. psrad mm1, 8
  566. psrad mm4, 8
  567. packssdw mm1, mm4
  568. movq mm4, mm3 ; duplicate V0
  569. ;**********************************************************
  570. paddw mm3, mm5 ; V19
  571. psubw mm4, mm5 ; V20 ;mm5 free
  572. movq mm6, mm3 ; duplicate t74=t81
  573. psubw mm1, mm0 ; V21 ; mm0 free
  574. paddw mm3, mm2 ; V22
  575. movq mm5, mm1 ; duplicate V21
  576. paddw mm1, mm4 ; V23
  577. movq [esi+8*4], mm3 ; V22
  578. psubw mm4, mm5 ; V24; mm5 free
  579. movq [esi+8*12], mm1 ; V23
  580. psubw mm6, mm2 ; V25; mm2 free
  581. movq [esi+8*0], mm4 ; V24
  582. ; keep mm6 alive all along the next block
  583. ; column 0: odd part
  584. ; use V2, V6, V10, V14 to produce V31, V39, V40, V41
  585. movq mm7, [esi+8*10] ; V10
  586. movq mm0, [esi+8*6] ; V6
  587. movq mm3, mm7 ; duplicate V10
  588. movq mm5, [esi+8*2] ; V2
  589. psubw mm7, mm0 ; V26
  590. movq mm4, [esi+8*14] ; V14
  591. paddw mm3, mm0 ; V29 ; free mm0
  592. movq mm2, x539f539f539f539f ; 23170 ->V18
  593. movq mm1, mm7 ; duplicate V26
  594. ;***************************************************PackMulW
  595. movq scratch5, mm6 ; store mm6
  596. movq mm0, mm7
  597. punpcklwd mm7, const_zero
  598. pmaddwd mm7, mm2
  599. punpckhwd mm0, const_zero
  600. pmaddwd mm0, mm2
  601. psrad mm7, 8
  602. movq mm6, x4546454645464546 ; 23170 ->V18
  603. psrad mm0, 8
  604. packssdw mm7, mm0
  605. movq mm0, mm5 ; duplicate V2
  606. ;**********************************************************
  607. paddw mm5, mm4 ; V27
  608. psubw mm0, mm4 ; V28 ; free mm4
  609. movq mm2, mm0 ; duplicate V28
  610. ;***************************************************PackMulW
  611. movq mm4, mm0
  612. punpcklwd mm0, const_zero
  613. pmaddwd mm0, mm6
  614. punpckhwd mm4, const_zero
  615. pmaddwd mm4, mm6
  616. paddw mm1, mm2 ; V32 ; free mm2
  617. movq mm2, x61f861f861f861f8 ; 23170 ->V18
  618. psrad mm0, 8
  619. psrad mm4, 8
  620. movq mm6, mm1
  621. packssdw mm0, mm4
  622. movq mm4, mm5 ; duplicate t90=t93
  623. ;**********************************************************
  624. ;***************************************************PackMulW
  625. punpcklwd mm1, const_zero
  626. paddw mm5, mm3 ; V31
  627. pmaddwd mm1, mm2
  628. psubw mm4, mm3 ; V30 ; free mm3
  629. punpckhwd mm6, const_zero
  630. pmaddwd mm6, mm2
  631. psrad mm1, 8
  632. psrad mm6, 8
  633. packssdw mm1, mm6
  634. ;**********************************************************
  635. psubw mm0, mm1 ; V38
  636. paddw mm1, mm7 ; V37 ; free mm7
  637. movq mm7, x5a825a825a825a82 ; 23170 ->V18
  638. ;***************************************************PackMulW
  639. movq mm3, mm4
  640. punpcklwd mm4, const_zero
  641. psubw mm1, mm5 ; V39 (mm5 still needed for next block)
  642. pmaddwd mm4, mm7
  643. punpckhwd mm3, const_zero
  644. movq mm6, scratch5
  645. pmaddwd mm3, mm7
  646. movq mm2, [esi+8*12] ; V23
  647. psrad mm4, 8
  648. movq mm7, [esi+8*4] ; V22
  649. psrad mm3, 8
  650. packssdw mm4, mm3
  651. movq mm3, mm6 ; duplicate V25
  652. ;**********************************************************
  653. psubw mm4, mm1 ; V40
  654. paddw mm0, mm4 ; V41; free mm0
  655. ; column 0: output butterfly
  656. psubw mm6, mm0 ; tm6
  657. paddw mm3, mm0 ; tm8; free mm1
  658. movq mm0, mm1 ; line added by Kumar
  659. movq mm1, mm7 ; duplicate V22
  660. movq [esi+8*8], mm3 ; tm8; free mm3
  661. paddw mm7, mm5 ; tm0
  662. movq [esi+8*6], mm6 ; tm6; free mm6
  663. psubw mm1, mm5 ; tm14; free mm5
  664. movq mm6, [esi+8*0] ; V24
  665. movq mm3, mm2 ; duplicate t117=t125
  666. movq [esi+8*0], mm7 ; tm0; free mm7
  667. paddw mm2, mm0 ; tm2
  668. movq [esi+8*14], mm1 ; tm14; free mm1
  669. psubw mm3, mm0 ; tm12; free mm0
  670. movq [esi+8*2], mm2 ; tm2; free mm2
  671. movq mm0, mm6 ; duplicate t119=t123
  672. movq [esi+8*12], mm3 ; tm12; free mm3
  673. paddw mm6, mm4 ; tm4
  674. movq mm1, [esi+8*5] ; V5
  675. psubw mm0, mm4 ; tm10; free mm4
  676. movq [esi+8*4], mm6 ; tm4; free mm6
  677. movq [esi+8*10], mm0 ; tm10; free mm0
  678. ; column 1: even part
  679. ; use V5, V13, V1, V9 to produce V56..V59
  680. movq mm7, [esi+8*13] ; V13
  681. movq mm2, mm1 ; duplicate t128=t130
  682. movq mm3, [esi+8*1] ; V1
  683. psubw mm1, mm7 ; V50
  684. movq mm5, [esi+8*9] ; V9
  685. paddw mm2, mm7 ; V51
  686. movq mm4, x5a825a825a825a82 ; 23170 ->V18
  687. ;***************************************************PackMulW
  688. movq mm6, mm1
  689. punpcklwd mm1, const_zero
  690. pmaddwd mm1, mm4
  691. punpckhwd mm6, const_zero
  692. pmaddwd mm6, mm4
  693. movq mm4, mm3 ; duplicate V1
  694. paddw mm3, mm5 ; V53
  695. psrad mm1, 8
  696. psubw mm4, mm5 ; V54 ;mm5 free
  697. movq mm7, mm3 ; duplicate V53
  698. psrad mm6, 8
  699. packssdw mm1, mm6
  700. movq mm6, mm2 ; duplicate V51
  701. ;**********************************************************
  702. psubw mm1, mm6 ; V55 ; mm6 free
  703. paddw mm3, mm2 ; V56
  704. movq mm5, mm4 ; duplicate t140=t142
  705. paddw mm4, mm1 ; V57
  706. movq [esi+8*5], mm3 ; V56
  707. psubw mm5, mm1 ; V58; mm1 free
  708. movq [esi+8*13], mm4 ; V57
  709. psubw mm7, mm2 ; V59; mm2 free
  710. movq [esi+8*9], mm5 ; V58
  711. ; keep mm7 alive all along the next block
  712. movq mm0, [esi+8*11] ; V11
  713. movq mm6, [esi+8*7] ; V7
  714. movq mm4, [esi+8*15] ; V15
  715. movq mm3, mm0 ; duplicate V11
  716. movq mm5, [esi+8*3] ; V3
  717. paddw mm0, mm6 ; V63
  718. ; note that V15 computation has a correction step:
  719. ; this is a 'magic' constant that rebiases the results to be closer to the expected result
  720. ; this magic constant can be refined to reduce the error even more
  721. ; by doing the correction step in a later stage when the number is actually multiplied by 16
  722. movq scratch7, mm7
  723. psubw mm3, mm6 ; V60 ; free mm6
  724. movq mm6, x539f539f539f539f ; 23170 ->V18
  725. movq mm1, mm3 ; duplicate V60
  726. ;***************************************************PackMulW
  727. movq mm7, mm1
  728. punpcklwd mm1, const_zero
  729. pmaddwd mm1, mm6
  730. punpckhwd mm7, const_zero
  731. pmaddwd mm7, mm6
  732. movq mm6, mm5 ; duplicate V3
  733. paddw mm5, mm4 ; V61
  734. psrad mm1, 8
  735. psubw mm6, mm4 ; V62 ; free mm4
  736. movq mm4, mm5 ; duplicate V61
  737. psrad mm7, 8
  738. paddw mm5, mm0 ; V65 -> result
  739. packssdw mm1, mm7
  740. psubw mm4, mm0 ; V64 ; free mm0
  741. ;**********************************************************
  742. movq mm7, x5a825a825a825a82 ; 23170 ->V18
  743. ;***************************************************PackMulW
  744. movq mm2, mm4
  745. punpcklwd mm4, const_zero
  746. paddw mm3, mm6 ; V66
  747. pmaddwd mm4, mm7
  748. punpckhwd mm2, const_zero
  749. pmaddwd mm2, mm7
  750. movq mm7, x61f861f861f861f8 ; 23170 ->V18
  751. psrad mm4, 8
  752. psrad mm2, 8
  753. packssdw mm4, mm2
  754. ;**********************************************************
  755. ;***************************************************PackMulW
  756. movq mm2, mm3
  757. punpcklwd mm3, const_zero
  758. pmaddwd mm3, mm7
  759. punpckhwd mm2, const_zero
  760. pmaddwd mm2, mm7
  761. movq mm7, x4546454645464546 ; 23170 ->V18
  762. psrad mm3, 8
  763. psrad mm2, 8
  764. packssdw mm3, mm2
  765. ;**********************************************************
  766. ;***************************************************PackMulW
  767. movq mm2, mm6
  768. punpcklwd mm6, const_zero
  769. pmaddwd mm6, mm7
  770. punpckhwd mm2, const_zero
  771. pmaddwd mm2, mm7
  772. movq mm0, [esi+8*5] ; V56
  773. psrad mm6, 8
  774. movq mm7, scratch7
  775. psrad mm2, 8
  776. packssdw mm6, mm2
  777. movq mm2, mm5 ; duplicate V65
  778. ;**********************************************************
  779. psubw mm6, mm3 ; V72
  780. paddw mm3, mm1 ; V71 ; free mm1
  781. psubw mm3, mm2 ; V73 ; free mm2
  782. movq mm1, mm0 ; duplicate t177=t188
  783. psubw mm4, mm3 ; V74
  784. paddw mm0, mm5 ; tm1
  785. movq mm2, [esi+8*13] ; V57
  786. paddw mm6, mm4 ; V75
  787. ;location
  788. ; 5 - V56
  789. ; 13 - V57
  790. ; 9 - V58
  791. ; X - V59, mm7
  792. ; X - V65, mm5
  793. ; X - V73, mm6
  794. ; X - V74, mm4
  795. ; X - V75, mm3
  796. ; free mm0, mm1 & mm2
  797. movq [esi+8*1], mm0 ; tm1; free mm0
  798. psubw mm1, mm5 ; tm15; free mm5
  799. ;save the store as used directly in the transpose
  800. movq mm5, mm7 ; duplicate t182=t184
  801. psubw mm7, mm6 ; tm7
  802. paddw mm5, mm6 ; tm9; free mm3
  803. movq mm6, mm3
  804. movq mm0, [esi+8*9] ; V58
  805. movq mm3, mm2 ; duplicate V57
  806. movq [esi+8*7], mm7 ; tm7; free mm7
  807. psubw mm3, mm6 ; tm13
  808. paddw mm2, mm6 ; tm3 ; free mm6
  809. movq mm6, mm0 ; duplicate V58
  810. movq [esi+8*3], mm2 ; tm3; free mm2
  811. paddw mm0, mm4 ; tm5
  812. psubw mm6, mm4 ; tm11; free mm4
  813. movq [esi+8*5], mm0 ; tm5; free mm0
  814. ; Final results to be stored after the transpose
  815. ; transpose the bottom right quadrant(4X4) of the matrix
  816. ; --------- ---------
  817. ; | M1 | M2 | | M1'| M3'|
  818. ; --------- --> ---------
  819. ; | M3 | M4 | | M2'| M4'|
  820. ; --------- ---------
  821. ;
  822. ; get the pointer to array "range"
  823. mov edi, range_limit
  824. ; calculate the destination address
  825. mov edx, output_buf ; get output_buf[4]
  826. mov ebx, [edx+16]
  827. add ebx, output_col ; add to output_col
  828. movq mm0, mm5 ; copy w4---0,1,3,5,6
  829. punpcklwd mm5, mm6 ;
  830. punpckhwd mm0, mm6 ;---0,1,3,5,6
  831. movq mm2, mm3 ;---0,1,2,3,5,6
  832. movq mm6, [esi+8*0] ;get w0 of top left quadrant
  833. punpcklwd mm3, mm1 ;
  834. movq mm7, [esi+8*2] ;get w1 of top left quadrant
  835. punpckhwd mm2, mm1 ;---0,2,3,5,6,7
  836. movq mm4, mm5 ;---0,2,3,4,5,6,7
  837. punpckldq mm5, mm3 ; transposed w4
  838. psrlw mm5, 5
  839. movd eax, mm5
  840. and eax, 03ffh
  841. mov al, byte ptr [edi][eax]
  842. mov byte ptr [ebx+4], al
  843. psrlq mm5, 16
  844. movd eax, mm5
  845. and eax, 03ffh
  846. mov al, byte ptr [edi][eax]
  847. mov byte ptr [ebx+5], al
  848. psrlq mm5, 16
  849. movd eax, mm5
  850. and eax, 03ffh
  851. mov al, byte ptr [edi][eax]
  852. mov byte ptr [ebx+6], al
  853. psrlq mm5, 16
  854. movd eax, mm5
  855. and eax, 03ffh
  856. mov al, byte ptr [edi][eax]
  857. mov byte ptr [ebx+7], al
  858. mov ebx, [edx+20]
  859. add ebx, output_col ; add to output_col
  860. punpckhdq mm4, mm3 ; transposed w5---0,2,4,6,7
  861. movq mm3, mm0 ;---0,2,3,4,6,7
  862. punpckldq mm0, mm2 ; transposed w6
  863. psrlw mm4, 5
  864. movd eax, mm4
  865. and eax, 03ffh
  866. mov al, byte ptr [edi][eax]
  867. mov byte ptr [ebx+4], al
  868. psrlq mm4, 16
  869. movd eax, mm4
  870. and eax, 03ffh
  871. mov al, byte ptr [edi][eax]
  872. mov byte ptr [ebx+5], al
  873. psrlq mm4, 16
  874. movd eax, mm4
  875. and eax, 03ffh
  876. mov al, byte ptr [edi][eax]
  877. mov byte ptr [ebx+6], al
  878. psrlq mm4, 16
  879. movd eax, mm4
  880. and eax, 03ffh
  881. mov al, byte ptr [edi][eax]
  882. mov byte ptr [ebx+7], al
  883. mov ecx, [edx+24]
  884. add ecx, output_col ; add to output_col
  885. punpckhdq mm3, mm2 ; transposed w7---0,3,6,7
  886. psrlw mm0, 5
  887. movd eax, mm0
  888. and eax, 03ffh
  889. mov al, byte ptr [edi][eax]
  890. mov byte ptr [ecx+4], al
  891. psrlq mm0, 16
  892. movd eax, mm0
  893. and eax, 03ffh
  894. mov al, byte ptr [edi][eax]
  895. mov byte ptr [ecx+5], al
  896. psrlq mm0, 16
  897. movd eax, mm0
  898. and eax, 03ffh
  899. mov al, byte ptr [edi][eax]
  900. mov byte ptr [ecx+6], al
  901. psrlq mm0, 16
  902. movd eax, mm0
  903. and eax, 03ffh
  904. mov al, byte ptr [edi][eax]
  905. mov byte ptr [ecx+7], al
  906. mov ebx, [edx+28]
  907. add ebx, output_col ; add to output_col
  908. movq mm5, mm6 ; copy w0
  909. psrlw mm3, 5
  910. movd eax, mm3
  911. and eax, 03ffh
  912. mov al, byte ptr [edi][eax]
  913. mov byte ptr [ebx+4], al
  914. psrlq mm3, 16
  915. movd eax, mm3
  916. and eax, 03ffh
  917. mov al, byte ptr [edi][eax]
  918. mov byte ptr [ebx+5], al
  919. psrlq mm3, 16
  920. movd eax, mm3
  921. and eax, 03ffh
  922. mov al, byte ptr [edi][eax]
  923. mov byte ptr [ebx+6], al
  924. psrlq mm3, 16
  925. movd eax, mm3
  926. and eax, 03ffh
  927. mov al, byte ptr [edi][eax]
  928. mov byte ptr [ebx+7], al
  929. punpcklwd mm6, mm7
  930. ; transpose the top left quadrant(4X4) of the matrix
  931. ; calculate the destination address
  932. mov edx, output_buf ; get output_buf[0]
  933. mov ebx, [edx+0]
  934. add ebx, output_col ; add to output_col
  935. movq mm4, [esi+8*6] ; get w3 of TL quadrant
  936. punpckhwd mm5, mm7 ;---5,6,7
  937. movq mm7, [esi+8*4] ; get w2 of TL quadrant
  938. movq mm2, mm6
  939. movq mm3, mm7 ; copy w2---3,4,5,6,7
  940. punpcklwd mm7, mm4 ;---2,3,4,5,6,7
  941. punpckhwd mm3, mm4 ;---2,3,4,5,6,7
  942. movq mm4, mm5 ;
  943. movq mm1, mm5
  944. punpckldq mm6, mm7 ;---1,2,3,4,5,6,7
  945. psrlw mm6, 5
  946. movd eax, mm6
  947. and eax, 03ffh
  948. mov al, byte ptr [edi][eax]
  949. mov byte ptr [ebx], al
  950. psrlq mm6, 16
  951. movd eax, mm6
  952. and eax, 03ffh
  953. mov al, byte ptr [edi][eax]
  954. mov byte ptr [ebx+1], al
  955. psrlq mm6, 16
  956. movd eax, mm6
  957. and eax, 03ffh
  958. mov al, byte ptr [edi][eax]
  959. mov byte ptr [ebx+2], al
  960. psrlq mm6, 16
  961. movd eax, mm6
  962. and eax, 03ffh
  963. mov al, byte ptr [edi][eax]
  964. mov byte ptr [ebx+3], al
  965. mov ebx, [edx+4]
  966. add ebx, output_col ; add to output_col
  967. punpckhdq mm2, mm7 ;---1,2,3,4,5,6,7
  968. psrlw mm2, 5
  969. movd eax, mm2
  970. and eax, 03ffh
  971. mov al, byte ptr [edi][eax]
  972. mov byte ptr [ebx], al
  973. psrlq mm2, 16
  974. movd eax, mm2
  975. and eax, 03ffh
  976. mov al, byte ptr [edi][eax]
  977. mov byte ptr [ebx+1], al
  978. psrlq mm2, 16
  979. movd eax, mm2
  980. and eax, 03ffh
  981. mov al, byte ptr [edi][eax]
  982. mov byte ptr [ebx+2], al
  983. psrlq mm2, 16
  984. movd eax, mm2
  985. and eax, 03ffh
  986. mov al, byte ptr [edi][eax]
  987. mov byte ptr [ebx+3], al
  988. mov ecx, [edx+8]
  989. add ecx, output_col ; add to output_col
  990. punpckldq mm5, mm3 ;---1,2,3,4,5,6,7
  991. psrlw mm5, 5
  992. movd eax, mm5
  993. and eax, 03ffh
  994. mov al, byte ptr [edi][eax]
  995. mov byte ptr [ecx], al
  996. psrlq mm5, 16
  997. movd eax, mm5
  998. and eax, 03ffh
  999. mov al, byte ptr [edi][eax]
  1000. mov byte ptr [ecx+1], al
  1001. psrlq mm5, 16
  1002. movd eax, mm5
  1003. and eax, 03ffh
  1004. mov al, byte ptr [edi][eax]
  1005. mov byte ptr [ecx+2], al
  1006. psrlq mm5, 16
  1007. movd eax, mm5
  1008. and eax, 03ffh
  1009. mov al, byte ptr [edi][eax]
  1010. mov byte ptr [ecx+3], al
  1011. mov ebx, [edx+12]
  1012. add ebx, output_col ; add to output_col
  1013. punpckhdq mm1, mm3 ;---1,2,3,4,5,6,7
  1014. psrlw mm1, 5
  1015. movd eax, mm1
  1016. and eax, 03ffh
  1017. mov al, byte ptr [edi][eax]
  1018. mov byte ptr [ebx], al
  1019. psrlq mm1, 16
  1020. movd eax, mm1
  1021. and eax, 03ffh
  1022. mov al, byte ptr [edi][eax]
  1023. mov byte ptr [ebx+1], al
  1024. psrlq mm1, 16
  1025. movd eax, mm1
  1026. and eax, 03ffh
  1027. mov al, byte ptr [edi][eax]
  1028. mov byte ptr [ebx+2], al
  1029. psrlq mm1, 16
  1030. movd eax, mm1
  1031. and eax, 03ffh
  1032. mov al, byte ptr [edi][eax]
  1033. mov byte ptr [ebx+3], al
  1034. ; transpose the top right quadrant(4X4) of the matrix
  1035. ; calculate the destination address for **bottom left quadrant
  1036. mov edx, output_buf ; get output_buf[4]
  1037. mov ebx, [edx+16]
  1038. add ebx, output_col ; add to output_col
  1039. movq mm0, [esi+8*1] ;---0
  1040. movq mm1, [esi+8*3] ;---0,1,2
  1041. movq mm2, mm0
  1042. movq mm3, [esi+8*5]
  1043. punpcklwd mm0, mm1 ;---0,1,2,3
  1044. punpckhwd mm2, mm1
  1045. movq mm4, mm3
  1046. movq mm1, [esi+8*7] ;---0,1,2,3
  1047. movq mm5, mm2
  1048. punpcklwd mm3, mm1 ;---0,1,2,3,4
  1049. punpckhwd mm4, mm1 ;---0,1,2,3,4
  1050. movq mm1, mm0
  1051. punpckldq mm0, mm3 ;---0,1,2,3,4,5
  1052. punpckhdq mm1, mm3 ;---0,1,2,3,4,5
  1053. movq mm3, [esi+8*8]
  1054. psrlw mm0, 5
  1055. movd eax, mm0
  1056. and eax, 03ffh
  1057. mov al, byte ptr [edi][eax]
  1058. mov byte ptr [ebx], al
  1059. psrlq mm0, 16
  1060. movd eax, mm0
  1061. and eax, 03ffh
  1062. mov al, byte ptr [edi][eax]
  1063. mov byte ptr [ebx+1], al
  1064. psrlq mm0, 16
  1065. movd eax, mm0
  1066. and eax, 03ffh
  1067. mov al, byte ptr [edi][eax]
  1068. mov byte ptr [ebx+2], al
  1069. psrlq mm0, 16
  1070. movd eax, mm0
  1071. and eax, 03ffh
  1072. mov al, byte ptr [edi][eax]
  1073. mov byte ptr [ebx+3], al
  1074. mov ebx, [edx+20]
  1075. add ebx, output_col ; add to output_col
  1076. punpckldq mm2, mm4 ;---1,2,3,4,5
  1077. punpckhdq mm5, mm4 ;---1,2,3,4,5
  1078. movq mm4, [esi+8*10]
  1079. ; transpose the bottom left quadrant(4X4) of the matrix
  1080. ; Also store w1,w2,w3 of top right quadrant into
  1081. ; w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
  1082. ; of BL is already done.
  1083. psrlw mm1, 5
  1084. movd eax, mm1
  1085. and eax, 03ffh
  1086. mov al, byte ptr [edi][eax]
  1087. mov byte ptr [ebx], al
  1088. psrlq mm1, 16
  1089. movd eax, mm1
  1090. and eax, 03ffh
  1091. mov al, byte ptr [edi][eax]
  1092. mov byte ptr [ebx+1], al
  1093. psrlq mm1, 16
  1094. movd eax, mm1
  1095. and eax, 03ffh
  1096. mov al, byte ptr [edi][eax]
  1097. mov byte ptr [ebx+2], al
  1098. psrlq mm1, 16
  1099. movd eax, mm1
  1100. and eax, 03ffh
  1101. mov al, byte ptr [edi][eax]
  1102. mov byte ptr [ebx+3], al
  1103. mov ecx, [edx+24]
  1104. add ecx, output_col ; add to output_col
  1105. movq mm0, [esi+8*12]
  1106. movq mm1, mm3 ;---1,2,3,4,5
  1107. punpcklwd mm3, mm4 ;---0,1,2,3,4,5
  1108. punpckhwd mm1, mm4 ;---0,1,2,3,4,5
  1109. movq mm4, [esi+8*14]
  1110. psrlw mm2, 5
  1111. movd eax, mm2
  1112. and eax, 03ffh
  1113. mov al, byte ptr [edi][eax]
  1114. mov byte ptr [ecx], al
  1115. psrlq mm2, 16
  1116. movd eax, mm2
  1117. and eax, 03ffh
  1118. mov al, byte ptr [edi][eax]
  1119. mov byte ptr [ecx+1], al
  1120. psrlq mm2, 16
  1121. movd eax, mm2
  1122. and eax, 03ffh
  1123. mov al, byte ptr [edi][eax]
  1124. mov byte ptr [ecx+2], al
  1125. psrlq mm2, 16
  1126. movd eax, mm2
  1127. and eax, 03ffh
  1128. mov al, byte ptr [edi][eax]
  1129. mov byte ptr [ecx+3], al
  1130. mov ebx, [edx+28]
  1131. add ebx, output_col ; add to output_col
  1132. movq mm2, mm0
  1133. psrlw mm5, 5
  1134. movd eax, mm5
  1135. and eax, 03ffh
  1136. mov al, byte ptr [edi][eax]
  1137. mov byte ptr [ebx], al
  1138. psrlq mm5, 16
  1139. movd eax, mm5
  1140. and eax, 03ffh
  1141. mov al, byte ptr [edi][eax]
  1142. mov byte ptr [ebx+1], al
  1143. psrlq mm5, 16
  1144. movd eax, mm5
  1145. and eax, 03ffh
  1146. mov al, byte ptr [edi][eax]
  1147. mov byte ptr [ebx+2], al
  1148. psrlq mm5, 16
  1149. movd eax, mm5
  1150. and eax, 03ffh
  1151. mov al, byte ptr [edi][eax]
  1152. mov byte ptr [ebx+3], al
  1153. punpcklwd mm0, mm4 ;---0,1,2,3,4
  1154. punpckhwd mm2, mm4 ;---0,1,2,3,4
  1155. movq mm4, mm3
  1156. movq mm5, mm1
  1157. punpckldq mm3, mm0 ;---0,1,2,3,4,5
  1158. ; calculate the destination address for **top right quadrant
  1159. mov edx, output_buf ; get output_buf[0]
  1160. mov ebx, [edx+0]
  1161. add ebx, output_col ; add to output_col
  1162. psrlw mm3, 5
  1163. movd eax, mm3
  1164. and eax, 03ffh
  1165. mov al, byte ptr [edi][eax]
  1166. mov byte ptr [ebx+4], al
  1167. psrlq mm3, 16
  1168. movd eax, mm3
  1169. and eax, 03ffh
  1170. mov al, byte ptr [edi][eax]
  1171. mov byte ptr [ebx+5], al
  1172. psrlq mm3, 16
  1173. movd eax, mm3
  1174. and eax, 03ffh
  1175. mov al, byte ptr [edi][eax]
  1176. mov byte ptr [ebx+6], al
  1177. psrlq mm3, 16
  1178. movd eax, mm3
  1179. and eax, 03ffh
  1180. mov al, byte ptr [edi][eax]
  1181. mov byte ptr [ebx+7], al
  1182. mov ebx, [edx+4]
  1183. add ebx, output_col ; add to output_col
  1184. punpckhdq mm4, mm0 ;---1,2,4,5
  1185. psrlw mm4, 5
  1186. movd eax, mm4
  1187. and eax, 03ffh
  1188. mov al, byte ptr [edi][eax]
  1189. mov byte ptr [ebx+4], al
  1190. psrlq mm4, 16
  1191. movd eax, mm4
  1192. and eax, 03ffh
  1193. mov al, byte ptr [edi][eax]
  1194. mov byte ptr [ebx+5], al
  1195. psrlq mm4, 16
  1196. movd eax, mm4
  1197. and eax, 03ffh
  1198. mov al, byte ptr [edi][eax]
  1199. mov byte ptr [ebx+6], al
  1200. psrlq mm4, 16
  1201. movd eax, mm4
  1202. and eax, 03ffh
  1203. mov al, byte ptr [edi][eax]
  1204. mov byte ptr [ebx+7], al
  1205. mov ecx, [edx+8]
  1206. add ecx, output_col ; add to output_col
  1207. punpckldq mm1, mm2 ;---1,2,5
  1208. psrlw mm1, 5
  1209. movd eax, mm1
  1210. and eax, 03ffh
  1211. mov al, byte ptr [edi][eax]
  1212. mov byte ptr [ecx+4], al
  1213. psrlq mm1, 16
  1214. movd eax, mm1
  1215. and eax, 03ffh
  1216. mov al, byte ptr [edi][eax]
  1217. mov byte ptr [ecx+5], al
  1218. psrlq mm1, 16
  1219. movd eax, mm1
  1220. and eax, 03ffh
  1221. mov al, byte ptr [edi][eax]
  1222. mov byte ptr [ecx+6], al
  1223. psrlq mm1, 16
  1224. movd eax, mm1
  1225. and eax, 03ffh
  1226. mov al, byte ptr [edi][eax]
  1227. mov byte ptr [ecx+7], al
  1228. mov ebx, [edx+12]
  1229. add ebx, output_col ; add to output_col
  1230. punpckhdq mm5, mm2 ;---5
  1231. psrlw mm5, 5
  1232. movd eax, mm5
  1233. and eax, 03ffh
  1234. mov al, byte ptr [edi][eax]
  1235. mov byte ptr [ebx+4], al
  1236. psrlq mm5, 16
  1237. movd eax, mm5
  1238. and eax, 03ffh
  1239. mov al, byte ptr [edi][eax]
  1240. mov byte ptr [ebx+5], al
  1241. psrlq mm5, 16
  1242. movd eax, mm5
  1243. and eax, 03ffh
  1244. mov al, byte ptr [edi][eax]
  1245. mov byte ptr [ebx+6], al
  1246. psrlq mm5, 16
  1247. movd eax, mm5
  1248. and eax, 03ffh
  1249. mov al, byte ptr [edi][eax]
  1250. mov byte ptr [ebx+7], al
  1251. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1252. emms
  1253. } /* end of __asm */
  1254. }
  1255. #endif /* DCT_IFAST_SUPPORTED */