Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2137 lines
53 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /***************************************************************************
  4. *
  5. * INTEL Corporation Proprietary Information
  6. *
  7. *
  8. * Copyright (c) 1996 Intel Corporation.
  9. * All rights reserved.
  10. *
  11. ***************************************************************************
  12. */
  13. /*
  14. * jfdctint.c
  15. *
  16. * Copyright (C) 1991-1996, Thomas G. Lane.
  17. * This file is part of the Independent JPEG Group's software.
  18. * For conditions of distribution and use, see the accompanying README file.
  19. *
  20. * This file contains a slow-but-accurate integer implementation of the
  21. * forward DCT (Discrete Cosine Transform).
  22. *
  23. * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  24. * on each column. Direct algorithms are also available, but they are
  25. * much more complex and seem not to be any faster when reduced to code.
  26. *
  27. * This implementation is based on an algorithm described in
  28. * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  29. * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  30. * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  31. * The primary algorithm described there uses 11 multiplies and 29 adds.
  32. * We use their alternate method with 12 multiplies and 32 adds.
  33. * The advantage of this method is that no data path contains more than one
  34. * multiplication; this allows a very simple and accurate implementation in
  35. * scaled fixed-point arithmetic, with a minimal number of shifts.
  36. */
  37. #define JPEG_INTERNALS
  38. #include "jinclude.h"
  39. #include "jpeglib.h"
  40. #include "jdct.h" /* Private declarations for DCT subsystem */
  41. #ifdef DCT_ISLOW_SUPPORTED
  42. /*
  43. * This module is specialized to the case DCTSIZE = 8.
  44. */
  45. #if DCTSIZE != 8
  46. Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  47. #endif
  48. /*
  49. * The poop on this scaling stuff is as follows:
  50. *
  51. * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
  52. * larger than the true DCT outputs. The final outputs are therefore
  53. * a factor of N larger than desired; since N=8 this can be cured by
  54. * a simple right shift at the end of the algorithm. The advantage of
  55. * this arrangement is that we save two multiplications per 1-D DCT,
  56. * because the y0 and y4 outputs need not be divided by sqrt(N).
  57. * In the IJG code, this factor of 8 is removed by the quantization step
  58. * (in jcdctmgr.c), NOT in this module.
  59. *
  60. * We have to do addition and subtraction of the integer inputs, which
  61. * is no problem, and multiplication by fractional constants, which is
  62. * a problem to do in integer arithmetic. We multiply all the constants
  63. * by CONST_SCALE and convert them to integer constants (thus retaining
  64. * CONST_BITS bits of precision in the constants). After doing a
  65. * multiplication we have to divide the product by CONST_SCALE, with proper
  66. * rounding, to produce the correct output. This division can be done
  67. * cheaply as a right shift of CONST_BITS bits. We postpone shifting
  68. * as long as possible so that partial sums can be added together with
  69. * full fractional precision.
  70. *
  71. * The outputs of the first pass are scaled up by PASS1_BITS bits so that
  72. * they are represented to better-than-integral precision. These outputs
  73. * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  74. * with the recommended scaling. (For 12-bit sample data, the intermediate
  75. * array is INT32 anyway.)
  76. *
  77. * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  78. * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
  79. * shows that the values given below are the most effective.
  80. */
  81. #if BITS_IN_JSAMPLE == 8
  82. #define CONST_BITS 13 /* fractional bits kept in the FIX_* constants */
  83. #define PASS1_BITS 2 /* NOTE: the MMX row pass below hard-codes this as "psllw ..., 2" and CONST_BITS-PASS1_BITS as "psrad ..., 11" */
  84. #else
  85. #define CONST_BITS 13
  86. #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
  87. #endif
  88. /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
  89. * causing a lot of useless floating-point operations at run time.
  90. * To get around this we use the following pre-calculated constants.
  91. * If you change CONST_BITS you may want to add appropriate values.
  92. * (With a reasonable C compiler, you can just rely on the FIX() macro...)
  93. */
  94. #if CONST_BITS == 13
  95. #define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
  96. #define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
  97. #define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
  98. #define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
  99. #define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
  100. #define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
  101. #define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
  102. #define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
  103. #define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
  104. #define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
  105. #define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
  106. #define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
  107. #else
  108. #define FIX_0_298631336 FIX(0.298631336)
  109. #define FIX_0_390180644 FIX(0.390180644)
  110. #define FIX_0_541196100 FIX(0.541196100)
  111. #define FIX_0_765366865 FIX(0.765366865)
  112. #define FIX_0_899976223 FIX(0.899976223)
  113. #define FIX_1_175875602 FIX(1.175875602)
  114. #define FIX_1_501321110 FIX(1.501321110)
  115. #define FIX_1_847759065 FIX(1.847759065)
  116. #define FIX_1_961570560 FIX(1.961570560)
  117. #define FIX_2_053119869 FIX(2.053119869)
  118. #define FIX_2_562915447 FIX(2.562915447)
  119. #define FIX_3_072711026 FIX(3.072711026)
  120. #endif
  121. const __int64 Const_1 = 0x0000000100000001; /* 1 in each 32-bit lane; added after a pxor with Const_FFFF to complete a two's-complement negate */
  122. const __int64 Const_2 = 0x0002000200020002; /* 2 in each 16-bit lane */
  123. const __int64 Const_1024 = 0x0000040000000400; /* 1 << 10 in each 32-bit lane; rounding term added before the "psrad ..., 11" descale */
  124. const __int64 Const_16384 = 0x0000400000004000; /* 1 << 14 in each 32-bit lane; presumably the rounding term for a 15-bit descale - TODO confirm against the column pass */
  125. const __int64 Const_FFFF = 0xFFFFFFFFFFFFFFFF; /* all ones; pxor with this inverts every bit (first half of a negate) */
  126. const __int64 Const_0xFIX_0_298631336 = 0x0000098e0000098e; /* "0xFIX" layout: constant in the low word of each dword, so pmaddwd yields products of input words 0 and 2; 0x098e = 2446 */
  127. const __int64 Const_FIX_0_298631336x0 = 0x098e0000098e0000; /* "FIXx0" layout: constant in the high word of each dword, so pmaddwd yields products of input words 1 and 3 */
  128. const __int64 Const_0xFIX_0_390180644 = 0x00000c7c00000c7c; /* 0x0c7c = 3196, even words */
  129. const __int64 Const_FIX_0_390180644x0 = 0x0c7c00000c7c0000; /* same constant, odd words */
  130. const __int64 Const_0xFIX_0_541196100 = 0x0000115100001151; /* 0x1151 = 4433, even words */
  131. const __int64 Const_FIX_0_541196100x0 = 0x1151000011510000; /* same constant, odd words */
  132. const __int64 Const_0xFIX_0_765366865 = 0x0000187e0000187e; /* 0x187e = 6270, even words */
  133. const __int64 Const_FIX_0_765366865x0 = 0x187e0000187e0000; /* same constant, odd words */
  134. const __int64 Const_0xFIX_0_899976223 = 0x00001ccd00001ccd; /* 0x1ccd = 7373, even words */
  135. const __int64 Const_FIX_0_899976223x0 = 0x1ccd00001ccd0000; /* same constant, odd words */
  136. const __int64 Const_0xFIX_1_175875602 = 0x000025a1000025a1; /* 0x25a1 = 9633, even words */
  137. const __int64 Const_FIX_1_175875602x0 = 0x25a1000025a10000; /* same constant, odd words */
  138. const __int64 Const_0xFIX_1_501321110 = 0x0000300b0000300b; /* 0x300b = 12299, even words */
  139. const __int64 Const_FIX_1_501321110x0 = 0x300b0000300b0000; /* same constant, odd words */
  140. const __int64 Const_0xFIX_1_847759065 = 0x00003b2100003b21; /* 0x3b21 = 15137, even words */
  141. const __int64 Const_FIX_1_847759065x0 = 0x3b2100003b210000; /* same constant, odd words */
  142. const __int64 Const_0xFIX_1_961570560 = 0x00003ec500003ec5; /* 0x3ec5 = 16069, even words */
  143. const __int64 Const_FIX_1_961570560x0 = 0x3ec500003ec50000; /* same constant, odd words */
  144. const __int64 Const_0xFIX_2_053119869 = 0x000041b3000041b3; /* 0x41b3 = 16819, even words */
  145. const __int64 Const_FIX_2_053119869x0 = 0x41b3000041b30000; /* same constant, odd words */
  146. const __int64 Const_0xFIX_2_562915447 = 0x0000520300005203; /* 0x5203 = 20995, even words */
  147. const __int64 Const_FIX_2_562915447x0 = 0x5203000052030000; /* same constant, odd words */
  148. const __int64 Const_0xFIX_3_072711026 = 0x0000625400006254; /* 0x6254 = 25172, even words */
  149. const __int64 Const_FIX_3_072711026x0 = 0x6254000062540000; /* same constant, odd words; all packed values match the CONST_BITS == 13 table above */
  150. /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
  151. * For 8-bit samples with the recommended scaling, all the variable
  152. * and constant values involved are no more than 16 bits wide, so a
  153. * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
  154. * For 12-bit samples, a full 32-bit multiplication will be needed.
  155. */
  156. #if BITS_IN_JSAMPLE == 8
  157. #define MULTIPLY(var,const) MULTIPLY16C16(var,const)
  158. #else
  159. #define MULTIPLY(var,const) ((var) * (const))
  160. #endif
  161. #define DATASIZE 32 /* byte offset between consecutive rows of data[], used as [edi][DATASIZE*row + ofs]; presumably 8 elements * 4 bytes since the asm loads 32-bit quantities - TODO confirm DCTELEM width. Not the same as DCTSIZE. */
  162. /*
  163. * Perform the forward DCT on one block of samples.
  164. */
  165. GLOBAL(void)
  166. mfdct8x8llm (DCTELEM * data)
  167. {
  168. __int64 qwTemp0, qwTemp2, qwTemp4, qwTemp6;
  169. __int64 qwZ1, qwZ2, qwZ4_even, qwZ4_odd;
  170. __int64 qwTmp4_Z3_Even, qwTmp4_Z3_Odd;
  171. __int64 qwTmp6_Z3_Even, qwTmp6_Z3_Odd;
  172. __int64 qwTmp5_Z4_Even, qwTmp5_Z4_Odd;
  173. __int64 qwScratch7, qwScratch6, qwScratch5;
  174. __asm{
  175. mov edi, [data]
  176. // transpose the bottom right quadrant(4X4) of the matrix
  177. // --------- ---------
  178. // | M1 | M2 | | M1'| M3'|
  179. // --------- --> ---------
  180. // | M3 | M4 | | M2'| M4'|
  181. // --------- ---------
  182. // Get the 32-bit quantities and pack into 16 bits
  183. movq mm5, [edi][DATASIZE*4+16] //| w41 | w40 |
  184. movq mm3, [edi][DATASIZE*4+24] //| w43 | w42 |
  185. movq mm6, [edi][DATASIZE*5+16]
  186. packssdw mm5, mm3 //|w43|w42|w41|w40|
  187. movq mm7, [edi][DATASIZE*5+24]
  188. movq mm4, mm5 // copy w4---0,1,3,5,6
  189. movq mm3, [edi][DATASIZE*6+16]
  190. packssdw mm6, mm7
  191. movq mm2, [edi][DATASIZE*6+24]
  192. punpcklwd mm5, mm6 //mm6 = w5
  193. movq mm1, [edi][DATASIZE*7+16]
  194. packssdw mm3, mm2
  195. movq mm0, [edi][DATASIZE*7+24]
  196. punpckhwd mm4, mm6 //---0,1,3,5,6
  197. packssdw mm1, mm0
  198. movq mm7, mm3 //---0,1,2,3,5,6 w6
  199. punpcklwd mm3, mm1 //mm1 = w7
  200. movq mm0, mm5 //---0,2,3,4,5,6,7
  201. movq mm2, [edi][DATASIZE*4] //| w01 | w00 |
  202. punpckhdq mm0, mm3 // transposed w5---0,2,4,6,7
  203. punpckhwd mm7, mm1 //---0,2,3,5,6,7
  204. movq mm1, [edi][DATASIZE*5+8]
  205. movq mm6, mm4 //---0,2,3,4,6,7
  206. movq [edi][DATASIZE*5+16], mm0 // store w5
  207. punpckldq mm5, mm3 // transposed w4
  208. movq mm3, [edi][DATASIZE*5]
  209. punpckldq mm4, mm7 // transposed w6
  210. movq mm0, [edi][DATASIZE*4+8] //| w03 | w02 |
  211. punpckhdq mm6, mm7 // transposed w7---0,3,6,7
  212. // transpose the bottom left quadrant(4X4) of the matrix and place
  213. // in the top right quadrant while doing the same for the top
  214. // right quadrant
  215. // --------- ---------
  216. // | M1 | M2 | | M1'| M3'|
  217. // --------- --> ---------
  218. // | M3 | M4 | | M2'| M4'|
  219. // --------- ---------
  220. movq [edi][DATASIZE*4+16], mm5 // store w4
  221. packssdw mm2, mm0 //|w03|w02|w01|w00|
  222. movq mm5, [edi][DATASIZE*7]
  223. packssdw mm3, mm1
  224. movq mm0, [edi][DATASIZE*7+8]
  225. movq [edi][DATASIZE*7+16], mm6 // store w7---5,6,7
  226. packssdw mm5, mm0
  227. movq mm6, [edi][DATASIZE*6]
  228. movq mm0, mm2 // copy w0---0,1,3,5,6
  229. movq mm7, [edi][DATASIZE*6+8]
  230. punpcklwd mm2, mm3 //mm6 = w1
  231. movq [edi][DATASIZE*6+16], mm4 // store w6---3,5,6,7
  232. packssdw mm6, mm7
  233. movq mm1, [edi][DATASIZE*0+24]
  234. punpckhwd mm0, mm3 //---0,1,3,5,6
  235. movq mm7, mm6 //---0,1,2,3,5,6 w2
  236. punpcklwd mm6, mm5 //mm1 = w3
  237. movq mm3, [edi][DATASIZE*0+16]
  238. punpckhwd mm7, mm5 //---0,2,3,5,6,7
  239. movq mm4, [edi][DATASIZE*2+24]
  240. packssdw mm3, mm1
  241. movq mm1, mm2 //---0,2,3,4,5,6,7
  242. punpckldq mm2, mm6 // transposed w4
  243. movq mm5, [edi][DATASIZE*2+16]
  244. punpckhdq mm1, mm6 // transposed w5---0,2,4,6,7
  245. movq [edi][DATASIZE*0+16], mm2 // store w4
  246. packssdw mm5, mm4
  247. movq mm4, [edi][DATASIZE*1+16]
  248. movq mm6, mm0 //---0,2,3,4,6,7
  249. movq mm2, [edi][DATASIZE*1+24]
  250. punpckldq mm0, mm7 // transposed w6
  251. movq [edi][DATASIZE*1+16], mm1 // store w5
  252. punpckhdq mm6, mm7 // transposed w7---0,3,6,7
  253. movq mm7, [edi][DATASIZE*3+24]
  254. packssdw mm4, mm2
  255. movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
  256. movq mm1, mm3 // copy w4---0,1,3,5,6
  257. movq mm2, [edi][DATASIZE*3+16]
  258. punpcklwd mm3, mm4 //mm6 = w5
  259. movq [edi][DATASIZE*3+16], mm6 // store w7---5,6,7
  260. packssdw mm2, mm7
  261. // transpose the bottom left quadrant(4X4) of the matrix
  262. // --------- ---------
  263. // | M1 | M2 | | M1'| M3'|
  264. // --------- --> ---------
  265. // | M3 | M4 | | M2'| M4'|
  266. // --------- ---------
  267. movq mm6, [edi][DATASIZE*0] //| w01 | w00 |
  268. punpckhwd mm1, mm4 //---0,1,3,5,6
  269. movq mm7, mm5 //---0,1,2,3,5,6 w6
  270. punpcklwd mm5, mm2 //mm1 = w7
  271. movq mm4, [edi][DATASIZE*0+8] //| w03 | w02 |
  272. punpckhwd mm7, mm2 //---0,2,3,5,6,7
  273. movq mm0, mm3 //---0,2,3,4,5,6,7
  274. packssdw mm6, mm4 //|w03|w02|w01|w00|
  275. movq mm2, [edi][DATASIZE*2+8]
  276. punpckldq mm3, mm5 // transposed w4
  277. movq mm4, [edi][DATASIZE*1]
  278. punpckhdq mm0, mm5 // transposed w5---0,2,4,6,7
  279. movq [edi][DATASIZE*4], mm3 // store w4
  280. movq mm5, mm1 //---0,2,3,4,6,7
  281. movq mm3, [edi][DATASIZE*2]
  282. punpckldq mm1, mm7 // transposed w6
  283. movq [edi][DATASIZE*5], mm0 // store w5
  284. punpckhdq mm5, mm7 // transposed w7---0,3,6,7
  285. movq mm7, [edi][DATASIZE*1+8]
  286. packssdw mm3, mm2
  287. movq [edi][DATASIZE*7], mm5 // store w7---5,6,7
  288. movq mm2, mm6 // copy w0---0,1,3,5,6
  289. movq [edi][DATASIZE*6], mm1 // store w6---3,5,6,7
  290. packssdw mm4, mm7
  291. // transpose the top left quadrant(4X4) of the matrix
  292. // --------- ---------
  293. // | M1 | M2 | | M1'| M3'|
  294. // --------- --> ---------
  295. // | M3 | M4 | | M2'| M4'|
  296. // --------- ---------
  297. // Get the 32-bit quantities and pack into 16 bits
  298. movq mm1, [edi][DATASIZE*3]
  299. punpcklwd mm6, mm4 //mm6 = w1
  300. movq mm0, [edi][DATASIZE*3+8]
  301. punpckhwd mm2, mm4 //---0,1,3,5,6
  302. packssdw mm1, mm0
  303. movq mm5, mm3 //---0,1,2,3,5,6 w2
  304. punpcklwd mm3, mm1 //mm1 = w3
  305. movq mm0, mm6 //---0,2,3,4,5,6,7
  306. movq mm4, [edi][DATASIZE*7]
  307. punpckhwd mm5, mm1 //---0,2,3,5,6,7
  308. movq mm1, [edi][DATASIZE*4]
  309. punpckhdq mm6, mm3 // transposed w4
  310. punpckldq mm0, mm3 // transposed w5---0,2,4,6,7
  311. movq mm3, mm2 //---0,2,3,4,6,7
  312. movq [edi][DATASIZE*0], mm0 // store w4
  313. punpckldq mm2, mm5 // transposed w6
  314. movq [edi][DATASIZE*1], mm6 // store w5
  315. punpckhdq mm3, mm5 // transposed w7---0,3,6,7
  316. movq [edi][DATASIZE*2], mm2 // store w6---3,5,6,7
  317. paddw mm0, mm4
  318. movq [edi][DATASIZE*3], mm3 // store w7---5,6,7
  319. paddw mm3, mm1
  320. //******************************************************************************
  321. // End of transpose. Begin row dct.
  322. //******************************************************************************
  323. // tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
  324. movq mm7, mm0
  325. paddw mm0, mm3 //tmp10
  326. paddw mm6, [edi][DATASIZE*6]
  327. psubw mm7, mm3 //tmp13
  328. paddw mm2, [edi][DATASIZE*5]
  329. movq mm1, mm6
  330. // tmp10 = tmp0 + tmp3;
  331. paddw mm1, mm2 //tmp11
  332. psubw mm6, mm2 //tmp12
  333. // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
  334. // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
  335. movq mm3, mm0
  336. paddw mm0, mm1 //tmp10 + tmp11
  337. psubw mm3, mm1 //tmp10 - tmp11
  338. psllw mm0, 2 // descale it
  339. movq mm1, mm6 //copy tmp12
  340. psllw mm3, 2 // descale it
  341. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  342. movq qwTemp0, mm0 //store
  343. paddw mm1, mm7 //tmp12 + tmp13
  344. movq mm2, mm1 //copy
  345. // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
  346. // CONST_BITS-PASS1_BITS);
  347. // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
  348. // CONST_BITS-PASS1_BITS);
  349. pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
  350. movq mm4, mm7
  351. pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
  352. movq mm0, mm6
  353. pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
  354. pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
  355. pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
  356. paddd mm7, mm1 // add z1
  357. pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
  358. paddd mm7, Const_1024
  359. paddd mm4, mm2
  360. paddd mm4, Const_1024
  361. psrad mm7, 11 // descale it | |R2| |R0|
  362. //!!!!!! Negate the results in mm6 and mm0
  363. pxor mm6, Const_FFFF //invert result
  364. psrad mm4, 11 // descale it | |R3| |R1|
  365. paddd mm6, Const_1 // 2's complement
  366. movq mm5, mm7
  367. pxor mm0, Const_FFFF //invert result
  368. punpckldq mm7, mm4 //| |R1| |R0|
  369. paddd mm0, Const_1 // 2's complement
  370. punpckhdq mm5, mm4 //| |R3| |R2|
  371. movq qwTemp4, mm3 //store
  372. packssdw mm7, mm5
  373. movq mm5, Const_1024
  374. paddd mm6, mm1 // add z1
  375. movq qwTemp2, mm7 //store
  376. paddd mm6, mm5
  377. paddd mm0, mm2
  378. psrad mm6, 11 // descale it | |R2| |R0|
  379. paddd mm0, mm5
  380. movq mm5, mm6
  381. movq mm4, [edi][DATASIZE*3]
  382. psrad mm0, 11 // descale it | |R3| |R1|
  383. psubw mm4, [edi][DATASIZE*4]
  384. punpckldq mm6, mm0 //| |R1| |R0|
  385. movq mm7, [edi][DATASIZE*0]
  386. punpckhdq mm5, mm0 //| |R3| |R2|
  387. psubw mm7, [edi][DATASIZE*7]
  388. packssdw mm6, mm5
  389. // tmp4 = dataptr[3] - dataptr[4];
  390. movq mm5, [edi][DATASIZE*2]
  391. movq mm0, mm4
  392. psubw mm5, [edi][DATASIZE*5]
  393. movq mm2, mm4
  394. movq qwTemp6, mm6 //store
  395. paddw mm0, mm7 //z1
  396. movq mm6, [edi][DATASIZE*1]
  397. movq mm1, mm5
  398. psubw mm6, [edi][DATASIZE*6]
  399. movq mm3, mm5
  400. // z1 = tmp4 + tmp7;
  401. movq qwScratch5, mm5
  402. paddw mm3, mm7 //z4
  403. movq qwScratch7, mm7
  404. paddw mm2, mm6 //z3
  405. movq qwZ1, mm0 //store
  406. paddw mm1, mm6 //z2
  407. // z3 = MULTIPLY(z3, - FIX_1_961570560);
  408. // z4 = MULTIPLY(z4, - FIX_0_390180644);
  409. // z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
  410. movq mm0, Const_FFFF
  411. movq mm5, mm2
  412. movq qwZ2, mm1
  413. movq mm7, mm2
  414. pmaddwd mm5, Const_0xFIX_1_961570560 //z32, z30
  415. paddw mm2, mm3 //z3 + z4
  416. pmaddwd mm7, Const_FIX_1_961570560x0 //z33, z31
  417. movq mm1, mm3
  418. movq qwScratch6, mm6
  419. movq mm6, mm2
  420. // z3 += z5;
  421. //!!!!!! Negate the results
  422. pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
  423. pxor mm5, mm0 //invert result
  424. paddd mm5, Const_1 // 2's complement
  425. pxor mm7, mm0 //invert result
  426. pmaddwd mm3, Const_0xFIX_0_390180644 //z42, z40
  427. pmaddwd mm1, Const_FIX_0_390180644x0 //z43, z41
  428. paddd mm5, mm2 //z3_even
  429. paddd mm7, Const_1 // 2's complement
  430. pmaddwd mm6, Const_FIX_1_175875602x0 //z53, z51
  431. pxor mm3, mm0 //invert result
  432. // z4 += z5;
  433. //!!!!!! Negate the results
  434. paddd mm3, Const_1 // 2's complement
  435. pxor mm1, mm0 //invert result
  436. paddd mm1, Const_1 // 2's complement
  437. paddd mm3, mm2
  438. movq mm0, qwScratch6
  439. movq mm2, mm4
  440. // tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
  441. pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
  442. paddd mm7, mm6 //z3_odd
  443. pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
  444. paddd mm1, mm6
  445. movq mm6, mm0
  446. paddd mm4, mm5
  447. // tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
  448. pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
  449. paddd mm2, mm7
  450. pmaddwd mm0, Const_FIX_3_072711026x0 //T63, T61
  451. movq qwTmp4_Z3_Odd, mm2
  452. movq qwTmp4_Z3_Even, mm4
  453. paddd mm6, mm5
  454. movq mm5, qwScratch5
  455. paddd mm0, mm7
  456. movq mm7, qwScratch7
  457. movq mm2, mm5
  458. movq qwTmp6_Z3_Even, mm6
  459. movq mm6, mm7
  460. // tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
  461. // tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
  462. pmaddwd mm5, Const_0xFIX_2_053119869 //T52, T50
  463. pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
  464. pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
  465. pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
  466. paddd mm5, mm3
  467. movq qwTmp6_Z3_Odd, mm0
  468. paddd mm2, mm1
  469. movq qwTmp5_Z4_Even, mm5
  470. paddd mm7, mm3
  471. movq mm0, qwZ1
  472. paddd mm6, mm1
  473. // z1 = MULTIPLY(z1, - FIX_0_899976223);
  474. movq mm1, Const_FFFF
  475. movq mm4, mm0
  476. //!!!!!! Negate the results
  477. pmaddwd mm0, Const_0xFIX_0_899976223 //z12, z10
  478. pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
  479. movq mm3, qwTmp4_Z3_Even
  480. movq qwTmp5_Z4_Odd, mm2
  481. pxor mm0, mm1 //invert result
  482. movq mm2, qwTmp4_Z3_Odd
  483. pxor mm4, mm1 //invert result
  484. paddd mm4, Const_1 // 2's complement
  485. paddd mm7, mm0 //tmp7 + z1 + z4 EVEN
  486. paddd mm0, Const_1 // 2's complement
  487. paddd mm6, mm4 //tmp7 + z1 + z4 ODD
  488. // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
  489. paddd mm7, Const_1024 //rounding adj
  490. paddd mm3, mm0 //tmp4 + z1 + z3 EVEN
  491. paddd mm6, Const_1024 //rounding adj
  492. psrad mm7, 11 // descale it | |R2| |R0|
  493. psrad mm6, 11 // descale it | |R3| |R1|
  494. movq mm5, mm7
  495. punpckldq mm7, mm6 //| |R1| |R0|
  496. // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
  497. punpckhdq mm5, mm6 //| |R3| |R2|
  498. paddd mm2, mm4 //tmp4 + z1 + z3 ODD
  499. paddd mm3, Const_1024 //rounding adj
  500. packssdw mm7, mm5
  501. paddd mm2, Const_1024 //rounding adj
  502. psrad mm3, 11 // descale it | |R2| |R0|
  503. movq mm0, qwZ2
  504. psrad mm2, 11 // descale it | |R3| |R1|
  505. movq mm5, mm3
  506. movq mm4, mm0
  507. // z2 = MULTIPLY(z2, - FIX_2_562915447);
  508. pmaddwd mm0, Const_0xFIX_2_562915447 //z22, z20
  509. punpckldq mm3, mm2 //| |R1| |R0|
  510. pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
  511. punpckhdq mm5, mm2 //| |R3| |R2|
  512. movq mm2, Const_FFFF
  513. packssdw mm3, mm5
  514. movq [edi][DATASIZE*1], mm7 //store
  515. //!!!!!! Negate the results
  516. pxor mm0, mm2 //invert result
  517. movq mm5, Const_1
  518. pxor mm4, mm2 //invert result
  519. movq [edi][DATASIZE*7], mm3 //store
  520. paddd mm0, mm5 // 2's complement
  521. movq mm7, qwTmp6_Z3_Even
  522. paddd mm4, mm5 // 2's complement
  523. // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
  524. movq mm2, qwTmp6_Z3_Odd
  525. paddd mm7, mm0 //tmp6 + z2 + z3 EVEN
  526. paddd mm7, Const_1024 //rounding adj
  527. paddd mm2, mm4 //tmp6 + z2 + z3 ODD
  528. paddd mm2, Const_1024 //rounding adj
  529. psrad mm7, 11 // descale it | |R2| |R0|
  530. movq mm6, qwTemp0 //restore
  531. psrad mm2, 11 // descale it | |R3| |R1|
  532. movq mm3, qwTmp5_Z4_Even
  533. movq mm5, mm7
  534. movq [edi][DATASIZE*0], mm6 //store
  535. punpckldq mm7, mm2 //| |R1| |R0|
  536. movq mm1, qwTmp5_Z4_Odd
  537. punpckhdq mm5, mm2 //| |R3| |R2|
  538. movq mm6, qwTemp2 //restore
  539. packssdw mm7, mm5
  540. movq mm5, Const_1024
  541. paddd mm3, mm0 //tmp5 + z2 + z4 EVEN
  542. // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
  543. movq [edi][DATASIZE*3], mm7 //store
  544. paddd mm1, mm4 //tmp5 + z2 + z4 ODD
  545. movq mm7, qwTemp4 //restore
  546. paddd mm3, mm5 //rounding adj
  547. movq [edi][DATASIZE*2], mm6 //store
  548. paddd mm1, mm5 //rounding adj
  549. movq [edi][DATASIZE*4], mm7 //store
  550. psrad mm3, 11 // descale it | |R2| |R0|
  551. movq mm6, qwTemp6 //restore
  552. psrad mm1, 11 // descale it | |R3| |R1|
  553. movq mm0, [edi][DATASIZE*0+16]
  554. movq mm5, mm3
  555. movq [edi][DATASIZE*6], mm6 //store
  556. punpckldq mm3, mm1 //| |R1| |R0|
  557. paddw mm0, [edi][DATASIZE*7+16]
  558. punpckhdq mm5, mm1 //| |R3| |R2|
  559. movq mm1, [edi][DATASIZE*1+16]
  560. packssdw mm3, mm5
  561. paddw mm1, [edi][DATASIZE*6+16]
  562. movq mm7, mm0
  563. movq [edi][DATASIZE*5], mm3 //store
  564. movq mm6, mm1
  565. //******************************************************************************
  566. // This completes 4x8 dct locations. Copy to do other 4x8.
  567. //******************************************************************************
  568. // tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
  569. movq mm3, [edi][DATASIZE*3+16]
  570. paddw mm3, [edi][DATASIZE*4+16]
  571. movq mm2, [edi][DATASIZE*2+16]
  572. paddw mm0, mm3 //tmp10
  573. paddw mm2, [edi][DATASIZE*5+16]
  574. psubw mm7, mm3 //tmp13
  575. // tmp10 = tmp0 + tmp3;
  576. paddw mm1, mm2 //tmp11
  577. psubw mm6, mm2 //tmp12
  578. // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
  579. // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
  580. movq mm3, mm0
  581. paddw mm0, mm1 //tmp10 + tmp11
  582. psubw mm3, mm1 //tmp10 - tmp11
  583. psllw mm0, 2 // descale it
  584. movq mm1, mm6 //copy tmp12
  585. psllw mm3, 2 // descale it
  586. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  587. movq qwTemp0, mm0 //store
  588. paddw mm1, mm7 //tmp12 + tmp13
  589. //;;; movq [edi][DATASIZE*6+16], mm4 ; store w6---3,5,6,7
  590. movq mm2, mm1 //copy
  591. // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
  592. // CONST_BITS-PASS1_BITS);
  593. // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
  594. // CONST_BITS-PASS1_BITS);
  595. pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
  596. movq mm4, mm7
  597. pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
  598. movq mm0, mm6
  599. pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
  600. pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
  601. pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
  602. paddd mm7, mm1 // add z1
  603. pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
  604. paddd mm7, Const_1024
  605. paddd mm4, mm2
  606. paddd mm4, Const_1024
  607. psrad mm7, 11 // descale it | |R2| |R0|
  608. //!!!!!! Negate the results in mm6 and mm0
  609. pxor mm6, Const_FFFF //invert result
  610. psrad mm4, 11 // descale it | |R3| |R1|
  611. paddd mm6, Const_1 // 2's complement
  612. movq mm5, mm7
  613. pxor mm0, Const_FFFF //invert result
  614. punpckldq mm7, mm4 //| |R1| |R0|
  615. paddd mm0, Const_1 // 2's complement
  616. punpckhdq mm5, mm4 //| |R3| |R2|
  617. movq qwTemp4, mm3 //store
  618. packssdw mm7, mm5
  619. movq mm5, Const_1024
  620. paddd mm6, mm1 // add z1
  621. movq qwTemp2, mm7 //store
  622. paddd mm0, mm2
  623. movq mm4, [edi][DATASIZE*3+16]
  624. paddd mm6, mm5
  625. psubw mm4, [edi][DATASIZE*4+16]
  626. psrad mm6, 11 // descale it | |R2| |R0|
  627. paddd mm0, mm5
  628. movq mm5, mm6
  629. movq mm7, [edi][DATASIZE*0+16]
  630. psrad mm0, 11 // descale it | |R3| |R1|
  631. psubw mm7, [edi][DATASIZE*7+16]
  632. punpckldq mm6, mm0 //| |R1| |R0|
  633. punpckhdq mm5, mm0 //| |R3| |R2|
  634. movq mm0, mm4
  635. packssdw mm6, mm5
  636. movq mm2, mm4
  637. // tmp4 = dataptr[3] - dataptr[4];
  638. movq mm5, [edi][DATASIZE*2+16]
  639. paddw mm0, mm7 //z1
  640. psubw mm5, [edi][DATASIZE*5+16]
  641. movq qwTemp6, mm6 //store
  642. movq mm1, mm5
  643. movq mm6, [edi][DATASIZE*1+16]
  644. movq mm3, mm5
  645. // z1 = tmp4 + tmp7;
  646. psubw mm6, [edi][DATASIZE*6+16]
  647. paddw mm3, mm7 //z4
  648. movq qwScratch7, mm7
  649. paddw mm2, mm6 //z3
  650. movq qwScratch5, mm5
  651. paddw mm1, mm6 //z2
  652. // z3 = MULTIPLY(z3, - FIX_1_961570560);
  653. // z4 = MULTIPLY(z4, - FIX_0_390180644);
  654. // z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
  655. movq qwZ1, mm0 //store
  656. movq mm5, mm2
  657. movq qwZ2, mm1
  658. movq mm7, mm2
  659. movq mm0, Const_FFFF
  660. paddw mm2, mm3 //z3 + z4
  661. pmaddwd mm5, Const_0xFIX_1_961570560 //z32, z30
  662. movq mm1, mm3
  663. pmaddwd mm7, Const_FIX_1_961570560x0 //z33, z31
  664. movq qwScratch6, mm6
  665. movq mm6, mm2
  666. // z3 += z5//
  667. //!!!!!! Negate the results
  668. pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
  669. pxor mm5, mm0 //invert result
  670. paddd mm5, Const_1 // 2's complement
  671. pxor mm7, mm0 //invert result
  672. pmaddwd mm3, Const_0xFIX_0_390180644 //z42, z40
  673. pmaddwd mm1, Const_FIX_0_390180644x0 //z43, z41
  674. paddd mm5, mm2 //z3_even
  675. paddd mm7, Const_1 // 2's complement
  676. pmaddwd mm6, Const_FIX_1_175875602x0 //z53, z51
  677. pxor mm3, mm0 //invert result
  678. // z4 += z5;
  679. //!!!!!! Negate the results
  680. paddd mm3, Const_1 // 2's complement
  681. pxor mm1, mm0 //invert result
  682. paddd mm1, Const_1 // 2's complement
  683. paddd mm3, mm2
  684. movq mm0, qwScratch6
  685. movq mm2, mm4
  686. // tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
  687. pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
  688. paddd mm7, mm6 //z3_odd
  689. pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
  690. paddd mm1, mm6
  691. movq mm6, mm0
  692. paddd mm4, mm5
  693. // tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
  694. pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
  695. paddd mm2, mm7
  696. pmaddwd mm0, Const_FIX_3_072711026x0 //T63, T61
  697. movq qwTmp4_Z3_Odd, mm2
  698. movq qwTmp4_Z3_Even, mm4
  699. paddd mm6, mm5
  700. movq mm5, qwScratch5
  701. paddd mm0, mm7
  702. movq mm7, qwScratch7
  703. movq mm2, mm5
  704. movq qwTmp6_Z3_Even, mm6
  705. movq mm6, mm7
  706. // tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
  707. // tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
  708. pmaddwd mm5, Const_0xFIX_2_053119869 //T52, T50
  709. pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
  710. pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
  711. pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
  712. paddd mm5, mm3
  713. movq qwTmp6_Z3_Odd, mm0
  714. paddd mm2, mm1
  715. movq qwTmp5_Z4_Even, mm5
  716. paddd mm7, mm3
  717. movq mm0, qwZ1
  718. paddd mm6, mm1
  719. // z1 = MULTIPLY(z1, - FIX_0_899976223);
  720. movq mm1, Const_FFFF
  721. movq mm4, mm0
  722. //!!!!!! Negate the results
  723. pmaddwd mm0, Const_0xFIX_0_899976223 //z12, z10
  724. pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
  725. movq mm3, qwTmp4_Z3_Even
  726. movq qwTmp5_Z4_Odd, mm2
  727. pxor mm0, mm1 //invert result
  728. movq mm2, qwTmp4_Z3_Odd
  729. pxor mm4, mm1 //invert result
  730. paddd mm4, Const_1 // 2's complement
  731. paddd mm7, mm0 //tmp7 + z1 + z4 EVEN
  732. paddd mm0, Const_1 // 2's complement
  733. paddd mm6, mm4 //tmp7 + z1 + z4 ODD
  734. // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
  735. paddd mm7, Const_1024 //rounding adj
  736. paddd mm3, mm0 //tmp4 + z1 + z3 EVEN
  737. paddd mm6, Const_1024 //rounding adj
  738. psrad mm7, 11 // descale it | |R2| |R0|
  739. psrad mm6, 11 // descale it | |R3| |R1|
  740. movq mm5, mm7
  741. punpckldq mm7, mm6 //| |R1| |R0|
  742. // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
  743. punpckhdq mm5, mm6 //| |R3| |R2|
  744. paddd mm2, mm4 //tmp4 + z1 + z3 ODD
  745. paddd mm3, Const_1024 //rounding adj
  746. packssdw mm7, mm5
  747. paddd mm2, Const_1024 //rounding adj
  748. psrad mm3, 11 // descale it | |R2| |R0|
  749. movq mm0, qwZ2
  750. psrad mm2, 11 // descale it | |R3| |R1|
  751. movq mm5, mm3
  752. movq mm4, mm0
  753. // z2 = MULTIPLY(z2, - FIX_2_562915447);
  754. pmaddwd mm0, Const_0xFIX_2_562915447 //z22, z20
  755. punpckldq mm3, mm2 //| |R1| |R0|
  756. pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
  757. punpckhdq mm5, mm2 //| |R3| |R2|
  758. movq mm2, Const_FFFF
  759. packssdw mm3, mm5
  760. movq [edi][DATASIZE*1+16], mm7 //store
  761. //!!!!!! Negate the results
  762. pxor mm0, mm2 //invert result
  763. movq mm5, Const_1
  764. pxor mm4, mm2 //invert result
  765. movq [edi][DATASIZE*7+16], mm3 //store
  766. paddd mm0, mm5 // 2's complement
  767. movq mm7, qwTmp6_Z3_Even
  768. paddd mm4, mm5 // 2's complement
  769. // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
  770. movq mm2, qwTmp6_Z3_Odd
  771. paddd mm7, mm0 //tmp6 + z2 + z3 EVEN
  772. paddd mm7, Const_1024 //rounding adj
  773. paddd mm2, mm4 //tmp6 + z2 + z3 ODD
  774. paddd mm2, Const_1024 //rounding adj
  775. psrad mm7, 11 // descale it | |R2| |R0|
  776. movq mm6, qwTemp0 //restore
  777. psrad mm2, 11 // descale it | |R3| |R1|
  778. movq mm5, mm7
  779. movq [edi][DATASIZE*0+16], mm6 //store
  780. punpckldq mm7, mm2 //| |R1| |R0|
  781. movq mm3, qwTmp5_Z4_Even
  782. punpckhdq mm5, mm2 //| |R3| |R2|
  783. movq mm1, qwTmp5_Z4_Odd
  784. packssdw mm7, mm5
  785. movq mm6, qwTemp2 //restore
  786. paddd mm3, mm0 //tmp5 + z2 + z4 EVEN
  787. // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
  788. movq mm0, Const_1024
  789. paddd mm1, mm4 //tmp5 + z2 + z4 ODD
  790. movq [edi][DATASIZE*3+16], mm7 //store
  791. paddd mm3, mm0 //rounding adj
  792. movq mm7, qwTemp4 //restore
  793. paddd mm1, mm0 //rounding adj
  794. movq [edi][DATASIZE*2+16], mm6 //store
  795. psrad mm3, 11 // descale it | |R2| |R0|
  796. movq mm6, qwTemp6 //restore
  797. psrad mm1, 11 // descale it | |R3| |R1|
  798. movq [edi][DATASIZE*4+16], mm7 //store
  799. movq mm5, mm3
  800. movq [edi][DATASIZE*6+16], mm6 //store
  801. punpckldq mm3, mm1 //| |R1| |R0|
  802. punpckhdq mm5, mm1 //| |R3| |R2|
  803. movq mm0, mm7 // copy w4---0,1,3,5,6
  804. movq mm1, [edi][DATASIZE*7+16]
  805. packssdw mm3, mm5
  806. movq [edi][DATASIZE*5+16], mm3 //store
  807. punpcklwd mm7, mm3 //mm6 = w5
  808. //******************************************************************************
  809. //******************************************************************************
  810. // This completes all 8x8 dct locations for the row case.
  811. // Now transpose the data for the columns.
  812. //******************************************************************************
  813. // transpose the bottom right quadrant(4X4) of the matrix
  814. // --------- ---------
  815. // | M1 | M2 | | M1'| M3'|
  816. // --------- --> ---------
  817. // | M3 | M4 | | M2'| M4'|
  818. // --------- ---------
  819. movq mm4, mm7 //---0,2,3,4,5,6,7
  820. punpckhwd mm0, mm3 //---0,1,3,5,6
  821. movq mm2, mm6 //---0,1,2,3,5,6 w6
  822. punpcklwd mm6, mm1 //mm1 = w7
  823. // tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
  824. movq mm5, [edi][DATASIZE*5]
  825. punpckldq mm7, mm6 // transposed w4
  826. punpckhdq mm4, mm6 // transposed w5---0,2,4,6,7
  827. movq mm6, mm0 //---0,2,3,4,6,7
  828. movq [edi][DATASIZE*4+16], mm7 // store w4
  829. punpckhwd mm2, mm1 //---0,2,3,5,6,7
  830. movq [edi][DATASIZE*5+16], mm4 // store w5
  831. punpckldq mm0, mm2 // transposed w6
  832. movq mm7, [edi][DATASIZE*4]
  833. punpckhdq mm6, mm2 // transposed w7---0,3,6,7
  834. movq [edi][DATASIZE*6+16], mm0 // store w6---3,5,6,7
  835. movq mm0, mm7 // copy w0---0,1,3,5,6
  836. movq [edi][DATASIZE*7+16], mm6 // store w7---5,6,7
  837. punpcklwd mm7, mm5 //mm6 = w1
  838. // transpose the bottom left quadrant(4X4) of the matrix and place
  839. // in the top right quadrant while doing the same for the top
  840. // right quadrant
  841. // --------- ---------
  842. // | M1 | M2 | | M1'| M3'|
  843. // --------- --> ---------
  844. // | M3 | M4 | | M2'| M4'|
  845. // --------- ---------
  846. movq mm3, [edi][DATASIZE*6]
  847. punpckhwd mm0, mm5 //---0,1,3,5,6
  848. movq mm1, [edi][DATASIZE*7]
  849. movq mm2, mm3 //---0,1,2,3,5,6 w2
  850. movq mm6, [edi][DATASIZE*0+16]
  851. punpcklwd mm3, mm1 //mm1 = w3
  852. movq mm5, [edi][DATASIZE*1+16]
  853. punpckhwd mm2, mm1 //---0,2,3,5,6,7
  854. movq mm4, mm7 //---0,2,3,4,5,6,7
  855. punpckldq mm7, mm3 // transposed w4
  856. punpckhdq mm4, mm3 // transposed w5---0,2,4,6,7
  857. movq mm3, mm0 //---0,2,3,4,6,7
  858. movq [edi][DATASIZE*0+16], mm7 // store w4
  859. punpckldq mm0, mm2 // transposed w6
  860. movq mm1, [edi][DATASIZE*2+16]
  861. punpckhdq mm3, mm2 // transposed w7---0,3,6,7
  862. movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
  863. movq mm0, mm6 // copy w4---0,1,3,5,6
  864. movq mm7, [edi][DATASIZE*3+16]
  865. punpcklwd mm6, mm5 //mm6 = w5
  866. movq [edi][DATASIZE*1+16], mm4 // store w5
  867. punpckhwd mm0, mm5 //---0,1,3,5,6
  868. // transpose the top right quadrant(4X4) of the matrix
  869. // --------- ---------
  870. // | M1 | M2 | | M1'| M3'|
  871. // --------- --> ---------
  872. // | M3 | M4 | | M2'| M4'|
  873. // --------- ---------
  874. movq mm2, mm1 //---0,1,2,3,5,6 w6
  875. punpcklwd mm1, mm7 //mm1 = w7
  876. movq mm4, mm6 //---0,2,3,4,5,6,7
  877. punpckldq mm6, mm1 // transposed w4
  878. movq [edi][DATASIZE*3+16], mm3 // store w7---5,6,7
  879. punpckhdq mm4, mm1 // transposed w5---0,2,4,6,7
  880. movq [edi][DATASIZE*4], mm6 // store w4
  881. punpckhwd mm2, mm7 //---0,2,3,5,6,7
  882. movq mm7, [edi][DATASIZE*0]
  883. movq mm1, mm0 //---0,2,3,4,6,7
  884. movq mm3, [edi][DATASIZE*1]
  885. punpckldq mm0, mm2 // transposed w6
  886. movq [edi][DATASIZE*5], mm4 // store w5
  887. punpckhdq mm1, mm2 // transposed w7---0,3,6,7
  888. movq [edi][DATASIZE*6], mm0 // store w6---3,5,6,7
  889. movq mm2, mm7 // copy w0---0,1,3,5,6
  890. movq mm4, [edi][DATASIZE*3]
  891. punpcklwd mm7, mm3 //mm6 = w1
  892. // transpose the top left quadrant(4X4) of the matrix
  893. // --------- ---------
  894. // | M1 | M2 | | M1'| M3'|
  895. // --------- --> ---------
  896. // | M3 | M4 | | M2'| M4'|
  897. // --------- ---------
  898. movq mm6, [edi][DATASIZE*2]
  899. punpckhwd mm2, mm3 //---0,1,3,5,6
  900. movq mm0, mm6 //---0,1,2,3,5,6 w2
  901. punpcklwd mm6, mm4 //mm1 = w3
  902. movq [edi][DATASIZE*7], mm1 // store w7---5,6,7
  903. punpckhwd mm0, mm4 //---0,2,3,5,6,7
  904. movq mm1, mm7 //---0,2,3,4,5,6,7
  905. punpckldq mm7, mm6 // transposed w4
  906. punpckhdq mm1, mm6 // transposed w5---0,2,4,6,7
  907. movq mm6, mm2 //---0,2,3,4,6,7
  908. movq [edi][DATASIZE*0], mm7 // store w4
  909. punpckldq mm2, mm0 // transposed w6
  910. paddw mm7, [edi][DATASIZE*7]
  911. punpckhdq mm6, mm0 // transposed w7---0,3,6,7
  912. movq [edi][DATASIZE*3], mm6 // store w7---5,6,7
  913. movq mm4, mm7
  914. paddw mm6, [edi][DATASIZE*4]
  915. movq [edi][DATASIZE*1], mm1 // store w5
  916. paddw mm7, mm6 //tmp10
  917. //******************************************************************************
  918. // This begins the column dct
  919. //******************************************************************************
  920. paddw mm1, [edi][DATASIZE*6]
  921. psubw mm4, mm6 //tmp13
  922. movq [edi][DATASIZE*2], mm2 // store w6---3,5,6,7
  923. movq mm6, mm1
  924. paddw mm2, [edi][DATASIZE*5]
  925. movq mm3, mm7
  926. paddw mm1, mm2 //tmp11
  927. psubw mm6, mm2 //tmp12
  928. // dataptr[DATASIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
  929. // dataptr[DATASIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
  930. paddw mm7, mm1 //tmp10 + tmp11
  931. paddw mm7, Const_2 // round add 2 to each element
  932. psubw mm3, mm1 //tmp10 - tmp11
  933. paddw mm3, Const_2 // round add 2 to each element
  934. psraw mm7, 2 // descale it
  935. // unpack word to dword sign extended
  936. movq mm5, mm7
  937. punpcklwd mm7, mm7
  938. psrad mm7, 16 // even results store in Temp0
  939. punpckhwd mm5, mm5
  940. psrad mm5, 16 // odd results store in array
  941. movq mm1, mm6 //copy tmp12
  942. movq qwTemp0, mm7 //store
  943. psraw mm3, 2 // descale it
  944. movq [edi][DATASIZE*0+8], mm5
  945. movq mm5, mm3
  946. punpcklwd mm3, mm3
  947. paddw mm1, mm4 //tmp12 + tmp13
  948. psrad mm3, 16 // even results store in Temp4
  949. movq mm2, mm1 //copy
  950. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  951. pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
  952. punpckhwd mm5, mm5
  953. pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
  954. movq mm7, mm4
  955. // dataptr[DATASIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
  956. // CONST_BITS+PASS1_BITS);
  957. pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
  958. psrad mm5, 16 // odd results store in array
  959. pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
  960. movq mm0, mm6
  961. // dataptr[DATASIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
  962. // CONST_BITS+PASS1_BITS);
  963. pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
  964. movq qwTemp4, mm3 //store
  965. paddd mm4, mm2
  966. paddd mm4, Const_16384
  967. paddd mm7, mm1 // add z1
  968. paddd mm7, Const_16384
  969. psrad mm4, 15 // descale it | |R3| |R1|
  970. movq [edi][DATASIZE*4+8], mm5
  971. psrad mm7, 15 // descale it | |R2| |R0|
  972. pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
  973. movq mm5, mm7
  974. //!!!!!! Negate result
  975. movq mm3, Const_1
  976. punpckldq mm7, mm4 //| |R1| |R0|
  977. pxor mm6, Const_FFFF //invert result
  978. punpckhdq mm5, mm4 //| |R3| |R2|
  979. movq qwTemp2, mm7 //store
  980. paddd mm6, mm3 // 2's complement
  981. pxor mm0, Const_FFFF //invert result
  982. paddd mm6, mm1 // add z1
  983. movq [edi][DATASIZE*2+8], mm5 //write out 2nd half in unused memory
  984. paddd mm0, mm3 // 2's complement
  985. movq mm3, Const_16384
  986. paddd mm0, mm2
  987. movq mm7, [edi][DATASIZE*0]
  988. paddd mm6, mm3
  989. movq mm4, [edi][DATASIZE*3]
  990. paddd mm0, mm3
  991. psubw mm7, [edi][DATASIZE*7]
  992. psrad mm6, 15 // descale it | |R2| |R0|
  993. psubw mm4, [edi][DATASIZE*4]
  994. psrad mm0, 15 // descale it | |R3| |R1|
  995. movq mm3, [edi][DATASIZE*2]
  996. movq mm5, mm6
  997. psubw mm3, [edi][DATASIZE*5]
  998. punpckldq mm6, mm0 //| |R1| |R0|
  999. punpckhdq mm5, mm0 //| |R3| |R2|
  1000. movq mm0, mm4
  1001. movq qwTemp6, mm6 //store
  1002. movq mm2, mm4
  1003. // tmp4 = dataptr[3] - dataptr[4];
  1004. // z1 = tmp4 + tmp7;
  1005. movq mm6, [edi][DATASIZE*1]
  1006. paddw mm0, mm7 //z1
  1007. movq [edi][DATASIZE*6+8], mm5 //write out 2nd half in unused memory
  1008. movq mm1, mm3
  1009. psubw mm6, [edi][DATASIZE*6]
  1010. movq mm5, mm3
  1011. movq qwZ1, mm0 //store
  1012. paddw mm5, mm7 //z4
  1013. movq qwScratch7, mm7
  1014. paddw mm1, mm6 //z2
  1015. movq qwScratch5, mm3
  1016. paddw mm2, mm6 //z3
  1017. movq qwZ2, mm1
  1018. movq mm3, mm2
  1019. // z3 = MULTIPLY(z3, - FIX_1_961570560);
  1020. // z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
  1021. // z4 = MULTIPLY(z4, - FIX_0_390180644);
  1022. movq qwScratch6, mm6
  1023. movq mm1, mm2
  1024. pmaddwd mm3, Const_0xFIX_1_961570560 //z32, z30
  1025. movq mm7, mm5
  1026. movq mm6, Const_FFFF
  1027. paddw mm2, mm5 //z3 + z4
  1028. pmaddwd mm1, Const_FIX_1_961570560x0 //z33, z31
  1029. movq mm0, mm2
  1030. pmaddwd mm7, Const_FIX_0_390180644x0 //z43, z41
  1031. //!!!!!! Negate the results
  1032. pxor mm3, mm6 //invert result
  1033. pmaddwd mm5, Const_0xFIX_0_390180644 //z42, z40
  1034. pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
  1035. pxor mm1, mm6 //invert result
  1036. pmaddwd mm0, Const_FIX_1_175875602x0 //z53, z51
  1037. //!!!!!! Negate the results
  1038. pxor mm7, mm6 //invert result
  1039. paddd mm3, Const_1 // 2's complement
  1040. pxor mm5, mm6 //invert result
  1041. // z3 += z5;
  1042. paddd mm1, Const_1 // 2's complement
  1043. paddd mm3, mm2 //z3_even
  1044. paddd mm5, Const_1 // 2's complement
  1045. paddd mm1, mm0 //z3_odd
  1046. // z4 += z5;
  1047. paddd mm7, Const_1 // 2's complement
  1048. paddd mm5, mm2
  1049. paddd mm7, mm0
  1050. movq mm2, mm4
  1051. // tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
  1052. pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
  1053. pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
  1054. movq qwZ4_even, mm5
  1055. movq qwZ4_odd, mm7
  1056. paddd mm4, mm3
  1057. movq mm6, qwScratch6
  1058. paddd mm2, mm1
  1059. movq qwTmp4_Z3_Even, mm4
  1060. movq mm5, mm6
  1061. // tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
  1062. pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
  1063. pmaddwd mm5, Const_FIX_3_072711026x0 //T63, T61
  1064. movq qwTmp4_Z3_Odd, mm2
  1065. movq mm4, qwZ4_even
  1066. paddd mm6, mm3
  1067. movq mm3, qwScratch5
  1068. paddd mm5, mm1
  1069. movq qwTmp6_Z3_Even, mm6
  1070. movq mm2, mm3
  1071. // tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
  1072. pmaddwd mm3, Const_0xFIX_2_053119869 //T52, T50
  1073. pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
  1074. movq qwTmp6_Z3_Odd, mm5
  1075. movq mm0, qwZ4_odd
  1076. paddd mm3, mm4
  1077. movq mm7, qwScratch7
  1078. paddd mm2, mm0
  1079. movq qwTmp5_Z4_Even, mm3
  1080. movq mm6, mm7
  1081. // tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
  1082. pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
  1083. pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
  1084. movq mm3, qwZ1
  1085. movq qwTmp5_Z4_Odd, mm2
  1086. paddd mm7, mm4
  1087. movq mm5, Const_FFFF
  1088. movq mm4, mm3
  1089. // z1 = MULTIPLY(z1, - FIX_0_899976223);
  1090. pmaddwd mm3, Const_0xFIX_0_899976223 //z12, z10
  1091. paddd mm6, mm0
  1092. pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
  1093. movq mm2, qwTmp4_Z3_Odd
  1094. //!!!!!! Negate the results
  1095. pxor mm3, mm5 //invert result
  1096. paddd mm3, Const_1 // 2's complement
  1097. pxor mm4, mm5 //invert result
  1098. paddd mm4, Const_1 // 2's complement
  1099. paddd mm7, mm3 //tmp7 + z1 + z4 EVEN
  1100. // dataptr[DATASIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
  1101. // CONST_BITS+PASS1_BITS);
  1102. paddd mm7, Const_16384 //rounding adj
  1103. paddd mm6, mm4 //tmp7 + z1 + z4 ODD
  1104. paddd mm6, Const_16384 //rounding adj
  1105. psrad mm7, 15 // descale it | |R2| |R0|
  1106. movq mm0, qwTmp4_Z3_Even
  1107. psrad mm6, 15 // descale it | |R3| |R1|
  1108. paddd mm0, mm3 //tmp4 + z1 + z3 EVEN
  1109. movq mm5, mm7
  1110. movq mm3, qwTemp0 //restore
  1111. punpckldq mm7, mm6 //| |R1| |R0|
  1112. paddd mm0, Const_16384 //rounding adj
  1113. paddd mm2, mm4 //tmp4 + z1 + z3 ODD
  1114. movq [edi][DATASIZE*0], mm3 //store
  1115. punpckhdq mm5, mm6 //| |R3| |R2|
  1116. // dataptr[DATASIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
  1117. // CONST_BITS+PASS1_BITS);
  1118. paddd mm2, Const_16384 //rounding adj
  1119. psrad mm0, 15 // descale it | |R2| |R0|
  1120. movq mm6, qwZ2
  1121. psrad mm2, 15 // descale it | |R3| |R1|
  1122. movq [edi][DATASIZE*1+8], mm5 //store
  1123. movq mm4, mm6
  1124. // z2 = MULTIPLY(z2, - FIX_2_562915447);
  1125. pmaddwd mm6, Const_0xFIX_2_562915447 //z22, z20
  1126. movq mm5, mm0
  1127. pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
  1128. punpckldq mm0, mm2 //| |R1| |R0|
  1129. movq mm3, Const_FFFF
  1130. punpckhdq mm5, mm2 //| |R3| |R2|
  1131. movq [edi][DATASIZE*1], mm7 //store
  1132. //!!!!!! Negate the results
  1133. pxor mm6, mm3 //invert result
  1134. movq mm1, Const_1
  1135. pxor mm4, mm3 //invert result
  1136. movq mm7, qwTmp6_Z3_Even
  1137. paddd mm6, mm1 // 2's complement
  1138. movq mm2, qwTmp6_Z3_Odd
  1139. paddd mm4, mm1 // 2's complement
  1140. // dataptr[DATASIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
  1141. // CONST_BITS+PASS1_BITS);
  1142. movq [edi][DATASIZE*7], mm0 //store
  1143. paddd mm7, mm6 //tmp6 + z2 + z3 EVEN
  1144. movq mm1, Const_16384
  1145. paddd mm2, mm4 //tmp6 + z2 + z3 ODD
  1146. movq mm3, qwTemp2 //restore
  1147. paddd mm7, mm1 //rounding adj
  1148. movq [edi][DATASIZE*7+8], mm5 //store
  1149. paddd mm2, mm1 //rounding adj
  1150. movq [edi][DATASIZE*2], mm3 //store
  1151. psrad mm7, 15 // descale it | |R2| |R0|
  1152. movq mm0, qwTemp4 //restore
  1153. psrad mm2, 15 // descale it | |R3| |R1|
  1154. movq mm3, qwTmp5_Z4_Even
  1155. movq mm5, mm7
  1156. movq [edi][DATASIZE*4], mm0 //store
  1157. paddd mm3, mm6 //tmp5 + z2 + z4 EVEN
  1158. movq mm6, qwTmp5_Z4_Odd
  1159. punpckldq mm7, mm2 //| |R1| |R0|
  1160. punpckhdq mm5, mm2 //| |R3| |R2|
  1161. paddd mm6, mm4 //tmp5 + z2 + z4 ODD
  1162. movq [edi][DATASIZE*3], mm7 //store
  1163. paddd mm3, mm1 //rounding adj
  1164. // dataptr[DATASIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
  1165. // CONST_BITS+PASS1_BITS);
  1166. movq mm0, qwTemp6 //restore
  1167. paddd mm6, mm1 //rounding adj
  1168. movq [edi][DATASIZE*3+8], mm5 //store
  1169. psrad mm3, 15 // descale it | |R2| |R0|
  1170. movq [edi][DATASIZE*6], mm0 //store
  1171. psrad mm6, 15 // descale it | |R3| |R1|
  1172. movq mm7, [edi][DATASIZE*0+16]
  1173. movq mm5, mm3
  1174. paddw mm7, [edi][DATASIZE*7+16]
  1175. punpckldq mm3, mm6 //| |R1| |R0|
  1176. movq mm1, [edi][DATASIZE*1+16]
  1177. punpckhdq mm5, mm6 //| |R3| |R2|
  1178. paddw mm1, [edi][DATASIZE*6+16]
  1179. movq mm4, mm7
  1180. //******************************************************************************
  1181. // This completes 4x8 dct locations. Copy to do other 4x8.
  1182. //******************************************************************************
  1183. movq mm6, [edi][DATASIZE*3+16]
  1184. paddw mm6, [edi][DATASIZE*4+16]
  1185. movq mm2, [edi][DATASIZE*2+16]
  1186. psubw mm4, mm6 //tmp13
  1187. paddw mm2, [edi][DATASIZE*5+16]
  1188. paddw mm7, mm6 //tmp10
  1189. movq [edi][DATASIZE*5], mm3 //store
  1190. movq mm6, mm1
  1191. movq [edi][DATASIZE*5+8], mm5 //store
  1192. paddw mm1, mm2 //tmp11
  1193. psubw mm6, mm2 //tmp12
  1194. movq mm3, mm7
  1195. // dataptr[DATASIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
  1196. // dataptr[DATASIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
  1197. paddw mm7, mm1 //tmp10 + tmp11
  1198. paddw mm7, Const_2 // round add 2 to each element
  1199. psubw mm3, mm1 //tmp10 - tmp11
  1200. paddw mm3, Const_2 // round add 2 to each element
  1201. psraw mm7, 2 // descale it
  1202. // unpack word to dword sign extended
  1203. movq mm5, mm7
  1204. punpcklwd mm7, mm7
  1205. psrad mm7, 16 // even results store in Temp0
  1206. punpckhwd mm5, mm5
  1207. psrad mm5, 16 // odd results store in array
  1208. movq mm1, mm6 //copy tmp12
  1209. movq qwTemp0, mm7 //store
  1210. psraw mm3, 2 // descale it
  1211. movq [edi][DATASIZE*0+24], mm5
  1212. movq mm5, mm3
  1213. punpcklwd mm3, mm3
  1214. paddw mm1, mm4 //tmp12 + tmp13
  1215. psrad mm3, 16 // even results store in Temp4
  1216. movq mm2, mm1 //copy
  1217. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  1218. pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
  1219. punpckhwd mm5, mm5
  1220. pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
  1221. movq mm7, mm4
  1222. // dataptr[DATASIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
  1223. // CONST_BITS+PASS1_BITS);
  1224. pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
  1225. psrad mm5, 16 // odd results store in array
  1226. pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
  1227. movq mm0, mm6
  1228. // dataptr[DATASIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
  1229. // CONST_BITS+PASS1_BITS);
  1230. pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
  1231. movq qwTemp4, mm3 //store
  1232. paddd mm4, mm2
  1233. paddd mm4, Const_16384
  1234. paddd mm7, mm1 // add z1
  1235. paddd mm7, Const_16384
  1236. psrad mm4, 15 // descale it | |R3| |R1|
  1237. movq [edi][DATASIZE*4+24], mm5
  1238. psrad mm7, 15 // descale it | |R2| |R0|
  1239. pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
  1240. movq mm5, mm7
  1241. //!!!!!! Negate result
  1242. movq mm3, Const_1
  1243. punpckldq mm7, mm4 //| |R1| |R0|
  1244. pxor mm6, Const_FFFF //invert result
  1245. punpckhdq mm5, mm4 //| |R3| |R2|
  1246. movq qwTemp2, mm7 //store
  1247. paddd mm6, mm3 // 2's complement
  1248. pxor mm0, Const_FFFF //invert result
  1249. paddd mm6, mm1 // add z1
  1250. movq [edi][DATASIZE*2+24], mm5 //write out 2nd half in unused memory
  1251. paddd mm0, mm3 // 2's complement
  1252. movq mm3, Const_16384
  1253. paddd mm0, mm2
  1254. movq mm7, [edi][DATASIZE*0+16]
  1255. paddd mm6, mm3
  1256. movq mm4, [edi][DATASIZE*3+16]
  1257. paddd mm0, mm3
  1258. psubw mm7, [edi][DATASIZE*7+16]
  1259. psrad mm6, 15 // descale it | |R2| |R0|
  1260. psubw mm4, [edi][DATASIZE*4+16]
  1261. psrad mm0, 15 // descale it | |R3| |R1|
  1262. movq mm3, [edi][DATASIZE*2+16]
  1263. movq mm5, mm6
  1264. psubw mm3, [edi][DATASIZE*5+16]
  1265. punpckldq mm6, mm0 //| |R1| |R0|
  1266. punpckhdq mm5, mm0 //| |R3| |R2|
  1267. movq mm0, mm4
  1268. movq qwTemp6, mm6 //store
  1269. movq mm2, mm4
  1270. // tmp4 = dataptr[3] - dataptr[4];
  1271. // z1 = tmp4 + tmp7;
  1272. movq mm6, [edi][DATASIZE*1+16]
  1273. paddw mm0, mm7 //z1
  1274. movq [edi][DATASIZE*6+24], mm5 //write out 2nd half in unused memory
  1275. movq mm1, mm3
  1276. psubw mm6, [edi][DATASIZE*6+16]
  1277. movq mm5, mm3
  1278. movq qwZ1, mm0 //store
  1279. paddw mm5, mm7 //z4
  1280. movq qwScratch7, mm7
  1281. paddw mm1, mm6 //z2
  1282. movq qwScratch5, mm3
  1283. paddw mm2, mm6 //z3
  1284. movq qwZ2, mm1
  1285. movq mm3, mm2
  1286. // z3 = MULTIPLY(z3, - FIX_1_961570560);
  1287. // z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
  1288. // z4 = MULTIPLY(z4, - FIX_0_390180644);
  1289. movq qwScratch6, mm6
  1290. movq mm1, mm2
  1291. pmaddwd mm3, Const_0xFIX_1_961570560 //z32, z30
  1292. movq mm7, mm5
  1293. movq mm6, Const_FFFF
  1294. paddw mm2, mm5 //z3 + z4
  1295. pmaddwd mm1, Const_FIX_1_961570560x0 //z33, z31
  1296. movq mm0, mm2
  1297. pmaddwd mm7, Const_FIX_0_390180644x0 //z43, z41
  1298. //!!!!!! Negate the results
  1299. pxor mm3, mm6 //invert result
  1300. pmaddwd mm5, Const_0xFIX_0_390180644 //z42, z40
  1301. pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
  1302. pxor mm1, mm6 //invert result
  1303. pmaddwd mm0, Const_FIX_1_175875602x0 //z53, z51
  1304. //!!!!!! Negate the results
  1305. pxor mm7, mm6 //invert result
  1306. paddd mm3, Const_1 // 2's complement
  1307. pxor mm5, mm6 //invert result
  1308. // z3 += z5;
  1309. paddd mm1, Const_1 // 2's complement
  1310. paddd mm3, mm2 //z3_even
  1311. paddd mm5, Const_1 // 2's complement
  1312. paddd mm1, mm0 //z3_odd
  1313. // z4 += z5;
  1314. paddd mm7, Const_1 // 2's complement
  1315. paddd mm5, mm2
  1316. paddd mm7, mm0
  1317. movq mm2, mm4
  1318. // tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
  1319. pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
  1320. pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
  1321. movq qwZ4_even, mm5
  1322. movq qwZ4_odd, mm7
  1323. paddd mm4, mm3
  1324. movq mm6, qwScratch6
  1325. paddd mm2, mm1
  1326. movq qwTmp4_Z3_Even, mm4
  1327. movq mm5, mm6
  1328. // tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
  1329. pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
  1330. pmaddwd mm5, Const_FIX_3_072711026x0 //T63, T61
  1331. movq qwTmp4_Z3_Odd, mm2
  1332. movq mm4, qwZ4_even
  1333. paddd mm6, mm3
  1334. movq mm3, qwScratch5
  1335. paddd mm5, mm1
  1336. movq qwTmp6_Z3_Even, mm6
  1337. movq mm2, mm3
  1338. // tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
  1339. pmaddwd mm3, Const_0xFIX_2_053119869 //T52, T50
  1340. pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
  1341. movq qwTmp6_Z3_Odd, mm5
  1342. movq mm0, qwZ4_odd
  1343. paddd mm3, mm4
  1344. movq mm7, qwScratch7
  1345. paddd mm2, mm0
  1346. movq qwTmp5_Z4_Even, mm3
  1347. movq mm6, mm7
  1348. // tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
  1349. pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
  1350. pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
  1351. movq mm3, qwZ1
  1352. movq qwTmp5_Z4_Odd, mm2
  1353. paddd mm7, mm4
  1354. movq mm5, Const_FFFF
  1355. movq mm4, mm3
  1356. // z1 = MULTIPLY(z1, - FIX_0_899976223);
  1357. pmaddwd mm3, Const_0xFIX_0_899976223 //z12, z10
  1358. paddd mm6, mm0
  1359. pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
  1360. movq mm2, qwTmp4_Z3_Odd
  1361. //!!!!!! Negate the results
  1362. pxor mm3, mm5 //invert result
  1363. paddd mm3, Const_1 // 2's complement
  1364. pxor mm4, mm5 //invert result
  1365. paddd mm4, Const_1 // 2's complement
  1366. paddd mm7, mm3 //tmp7 + z1 + z4 EVEN
  1367. // dataptr[DATASIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
  1368. // CONST_BITS+PASS1_BITS);
  1369. paddd mm7, Const_16384 //rounding adj
  1370. paddd mm6, mm4 //tmp7 + z1 + z4 ODD
  1371. paddd mm6, Const_16384 //rounding adj
  1372. psrad mm7, 15 // descale it | |R2| |R0|
  1373. movq mm0, qwTmp4_Z3_Even
  1374. psrad mm6, 15 // descale it | |R3| |R1|
  1375. paddd mm0, mm3 //tmp4 + z1 + z3 EVEN
  1376. movq mm5, mm7
  1377. movq mm3, qwTemp0 //restore
  1378. punpckldq mm7, mm6 //| |R1| |R0|
  1379. paddd mm0, Const_16384 //rounding adj
  1380. paddd mm2, mm4 //tmp4 + z1 + z3 ODD
  1381. movq [edi][DATASIZE*0+16], mm3 //store
  1382. punpckhdq mm5, mm6 //| |R3| |R2|
  1383. // dataptr[DATASIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
  1384. // CONST_BITS+PASS1_BITS);
  1385. paddd mm2, Const_16384 //rounding adj
  1386. psrad mm0, 15 // descale it | |R2| |R0|
  1387. movq mm6, qwZ2
  1388. psrad mm2, 15 // descale it | |R3| |R1|
  1389. movq [edi][DATASIZE*1+24], mm5 //store
  1390. movq mm4, mm6
  1391. // z2 = MULTIPLY(z2, - FIX_2_562915447);
  1392. pmaddwd mm6, Const_0xFIX_2_562915447 //z22, z20
  1393. movq mm5, mm0
  1394. pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
  1395. punpckldq mm0, mm2 //| |R1| |R0|
  1396. movq mm3, Const_FFFF
  1397. punpckhdq mm5, mm2 //| |R3| |R2|
  1398. movq [edi][DATASIZE*1+16], mm7 //store
  1399. //!!!!!! Negate the results
  1400. pxor mm6, mm3 //invert result
  1401. movq mm1, Const_1
  1402. pxor mm4, mm3 //invert result
  1403. movq mm7, qwTmp6_Z3_Even
  1404. paddd mm6, mm1 // 2's complement
  1405. movq mm2, qwTmp6_Z3_Odd
  1406. paddd mm4, mm1 // 2's complement
  1407. // dataptr[DATASIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
  1408. // CONST_BITS+PASS1_BITS);
  1409. movq [edi][DATASIZE*7+16], mm0 //store
  1410. paddd mm7, mm6 //tmp6 + z2 + z3 EVEN
  1411. movq mm1, Const_16384
  1412. paddd mm2, mm4 //tmp6 + z2 + z3 ODD
  1413. movq mm3, qwTemp2 //restore
  1414. paddd mm7, mm1 //rounding adj
  1415. movq [edi][DATASIZE*7+24], mm5 //store
  1416. paddd mm2, mm1 //rounding adj
  1417. movq [edi][DATASIZE*2+16], mm3 //store
  1418. psrad mm7, 15 // descale it | |R2| |R0|
  1419. movq mm3, qwTmp5_Z4_Even
  1420. psrad mm2, 15 // descale it | |R3| |R1|
  1421. movq mm5, mm7
  1422. paddd mm3, mm6 //tmp5 + z2 + z4 EVEN
  1423. movq mm6, qwTmp5_Z4_Odd
  1424. punpckldq mm7, mm2 //| |R1| |R0|
  1425. punpckhdq mm5, mm2 //| |R3| |R2|
  1426. paddd mm6, mm4 //tmp5 + z2 + z4 ODD
  1427. movq [edi][DATASIZE*3+16], mm7 //store
  1428. paddd mm3, mm1 //rounding adj
  1429. // dataptr[DATASIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
  1430. // CONST_BITS+PASS1_BITS);
  1431. movq mm7, qwTemp4 //restore
  1432. paddd mm6, mm1 //rounding adj
  1433. movq [edi][DATASIZE*3+24], mm5 //store
  1434. psrad mm3, 15 // descale it | |R2| |R0|
  1435. movq [edi][DATASIZE*4+16], mm7 //store
  1436. psrad mm6, 15 // descale it | |R3| |R1|
  1437. movq mm7, qwTemp6 //restore
  1438. movq mm5, mm3
  1439. punpckldq mm3, mm6 //| |R1| |R0|
  1440. movq [edi][DATASIZE*6+16], mm7 //store
  1441. punpckhdq mm5, mm6 //| |R3| |R2|
  1442. movq [edi][DATASIZE*5+16], mm3 //store
  1443. movq [edi][DATASIZE*5+24], mm5 //store
  1444. //******************************************************************************
  1445. // This completes all 8x8 dct locations for the column case.
  1446. //******************************************************************************
  1447. emms
  1448. }
  1449. }
  1450. #endif /* DCT_ISLOW_SUPPORTED */