Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1212 lines
31 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /***************************************************************************
  4. *
  5. * INTEL Corporation Proprietary Information
  6. *
  7. *
  8. * Copyright (c) 1996 Intel Corporation.
  9. * All rights reserved.
  10. *
  11. ***************************************************************************
  12. */
  13. /*
  14. * jfdctfst.c
  15. *
  16. * Copyright (C) 1994-1996, Thomas G. Lane.
  17. * This file is part of the Independent JPEG Group's software.
  18. * For conditions of distribution and use, see the accompanying README file.
  19. *
  20. * This file contains a fast, not so accurate integer implementation of the
  21. * forward DCT (Discrete Cosine Transform).
  22. *
  23. * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  24. * on each column. Direct algorithms are also available, but they are
  25. * much more complex and seem not to be any faster when reduced to code.
  26. *
  27. * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  28. * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
  29. * Japanese, but the algorithm is described in the Pennebaker & Mitchell
  30. * JPEG textbook (see REFERENCES section in file README). The following code
  31. * is based directly on figure 4-8 in P&M.
  32. * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  33. * possible to arrange the computation so that many of the multiplies are
  34. * simple scalings of the final outputs. These multiplies can then be
  35. * folded into the multiplications or divisions by the JPEG quantization
  36. * table entries. The AA&N method leaves only 5 multiplies and 29 adds
  37. * to be done in the DCT itself.
  38. * The primary disadvantage of this method is that with fixed-point math,
  39. * accuracy is lost due to imprecise representation of the scaled
  40. * quantization values. The smaller the quantization table entry, the less
  41. * precise the scaled value, so this implementation does worse with high-
  42. * quality-setting files than with low-quality ones.
  43. */
  44. #define JPEG_INTERNALS
  45. #include "jinclude.h"
  46. #include "jpeglib.h"
  47. #include "jdct.h" /* Private declarations for DCT subsystem */
  48. #ifdef DCT_IFAST_SUPPORTED
  49. /*
  50. * This module is specialized to the case DCTSIZE = 8.
  51. */
  52. #if DCTSIZE != 8
  53. Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  54. #endif
  55. /* Scaling decisions are generally the same as in the LL&M algorithm;
  56. * see jfdctint.c for more details. However, we choose to descale
  57. * (right shift) multiplication products as soon as they are formed,
  58. * rather than carrying additional fractional bits into subsequent additions.
  59. * This compromises accuracy slightly, but it lets us save a few shifts.
  60. * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
  61. * everywhere except in the multiplications proper; this saves a good deal
  62. * of work on 16-bit-int machines.
  63. *
  64. * Again to save a few shifts, the intermediate results between pass 1 and
  65. * pass 2 are not upscaled, but are represented only to integral precision.
  66. *
  67. * A final compromise is to represent the multiplicative constants to only
  68. * 8 fractional bits, rather than 13. This saves some shifting work on some
  69. * machines, and may also reduce the cost of multiplication (since there
  70. * are fewer one-bits in the constants).
  71. */
  72. #define CONST_BITS 8 /* number of fractional bits in the FIX_0_... / FIX_1_... constants below */
  73. /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
  74. * causing a lot of useless floating-point operations at run time.
  75. * To get around this we use the following pre-calculated constants.
  76. * If you change CONST_BITS you may want to add appropriate values.
  77. * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 *
 * Each pre-calculated value is the real constant times 2^CONST_BITS (256),
 * rounded to the nearest integer.
 * NOTE(review): the MMX routine shown in this file does not reference these
 * scalar macros; it uses the packed Const_FIX_0_... / Const_FIX_1_... quadwords
 * instead. They appear to be kept from the C version of this module.
  78. */
  79. #if CONST_BITS == 8
  80. #define FIX_0_382683433 98 /* FIX(0.382683433) */
  81. #define FIX_0_541196100 139 /* FIX(0.541196100) */
  82. #define FIX_0_707106781 181 /* FIX(0.707106781) */
  83. #define FIX_1_306562965 334 /* FIX(1.306562965) */
  84. #else
  85. #define FIX_0_382683433 FIX(0.382683433)
  86. #define FIX_0_541196100 FIX(0.541196100)
  87. #define FIX_0_707106781 FIX(0.707106781)
  88. #define FIX_1_306562965 FIX(1.306562965)
  89. #endif
// Packed multipliers for the MMX code: every quadword holds four copies of one
// 16-bit constant. pmulhw keeps only the high 16 bits of each 16x16 product
// (product >> 16). A multiplier stored as (FIX value << s), applied to an
// operand that was first shifted left by (8 - s), therefore yields
// (operand * FIX value) >> 8, i.e. a CONST_BITS descale without rounding.
// The smaller shifts are presumably used because 139 << 8, 181 << 8 and
// 334 << 7 would not fit in a positive signed 16-bit lane.
  90. //The following constant is shifted left 8 for the pmulhw instruction
  91. const __int64 Const_FIX_0_382683433 = 0x6200620062006200; // 98 << 8; operand is used unshifted
  92. //The following constants are shifted left 7 for the pmulhw instruction
  93. const __int64 Const_FIX_0_541196100 = 0x4580458045804580; // 139 << 7; operand is pre-shifted with psllw 1
  94. const __int64 Const_FIX_0_707106781 = 0x5a805a805a805a80; // 181 << 7; operand is pre-shifted with psllw 1
  95. //The following constant is shifted left 6 for the pmulhw instruction
  96. const __int64 Const_FIX_1_306562965 = 0x5380538053805380; // 334 << 6; operand is pre-shifted with psllw 2
  97. /* We can gain a little more speed, with a further compromise in accuracy,
  98. * by omitting the addition in a descaling shift. This yields an incorrectly
  99. * rounded result half the time...
  100. */
  101. // The assembly version makes this compromise.
  102. //#ifndef USE_ACCURATE_ROUNDING
  103. //#undef DESCALE
  104. //#define DESCALE(x,n) RIGHT_SHIFT(x, n)
  105. //#endif
  106. #define DATASIZE 32 /* byte distance between consecutive rows of the block, as used in the [edi][DATASIZE*row+offset] operands below */
  107. /* Multiply a DCTELEM variable by an INT32 constant, and immediately
  108. * descale to yield a DCTELEM result.
 * NOTE(review): the MMX routine shown in this file does not use this macro;
 * its multiplies are done with pmulhw and the packed constants.
  109. */
  110. #define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
  111. /*
  112. * Perform the forward DCT on one block of samples.
 *
 * MMX implementation (MSVC inline assembly); the block is transformed in
 * place.
 *
 * data: base address of the 8x8 block. The code addresses it as 8 rows of
 *       DATASIZE (32) bytes and reads and writes every element as a 32-bit
 *       value, i.e. it assumes sizeof(DCTELEM) == 4 -- TODO confirm against
 *       jdct.h.
 *
 * Outline:
 *   1. Each row is narrowed from 32-bit to 16-bit elements with packssdw
 *      (signed saturation) and the block is transposed as four 4x4
 *      quadrants. While packed, a row keeps its eight words in the two
 *      quadwords at byte offsets 0 and 16; offsets 8 and 24 are not used
 *      again until the final write-back.
 *   2. First 1-D pass ("row dct" in the comments below), four lanes per MMX
 *      register, done as two 4x8 halves (offset 0, then offset 16).
 *   3. The block is transposed again.
 *   4. Second 1-D pass ("column dct"), again in two halves. Every result is
 *      widened back to 32 bits (punpcklwd / punpckhwd of a register with
 *      itself, then psrad 16) and stored at offsets 0 and 8, or 16 and 24,
 *      of its row.
 *
 * Multiplications use pmulhw, which keeps only the high 16 bits of each
 * product, so products are truncated rather than rounded. Operands are
 * shifted left by 1 before the multiplies by Const_FIX_0_541196100 and
 * Const_FIX_0_707106781 and by 2 before Const_FIX_1_306562965, matching
 * the smaller shifts with which those constants are stored.
 *
 * Loads and stores of the same memory slots are interleaved with the
 * arithmetic, so the instruction order must not be changed casually.
 * The routine uses edi as the base pointer and all of mm0-mm7, and executes
 * emms before returning.
  113. */
  114. GLOBAL(void)
  115. mfdct8x8aan (DCTELEM * data)
  116. {
  117. __asm{
  118. mov edi, [data] // edi = base address of the 8x8 block
  119. // transpose the bottom right quadrant(4X4) of the matrix
  120. // --------- ---------
  121. // | M1 | M2 | | M1'| M3'|
  122. // --------- --> ---------
  123. // | M3 | M4 | | M2'| M4'|
  124. // --------- ---------
  125. // Get the 32-bit quantities and pack into 16 bits
  126. movq mm5, [edi][DATASIZE*4+16] //| w41 | w40 |
  127. movq mm3, [edi][DATASIZE*4+24] //| w43 | w42 |
  128. movq mm6, [edi][DATASIZE*5+16]
  129. packssdw mm5, mm3 //|w43|w42|w41|w40|
  130. movq mm7, [edi][DATASIZE*5+24]
  131. movq mm4, mm5 // copy w4---0,1,3,5,6
  132. movq mm3, [edi][DATASIZE*6+16]
  133. packssdw mm6, mm7
  134. movq mm2, [edi][DATASIZE*6+24]
  135. punpcklwd mm5, mm6 //mm6 = w5
  136. movq mm1, [edi][DATASIZE*7+16]
  137. packssdw mm3, mm2
  138. movq mm0, [edi][DATASIZE*7+24]
  139. punpckhwd mm4, mm6 //---0,1,3,5,6
  140. packssdw mm1, mm0
  141. movq mm7, mm3 //---0,1,2,3,5,6 w6
  142. punpcklwd mm3, mm1 //mm1 = w7
  143. movq mm0, mm5 //---0,2,3,4,5,6,7
  144. movq mm2, [edi][DATASIZE*4] //| w01 | w00 |
  145. punpckhdq mm0, mm3 // transposed w5---0,2,4,6,7
  146. punpckhwd mm7, mm1 //---0,2,3,5,6,7
  147. movq mm1, [edi][DATASIZE*5+8]
  148. movq mm6, mm4 //---0,2,3,4,6,7
  149. movq [edi][DATASIZE*5+16], mm0 // store w5
  150. punpckldq mm5, mm3 // transposed w4
  151. movq mm3, [edi][DATASIZE*5]
  152. punpckldq mm4, mm7 // transposed w6
  153. movq mm0, [edi][DATASIZE*4+8] //| w03 | w02 |
  154. punpckhdq mm6, mm7 // transposed w7---0,3,6,7
  155. // transpose the bottom left quadrant(4X4) of the matrix and place
  156. // in the top right quadrant while doing the same for the top
  157. // right quadrant
  158. // --------- ---------
  159. // | M1 | M2 | | M1'| M3'|
  160. // --------- --> ---------
  161. // | M3 | M4 | | M2'| M4'|
  162. // --------- ---------
  163. movq [edi][DATASIZE*4+16], mm5 // store w4
  164. packssdw mm2, mm0 //|w03|w02|w01|w00|
  165. movq mm5, [edi][DATASIZE*7]
  166. packssdw mm3, mm1
  167. movq mm0, [edi][DATASIZE*7+8]
  168. movq [edi][DATASIZE*7+16], mm6 // store w7---5,6,7
  169. packssdw mm5, mm0
  170. movq mm6, [edi][DATASIZE*6]
  171. movq mm0, mm2 // copy w0---0,1,3,5,6
  172. movq mm7, [edi][DATASIZE*6+8]
  173. punpcklwd mm2, mm3 //mm6 = w1
  174. movq [edi][DATASIZE*6+16], mm4 // store w6---3,5,6,7
  175. packssdw mm6, mm7
  176. movq mm1, [edi][DATASIZE*0+24]
  177. punpckhwd mm0, mm3 //---0,1,3,5,6
  178. movq mm7, mm6 //---0,1,2,3,5,6 w2
  179. punpcklwd mm6, mm5 //mm1 = w3
  180. movq mm3, [edi][DATASIZE*0+16]
  181. punpckhwd mm7, mm5 //---0,2,3,5,6,7
  182. movq mm4, [edi][DATASIZE*2+24]
  183. packssdw mm3, mm1
  184. movq mm1, mm2 //---0,2,3,4,5,6,7
  185. punpckldq mm2, mm6 // transposed w4
  186. movq mm5, [edi][DATASIZE*2+16]
  187. punpckhdq mm1, mm6 // transposed w5---0,2,4,6,7
  188. movq [edi][DATASIZE*0+16], mm2 // store w4
  189. packssdw mm5, mm4
  190. movq mm4, [edi][DATASIZE*1+16]
  191. movq mm6, mm0 //---0,2,3,4,6,7
  192. movq mm2, [edi][DATASIZE*1+24]
  193. punpckldq mm0, mm7 // transposed w6
  194. movq [edi][DATASIZE*1+16], mm1 // store w5
  195. punpckhdq mm6, mm7 // transposed w7---0,3,6,7
  196. movq mm7, [edi][DATASIZE*3+24]
  197. packssdw mm4, mm2
  198. movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
  199. movq mm1, mm3 // copy w4---0,1,3,5,6
  200. movq mm2, [edi][DATASIZE*3+16]
  201. punpcklwd mm3, mm4 //mm6 = w5
  202. movq [edi][DATASIZE*3+16], mm6 // store w7---5,6,7
  203. packssdw mm2, mm7
  204. // finish the top right quadrant(4X4) loaded above: transpose it and store it in the bottom left quadrant
  205. // --------- ---------
  206. // | M1 | M2 | | M1'| M3'|
  207. // --------- --> ---------
  208. // | M3 | M4 | | M2'| M4'|
  209. // --------- ---------
  210. movq mm6, [edi][DATASIZE*0] //| w01 | w00 |
  211. punpckhwd mm1, mm4 //---0,1,3,5,6
  212. movq mm7, mm5 //---0,1,2,3,5,6 w6
  213. punpcklwd mm5, mm2 //mm1 = w7
  214. movq mm4, [edi][DATASIZE*0+8] //| w03 | w02 |
  215. punpckhwd mm7, mm2 //---0,2,3,5,6,7
  216. movq mm0, mm3 //---0,2,3,4,5,6,7
  217. packssdw mm6, mm4 //|w03|w02|w01|w00|
  218. movq mm2, [edi][DATASIZE*2+8]
  219. punpckldq mm3, mm5 // transposed w4
  220. movq mm4, [edi][DATASIZE*1]
  221. punpckhdq mm0, mm5 // transposed w5---0,2,4,6,7
  222. movq [edi][DATASIZE*4], mm3 // store w4
  223. movq mm5, mm1 //---0,2,3,4,6,7
  224. movq mm3, [edi][DATASIZE*2]
  225. punpckldq mm1, mm7 // transposed w6
  226. movq [edi][DATASIZE*5], mm0 // store w5
  227. punpckhdq mm5, mm7 // transposed w7---0,3,6,7
  228. movq mm7, [edi][DATASIZE*1+8]
  229. packssdw mm3, mm2
  230. movq [edi][DATASIZE*7], mm5 // store w7---5,6,7
  231. movq mm5, mm6 // copy w0---0,1,3,5,6
  232. movq [edi][DATASIZE*6], mm1 // store w6---3,5,6,7
  233. packssdw mm4, mm7
  234. // transpose the top left quadrant(4X4) of the matrix
  235. // --------- ---------
  236. // | M1 | M2 | | M1'| M3'|
  237. // --------- --> ---------
  238. // | M3 | M4 | | M2'| M4'|
  239. // --------- ---------
  240. // Get the 32-bit quantities and pack into 16 bits
  241. movq mm1, [edi][DATASIZE*3]
  242. punpcklwd mm6, mm4 //mm6 = w1
  243. movq mm0, [edi][DATASIZE*3+8]
  244. punpckhwd mm5, mm4 //---0,1,3,5,6
  245. packssdw mm1, mm0
  246. movq mm2, mm3 //---0,1,2,3,5,6 w2
  247. punpcklwd mm3, mm1 //mm1 = w3
  248. movq mm0, mm6 //---0,2,3,4,5,6,7
  249. movq mm4, [edi][DATASIZE*7]
  250. punpckhwd mm2, mm1 //---0,2,3,5,6,7
  251. movq mm1, [edi][DATASIZE*4]
  252. punpckldq mm6, mm3 // transposed w4
  253. punpckhdq mm0, mm3 // transposed w5---0,2,4,6,7
  254. movq mm3, mm5 //---0,2,3,4,6,7
  255. movq [edi][DATASIZE*0], mm6 // store w4
  256. punpckldq mm5, mm2 // transposed w6
  257. movq [edi][DATASIZE*1], mm0 // store w5
  258. punpckhdq mm3, mm2 // transposed w7---0,3,6,7
  259. movq [edi][DATASIZE*2], mm5 // store w6---3,5,6,7
  260. paddw mm6, mm4 // tmp0
  261. movq [edi][DATASIZE*3], mm3 // store w7---5,6,7
  262. movq mm7, mm6
  263. //******************************************************************************
  264. // End of transpose. Begin row dct.
  265. //******************************************************************************
  266. // tmp0 = dataptr[0] + dataptr[7];
  267. // tmp7 = dataptr[0] - dataptr[7];
  268. // tmp1 = dataptr[1] + dataptr[6];
  269. // tmp6 = dataptr[1] - dataptr[6];
  270. // tmp2 = dataptr[2] + dataptr[5];
  271. // tmp5 = dataptr[2] - dataptr[5];
  272. // tmp3 = dataptr[3] + dataptr[4];
  273. // tmp4 = dataptr[3] - dataptr[4];
  274. paddw mm0, [edi][DATASIZE*6] // tmp1
  275. paddw mm3, mm1 // tmp3
  276. paddw mm5, [edi][DATASIZE*5] // tmp2
  277. movq mm1, mm0
  278. // tmp10 = tmp0 + tmp3;
  279. // tmp13 = tmp0 - tmp3;
  280. // tmp11 = tmp1 + tmp2;
  281. // tmp12 = tmp1 - tmp2;
  282. psubw mm7, mm3 //tmp13
  283. psubw mm0, mm5 //tmp12
  284. paddw mm0, mm7 //tmp12 + tmp13
  285. paddw mm6, mm3 //tmp10
  286. // dataptr[0] = tmp10 + tmp11; /* phase 3 */
  287. // dataptr[4] = tmp10 - tmp11;
  288. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  289. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  290. //haven't been calculated yet!
  291. paddw mm1, mm5 //tmp11
  292. psllw mm0, 1 // pre-shift by 1: the constant below is stored shifted left 7, not 8
  293. pmulhw mm0, Const_FIX_0_707106781 // z1
  294. movq mm3, mm6
  295. // dataptr[2] = tmp13 + z1; /* phase 5 */
  296. // dataptr[6] = tmp13 - z1;
  297. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  298. //haven't been calculated yet!
  299. movq mm5, [edi][DATASIZE*3]
  300. paddw mm6, mm1 //tmp10 + tmp11
  301. // tmp4 = dataptr[3] - dataptr[4]//
  302. psubw mm5, [edi][DATASIZE*4] //tmp4
  303. movq mm4, mm7
  304. movq mm2, [edi][DATASIZE*2]
  305. psubw mm3, mm1 //tmp10 - tmp11
  306. psubw mm2, [edi][DATASIZE*5] //tmp5
  307. paddw mm7, mm0 //tmp13 + z1
  308. movq mm1, [edi][DATASIZE*1]
  309. psubw mm4, mm0 //tmp13 - z1
  310. // tmp10 = tmp4 + tmp5; /* phase 2 */
  311. // tmp11 = tmp5 + tmp6;
  312. // tmp12 = tmp6 + tmp7;
  313. psubw mm1, [edi][DATASIZE*6] //tmp6
  314. paddw mm5, mm2 //tmp10
  315. movq mm0, [edi][DATASIZE*0]
  316. paddw mm2, mm1 //tmp11
  317. // z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  318. // z11 = tmp7 + z3; /* phase 5 */
  319. // z13 = tmp7 - z3;
  320. psubw mm0, [edi][DATASIZE*7] //tmp7
  321. psllw mm2, 1
  322. movq [edi][DATASIZE*0], mm6
  323. movq mm6, mm0
  324. movq [edi][DATASIZE*2], mm7
  325. movq mm7, mm5
  326. pmulhw mm2, Const_FIX_0_707106781 //z3
  327. paddw mm1, mm0 //tmp12
  328. movq [edi][DATASIZE*4], mm3
  329. psubw mm5, mm1 //tmp10 - tmp12
  330. pmulhw mm5, Const_FIX_0_382683433 //z5
  331. psllw mm7, 1
  332. /* The rotator is modified from fig 4-8 to avoid extra negations. */
  333. // z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  334. // z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  335. // z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  336. pmulhw mm7, Const_FIX_0_541196100
  337. psllw mm1, 2 // pre-shift by 2: the constant below is stored shifted left 6, not 8
  338. pmulhw mm1, Const_FIX_1_306562965
  339. psubw mm6, mm2 //z13
  340. movq [edi][DATASIZE*6], mm4
  341. paddw mm0, mm2 //z11
  342. movq mm2, [edi][DATASIZE*3+16]
  343. paddw mm7, mm5 //z2
  344. paddw mm2, [edi][DATASIZE*4+16] // tmp3
  345. paddw mm1, mm5 //z4
  346. // dataptr[5] = z13 + z2; /* phase 6 */
  347. // dataptr[3] = z13 - z2;
  348. // dataptr[1] = z11 + z4;
  349. // dataptr[7] = z11 - z4;
  350. movq mm5, [edi][DATASIZE*0+16]
  351. movq mm3, mm6
  352. paddw mm5, [edi][DATASIZE*7+16] //tmp0
  353. paddw mm6, mm7 //z13 + z2
  354. psubw mm3, mm7 //z13 - z2
  355. movq mm7, mm5
  356. movq [edi][DATASIZE*5], mm6 //store
  357. movq mm4, mm0
  358. movq [edi][DATASIZE*3], mm3 //store
  359. paddw mm0, mm1 //z11 + z4
  360. movq mm3, [edi][DATASIZE*1+16]
  361. psubw mm4, mm1 //z11 - z4
  362. //******************************************************************************
  363. // This completes 4x8 dct locations. Copy to do other 4x8.
  364. //******************************************************************************
  365. // tmp0 = dataptr[0] + dataptr[7];
  366. // tmp7 = dataptr[0] - dataptr[7];
  367. // tmp1 = dataptr[1] + dataptr[6];
  368. // tmp6 = dataptr[1] - dataptr[6];
  369. // tmp2 = dataptr[2] + dataptr[5];
  370. // tmp5 = dataptr[2] - dataptr[5];
  371. // tmp3 = dataptr[3] + dataptr[4];
  372. // tmp4 = dataptr[3] - dataptr[4];
  373. paddw mm3, [edi][DATASIZE*6+16] // tmp1
  374. paddw mm5, mm2 //tmp10
  375. movq mm1, [edi][DATASIZE*2+16]
  376. psubw mm7, mm2 //tmp13
  377. paddw mm1, [edi][DATASIZE*5+16] // tmp2
  378. movq mm6, mm3
  379. // tmp10 = tmp0 + tmp3;
  380. // tmp13 = tmp0 - tmp3;
  381. // tmp11 = tmp1 + tmp2;
  382. // tmp12 = tmp1 - tmp2;
  383. paddw mm3, mm1 //tmp11
  384. psubw mm6, mm1 //tmp12
  385. // dataptr[0] = tmp10 + tmp11; /* phase 3 */
  386. // dataptr[4] = tmp10 - tmp11;
  387. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  388. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  389. //haven't been calculated yet!
  390. movq [edi][DATASIZE*1], mm0 //store
  391. paddw mm6, mm7 //tmp12 + tmp13
  392. movq [edi][DATASIZE*7], mm4 //store
  393. psllw mm6, 1
  394. pmulhw mm6, Const_FIX_0_707106781 // z1
  395. movq mm1, mm5
  396. // dataptr[2] = tmp13 + z1; /* phase 5 */
  397. // dataptr[6] = tmp13 - z1;
  398. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  399. //haven't been calculated yet!
  400. movq mm2, [edi][DATASIZE*3+16]
  401. paddw mm5, mm3 //tmp10 + tmp11
  402. // tmp4 = dataptr[3] - dataptr[4]//
  403. psubw mm2, [edi][DATASIZE*4+16] //tmp4
  404. movq mm4, mm7
  405. movq mm0, [edi][DATASIZE*2+16]
  406. psubw mm1, mm3 //tmp10 - tmp11
  407. psubw mm0, [edi][DATASIZE*5+16] //tmp5
  408. paddw mm7, mm6 //tmp13 + z1
  409. movq mm3, [edi][DATASIZE*1+16]
  410. psubw mm4, mm6 //tmp13 - z1
  411. // tmp10 = tmp4 + tmp5; /* phase 2 */
  412. // tmp11 = tmp5 + tmp6;
  413. // tmp12 = tmp6 + tmp7;
  414. psubw mm3, [edi][DATASIZE*6+16] //tmp6
  415. paddw mm2, mm0 //tmp10
  416. movq mm6, [edi][DATASIZE*0+16]
  417. paddw mm0, mm3 //tmp11
  418. // z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  419. // z11 = tmp7 + z3; /* phase 5 */
  420. // z13 = tmp7 - z3;
  421. psubw mm6, [edi][DATASIZE*7+16] //tmp7
  422. psllw mm0, 1
  423. movq [edi][DATASIZE*0+16], mm5
  424. movq mm5, mm6
  425. movq [edi][DATASIZE*2+16], mm7
  426. movq mm7, mm2
  427. pmulhw mm0, Const_FIX_0_707106781 //z3
  428. paddw mm3, mm6 //tmp12
  429. movq [edi][DATASIZE*4+16], mm1
  430. psubw mm2, mm3 //tmp10 - tmp12
  431. pmulhw mm2, Const_FIX_0_382683433 //z5
  432. psllw mm7, 1
  433. pmulhw mm7, Const_FIX_0_541196100
  434. paddw mm6, mm0 //z11
  435. /* The rotator is modified from fig 4-8 to avoid extra negations. */
  436. // z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  437. // z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  438. // z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  439. movq [edi][DATASIZE*6+16], mm4
  440. psllw mm3, 2
  441. pmulhw mm3, Const_FIX_1_306562965
  442. psubw mm5, mm0 //z13
  443. paddw mm7, mm2 //z2
  444. movq mm1, mm5
  445. paddw mm5, mm7 //z13 + z2
  446. psubw mm1, mm7 //z13 - z2
  447. movq mm7, [edi][DATASIZE*4]
  448. paddw mm3, mm2 //z4
  449. // dataptr[5] = z13 + z2; /* phase 6 */
  450. // dataptr[3] = z13 - z2;
  451. // dataptr[1] = z11 + z4;
  452. // dataptr[7] = z11 - z4;
  453. movq [edi][DATASIZE*5+16], mm5 //store
  454. movq mm4, mm6
  455. movq mm2, [edi][DATASIZE*7]
  456. paddw mm6, mm3 //z11 + z4
  457. movq mm5, [edi][DATASIZE*5]
  458. psubw mm4, mm3 //z11 - z4
  459. //******************************************************************************
  460. //******************************************************************************
  461. // This completes all 8x8 dct locations for the row case.
  462. // Now transpose the data for the columns.
  463. //******************************************************************************
  464. // transpose the bottom left quadrant(4X4) of the matrix and place
  465. // in the top right quadrant while doing the same for the top
  466. // right quadrant
  467. // --------- ---------
  468. // | M1 | M2 | | M1'| M3'|
  469. // --------- --> ---------
  470. // | M3 | M4 | | M2'| M4'|
  471. // --------- ---------
  472. movq mm0, mm7 // copy w0---0,1,3,5,6
  473. punpcklwd mm7, mm5 //mm6 = w1
  474. movq mm3, [edi][DATASIZE*6]
  475. punpckhwd mm0, mm5 //---0,1,3,5,6
  476. movq mm5, mm3 //---0,1,2,3,5,6 w2
  477. punpcklwd mm3, mm2 //mm1 = w3
  478. movq [edi][DATASIZE*7+16], mm4 //store
  479. punpckhwd mm5, mm2 //---0,2,3,5,6,7
  480. movq mm4, mm7 //---0,2,3,4,5,6,7
  481. punpckldq mm7, mm3 // transposed w4
  482. movq mm2, [edi][DATASIZE*0+16]
  483. punpckhdq mm4, mm3 // transposed w5---0,2,4,6,7
  484. movq [edi][DATASIZE*0+16], mm7 // store w4
  485. movq mm3, mm0 //---0,2,3,4,6,7
  486. movq [edi][DATASIZE*1+16], mm4 // store w5
  487. punpckldq mm0, mm5 // transposed w6
  488. movq mm7, [edi][DATASIZE*2+16]
  489. punpckhdq mm3, mm5 // transposed w7---0,3,6,7
  490. movq mm5, mm2 // copy w4---0,1,3,5,6
  491. punpcklwd mm2, mm6 //mm6 = w5
  492. // transpose the top right quadrant(4X4) of the matrix
  493. // --------- ---------
  494. // | M1 | M2 | | M1'| M3'|
  495. // --------- --> ---------
  496. // | M3 | M4 | | M2'| M4'|
  497. // --------- ---------
  498. movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
  499. punpckhwd mm5, mm6 //---0,1,3,5,6
  500. movq mm4, mm7 //---0,1,2,3,5,6 w6
  501. punpckhwd mm7, mm1 //---0,2,3,5,6,7
  502. movq [edi][DATASIZE*3+16], mm3 // store w7---5,6,7
  503. movq mm0, mm2 //---0,2,3,4,5,6,7
  504. movq mm6, [edi][DATASIZE*5+16]
  505. punpcklwd mm4, mm1 //mm1 = w7
  506. movq mm1, [edi][DATASIZE*4+16]
  507. punpckldq mm0, mm4 // transposed w4
  508. movq mm3, [edi][DATASIZE*6+16]
  509. punpckhdq mm2, mm4 // transposed w5---0,2,4,6,7
  510. // transpose the bottom right quadrant(4X4) of the matrix
  511. // --------- ---------
  512. // | M1 | M2 | | M1'| M3'|
  513. // --------- --> ---------
  514. // | M3 | M4 | | M2'| M4'|
  515. // --------- ---------
  516. movq [edi][DATASIZE*4], mm0 // store w4
  517. movq mm4, mm5 //---0,2,3,4,6,7
  518. movq [edi][DATASIZE*5], mm2 // store w5
  519. punpckldq mm5, mm7 // transposed w6
  520. movq mm2, [edi][DATASIZE*7+16]
  521. punpckhdq mm4, mm7 // transposed w7---0,3,6,7
  522. movq mm7, mm1 // copy w4---0,1,3,5,6
  523. punpcklwd mm1, mm6 //mm6 = w5
  524. movq [edi][DATASIZE*6], mm5 // store w6---3,5,6,7
  525. punpckhwd mm7, mm6 //---0,1,3,5,6
  526. movq mm5, mm3 //---0,1,2,3,5,6 w6
  527. punpcklwd mm3, mm2 //mm1 = w7
  528. movq [edi][DATASIZE*7], mm4 // store w7---5,6,7
  529. punpckhwd mm5, mm2 //---0,2,3,5,6,7
  530. movq mm0, [edi][DATASIZE*0]
  531. movq mm4, mm1 //---0,2,3,4,5,6,7
  532. movq mm6, [edi][DATASIZE*1]
  533. punpckldq mm1, mm3 // transposed w4
  534. punpckhdq mm4, mm3 // transposed w5---0,2,4,6,7
  535. movq mm3, mm7 //---0,2,3,4,6,7
  536. movq [edi][DATASIZE*4+16], mm1 // store w4
  537. punpckldq mm7, mm5 // transposed w6
  538. movq [edi][DATASIZE*5+16], mm4 // store w5
  539. punpckhdq mm3, mm5 // transposed w7---0,3,6,7
  540. // transpose the top left quadrant(4X4) of the matrix
  541. // --------- ---------
  542. // | M1 | M2 | | M1'| M3'|
  543. // --------- --> ---------
  544. // | M3 | M4 | | M2'| M4'|
  545. // --------- ---------
  546. movq mm1, [edi][DATASIZE*3]
  547. movq mm2, mm0 // copy w0---0,1,3,5,6
  548. movq [edi][DATASIZE*7+16], mm3 // store w7---5,6,7
  549. punpcklwd mm0, mm6 //mm6 = w1
  550. movq mm3, [edi][DATASIZE*2]
  551. punpckhwd mm2, mm6 //---0,1,3,5,6
  552. movq mm5, mm3 //---0,1,2,3,5,6 w2
  553. punpcklwd mm3, mm1 //mm1 = w3
  554. movq [edi][DATASIZE*6+16], mm7 // store w6---3,5,6,7
  555. punpckhwd mm5, mm1 //---0,2,3,5,6,7
  556. movq mm1, mm0 //---0,2,3,4,5,6,7
  557. punpckldq mm0, mm3 // transposed w4
  558. movq mm6, [edi][DATASIZE*4]
  559. punpckhdq mm1, mm3 // transposed w5---0,2,4,6,7
  560. movq [edi][DATASIZE*0], mm0 // store w4
  561. movq mm3, mm2 //---0,2,3,4,6,7
  562. paddw mm0, [edi][DATASIZE*7] // tmp0
  563. punpckhdq mm3, mm5 // transposed w7---0,3,6,7
  564. movq [edi][DATASIZE*1], mm1 // store w5
  565. punpckldq mm2, mm5 // transposed w6
  566. //******************************************************************************
  567. // This begins the column dct
  568. //******************************************************************************
  569. // tmp0 = dataptr[0] + dataptr[7];
  570. // tmp7 = dataptr[0] - dataptr[7];
  571. // tmp1 = dataptr[1] + dataptr[6];
  572. // tmp6 = dataptr[1] - dataptr[6];
  573. // tmp2 = dataptr[2] + dataptr[5];
  574. // tmp5 = dataptr[2] - dataptr[5];
  575. // tmp3 = dataptr[3] + dataptr[4];
  576. // tmp4 = dataptr[3] - dataptr[4];
  577. movq [edi][DATASIZE*3], mm3 // store w7---5,6,7
  578. movq mm7, mm0
  579. paddw mm1, [edi][DATASIZE*6] // tmp1
  580. paddw mm3, mm6 // tmp3
  581. movq [edi][DATASIZE*2], mm2 // store w6---3,5,6,7
  582. paddw mm0, mm3 //tmp10
  583. paddw mm2, [edi][DATASIZE*5] // tmp2
  584. movq mm6, mm1
  585. // tmp10 = tmp0 + tmp3;
  586. // tmp13 = tmp0 - tmp3;
  587. // tmp11 = tmp1 + tmp2;
  588. // tmp12 = tmp1 - tmp2;
  589. psubw mm7, mm3 //tmp13
  590. movq mm3, mm0
  591. movq mm5, [edi][DATASIZE*2]
  592. paddw mm1, mm2 //tmp11
  593. psubw mm3, mm1 //tmp10 - tmp11
  594. paddw mm0, mm1 //tmp10 + tmp11
  595. // dataptr[0] = tmp10 + tmp11; /* phase 3 */
  596. // dataptr[4] = tmp10 - tmp11;
  597. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  598. //haven't been calculated yet!
  599. movq mm1, mm3
  600. punpcklwd mm3, mm3 // duplicate each of the two low words into both halves of a dword
  601. psubw mm6, mm2 //tmp12
  602. punpckhwd mm1, mm1
  603. movq mm2, [edi][DATASIZE*3]
  604. psrad mm3, 16 // arithmetic shift leaves each word sign-extended to 32 bits
  605. // tmp4 = dataptr[3] - dataptr[4]//
  606. psubw mm2, [edi][DATASIZE*4] //tmp4
  607. psrad mm1, 16
  608. movq [edi][DATASIZE*4], mm3
  609. movq mm3, mm0
  610. movq [edi][DATASIZE*4+8], mm1
  611. punpcklwd mm0, mm0
  612. paddw mm6, mm7 //tmp12 + tmp13
  613. punpckhwd mm3, mm3
  614. movq mm1, [edi][DATASIZE*1]
  615. psllw mm6, 1
  616. pmulhw mm6, Const_FIX_0_707106781 // z1
  617. psrad mm3, 16
  618. psubw mm5, [edi][DATASIZE*5] //tmp5
  619. psrad mm0, 16
  620. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  621. // dataptr[2] = tmp13 + z1; /* phase 5 */
  622. // dataptr[6] = tmp13 - z1;
  623. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  624. //haven't been calculated yet!
  625. movq [edi][DATASIZE*0+8], mm3
  626. movq mm4, mm7
  627. movq mm3, [edi][DATASIZE*0]
  628. paddw mm7, mm6 //tmp13 + z1
  629. movq [edi][DATASIZE*0], mm0
  630. psubw mm4, mm6 //tmp13 - z1
  631. movq mm0, mm7
  632. punpcklwd mm7, mm7
  633. psubw mm1, [edi][DATASIZE*6] //tmp6
  634. punpckhwd mm0, mm0
  635. // tmp10 = tmp4 + tmp5; /* phase 2 */
  636. // tmp11 = tmp5 + tmp6;
  637. // tmp12 = tmp6 + tmp7;
  638. psrad mm7, 16
  639. paddw mm2, mm5 //tmp10
  640. psrad mm0, 16
  641. paddw mm5, mm1 //tmp11
  642. movq mm6, mm4
  643. punpcklwd mm4, mm4
  644. movq [edi][DATASIZE*2], mm7
  645. punpckhwd mm6, mm6
  646. psubw mm3, [edi][DATASIZE*7] //tmp7
  647. movq mm7, mm2
  648. // z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  649. // z11 = tmp7 + z3; /* phase 5 */
  650. // z13 = tmp7 - z3;
  651. movq [edi][DATASIZE*2+8], mm0
  652. movq mm0, mm3
  653. psllw mm5, 1
  654. paddw mm1, mm3 //tmp12
  655. pmulhw mm5, Const_FIX_0_707106781 //z3
  656. psrad mm4, 16
  657. psubw mm2, mm1 //tmp10 - tmp12
  658. psrad mm6, 16
  659. /* The rotator is modified from fig 4-8 to avoid extra negations. */
  660. // z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  661. // z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  662. // z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  663. pmulhw mm2, Const_FIX_0_382683433 //z5
  664. psllw mm7, 1
  665. pmulhw mm7, Const_FIX_0_541196100
  666. psllw mm1, 2
  667. pmulhw mm1, Const_FIX_1_306562965
  668. psubw mm0, mm5 //z13
  669. movq [edi][DATASIZE*6+8], mm6
  670. movq mm6, mm0
  671. movq [edi][DATASIZE*6], mm4
  672. paddw mm7, mm2 //z2
  673. // dataptr[5] = z13 + z2; /* phase 6 */
  674. // dataptr[3] = z13 - z2;
  675. // dataptr[1] = z11 + z4;
  676. // dataptr[7] = z11 - z4;
  677. paddw mm0, mm7 //z13 + z2
  678. psubw mm6, mm7 //z13 - z2
  679. movq mm7, mm6
  680. punpcklwd mm6, mm6
  681. punpckhwd mm7, mm7
  682. paddw mm3, mm5 //z11
  683. movq mm5, mm0
  684. punpcklwd mm0, mm0
  685. psrad mm6, 16
  686. movq mm4, mm3
  687. psrad mm7, 16
  688. paddw mm1, mm2 //z4
  689. punpckhwd mm5, mm5
  690. paddw mm3, mm1 //z11 + z4
  691. psrad mm0, 16
  692. psubw mm4, mm1 //z11 - z4
  693. movq [edi][DATASIZE*3], mm6 //store
  694. psrad mm5, 16
  695. movq mm6, [edi][DATASIZE*1+16]
  696. movq mm1, mm3
  697. paddw mm6, [edi][DATASIZE*6+16] // tmp1
  698. punpcklwd mm3, mm3
  699. movq [edi][DATASIZE*3+8], mm7
  700. punpckhwd mm1, mm1
  701. movq [edi][DATASIZE*5], mm0 //store
  702. psrad mm3, 16
  703. movq [edi][DATASIZE*5+8], mm5
  704. psrad mm1, 16
  705. movq mm0, [edi][DATASIZE*0+16]
  706. movq mm7, mm4
  707. paddw mm0, [edi][DATASIZE*7+16] //tmp0
  708. punpcklwd mm4, mm4
  709. movq [edi][DATASIZE*1], mm3 //store
  710. punpckhwd mm7, mm7
  711. movq [edi][DATASIZE*1+8], mm1
  712. psrad mm4, 16
  713. movq mm3, [edi][DATASIZE*3+16]
  714. psrad mm7, 16
  715. //******************************************************************************
  716. // This completes 4x8 dct locations. Copy to do other 4x8.
  717. //******************************************************************************
  718. // tmp0 = dataptr[0] + dataptr[7];
  719. // tmp7 = dataptr[0] - dataptr[7];
  720. // tmp1 = dataptr[1] + dataptr[6];
  721. // tmp6 = dataptr[1] - dataptr[6];
  722. // tmp2 = dataptr[2] + dataptr[5];
  723. // tmp5 = dataptr[2] - dataptr[5];
  724. // tmp3 = dataptr[3] + dataptr[4];
  725. // tmp4 = dataptr[3] - dataptr[4];
  726. paddw mm3, [edi][DATASIZE*4+16] // tmp3
  727. movq mm1, mm6
  728. movq [edi][DATASIZE*7+8], mm7
  729. movq mm7, mm0
  730. movq mm2, [edi][DATASIZE*2+16]
  731. paddw mm0, mm3 //tmp10
  732. paddw mm2, [edi][DATASIZE*5+16] // tmp2
  733. psubw mm7, mm3 //tmp13
  734. movq mm3, mm0
  735. paddw mm1, mm2 //tmp11
  736. // tmp10 = tmp0 + tmp3;
  737. // tmp13 = tmp0 - tmp3;
  738. // tmp11 = tmp1 + tmp2;
  739. // tmp12 = tmp1 - tmp2;
  740. paddw mm0, mm1 //tmp10 + tmp11
  741. psubw mm3, mm1 //tmp10 - tmp11
  742. // dataptr[0] = tmp10 + tmp11; /* phase 3 */
  743. // dataptr[4] = tmp10 - tmp11;
  744. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  745. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  746. //haven't been calculated yet!
  747. movq mm1, mm3
  748. punpcklwd mm3, mm3
  749. punpckhwd mm1, mm1
  750. psubw mm6, mm2 //tmp12
  751. movq [edi][DATASIZE*7], mm4 //store
  752. psrad mm3, 16
  753. psrad mm1, 16
  754. paddw mm6, mm7 //tmp12 + tmp13
  755. movq mm2, [edi][DATASIZE*3+16]
  756. psllw mm6, 1
  757. movq mm4, mm0
  758. punpcklwd mm0, mm0
  759. pmulhw mm6, Const_FIX_0_707106781 // z1
  760. punpckhwd mm4, mm4
  761. // tmp4 = dataptr[3] - dataptr[4]//
  762. psubw mm2, [edi][DATASIZE*4+16] //tmp4
  763. psrad mm4, 16
  764. movq mm5, [edi][DATASIZE*2+16]
  765. psrad mm0, 16
  766. movq [edi][DATASIZE*0+24], mm4
  767. movq mm4, mm7
  768. // dataptr[2] = tmp13 + z1; /* phase 5 */
  769. // dataptr[6] = tmp13 - z1;
  770. //NOTE: We can't write these values out immediately. Values for tmp4 - tmp7
  771. //haven't been calculated yet!
  772. psubw mm5, [edi][DATASIZE*5+16] //tmp5
  773. paddw mm7, mm6 //tmp13 + z1
  774. movq [edi][DATASIZE*4+16], mm3
  775. psubw mm4, mm6 //tmp13 - z1
  776. movq mm3, mm7
  777. punpcklwd mm7, mm7
  778. movq mm6, [edi][DATASIZE*0+16]
  779. punpckhwd mm3, mm3
  780. movq [edi][DATASIZE*4+24], mm1
  781. psrad mm7, 16
  782. movq [edi][DATASIZE*0+16], mm0
  783. psrad mm3, 16
  784. movq mm1, [edi][DATASIZE*1+16]
  785. movq mm0, mm4
  786. psubw mm1, [edi][DATASIZE*6+16] //tmp6
  787. punpcklwd mm4, mm4
  788. movq [edi][DATASIZE*2+16], mm7
  789. paddw mm2, mm5 //tmp10
  790. // tmp10 = tmp4 + tmp5; /* phase 2 */
  791. // tmp11 = tmp5 + tmp6;
  792. // tmp12 = tmp6 + tmp7;
  793. movq mm7, mm2
  794. paddw mm5, mm1 //tmp11
  795. psubw mm6, [edi][DATASIZE*7+16] //tmp7
  796. punpckhwd mm0, mm0
  797. movq [edi][DATASIZE*2+24], mm3
  798. psllw mm5, 1
  799. pmulhw mm5, Const_FIX_0_707106781 //z3
  800. psrad mm0, 16
  801. psrad mm4, 16
  802. paddw mm1, mm6 //tmp12
  803. // z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  804. // z11 = tmp7 + z3; /* phase 5 */
  805. // z13 = tmp7 - z3;
  806. movq [edi][DATASIZE*6+24], mm0
  807. psubw mm2, mm1 //tmp10 - tmp12
  808. /* The rotator is modified from fig 4-8 to avoid extra negations. */
  809. // z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  810. // z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  811. // z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  812. pmulhw mm2, Const_FIX_0_382683433 //z5
  813. psllw mm7, 1
  814. pmulhw mm7, Const_FIX_0_541196100
  815. psllw mm1, 2
  816. movq [edi][DATASIZE*6+16], mm4
  817. movq mm0, mm6
  818. pmulhw mm1, Const_FIX_1_306562965
  819. psubw mm0, mm5 //z13
  820. paddw mm7, mm2 //z2
  821. movq mm3, mm0
  822. // dataptr[5] = z13 + z2; /* phase 6 */
  823. // dataptr[3] = z13 - z2;
  824. // dataptr[1] = z11 + z4;
  825. // dataptr[7] = z11 - z4;
  826. paddw mm0, mm7 //z13 + z2
  827. psubw mm3, mm7 //z13 - z2
  828. movq mm7, mm3
  829. punpcklwd mm3, mm3
  830. punpckhwd mm7, mm7
  831. paddw mm6, mm5 //z11
  832. psrad mm3, 16
  833. paddw mm1, mm2 //z4
  834. psrad mm7, 16
  835. movq mm4, mm6
  836. movq mm5, mm0
  837. punpcklwd mm0, mm0
  838. punpckhwd mm5, mm5
  839. paddw mm6, mm1 //z11 + z4
  840. psrad mm0, 16
  841. psubw mm4, mm1 //z11 - z4
  842. movq [edi][DATASIZE*3+16], mm3 //store
  843. psrad mm5, 16
  844. movq mm1, mm6
  845. punpcklwd mm6, mm6
  846. movq [edi][DATASIZE*3+24], mm7
  847. punpckhwd mm1, mm1
  848. movq [edi][DATASIZE*5+16], mm0 //store
  849. psrad mm6, 16
  850. movq [edi][DATASIZE*5+24], mm5
  851. psrad mm1, 16
  852. movq mm7, mm4
  853. punpcklwd mm4, mm4
  854. movq [edi][DATASIZE*1+16], mm6 //store
  855. punpckhwd mm7, mm7
  856. movq [edi][DATASIZE*1+24], mm1
  857. psrad mm4, 16
  858. psrad mm7, 16
  859. movq [edi][DATASIZE*7+16], mm4 //store
  860. movq [edi][DATASIZE*7+24], mm7
  861. //******************************************************************************
  862. // This completes all 8x8 dct locations for the column case.
  863. //******************************************************************************
  864. emms // empty the MMX state so later x87 floating-point code sees a clean register stack
  865. }
  866. }
  867. #endif /* DCT_IFAST_SUPPORTED */