Team Fortress 2 Source Code as on 22/4/2020
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

4348 lines
155 KiB

  1. /*
  2. * jfdctint.c
  3. *
  4. * Copyright (C) 1991-1996, Thomas G. Lane.
  5. * Modification developed 2003-2009 by Guido Vollbeding.
  6. * This file is part of the Independent JPEG Group's software.
  7. * For conditions of distribution and use, see the accompanying README file.
  8. *
  9. * This file contains a slow-but-accurate integer implementation of the
  10. * forward DCT (Discrete Cosine Transform).
  11. *
  12. * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  13. * on each column. Direct algorithms are also available, but they are
  14. * much more complex and seem not to be any faster when reduced to code.
  15. *
  16. * This implementation is based on an algorithm described in
  17. * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  18. * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  19. * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  20. * The primary algorithm described there uses 11 multiplies and 29 adds.
  21. * We use their alternate method with 12 multiplies and 32 adds.
  22. * The advantage of this method is that no data path contains more than one
  23. * multiplication; this allows a very simple and accurate implementation in
  24. * scaled fixed-point arithmetic, with a minimal number of shifts.
  25. *
  26. * We also provide FDCT routines with various input sample block sizes for
  27. * direct resolution reduction or enlargement and for direct resolving the
  28. * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
  29. * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
  30. *
  31. * For N<8 we fill the remaining block coefficients with zero.
  32. * For N>8 we apply a partial N-point FDCT on the input samples, computing
  33. * just the lower 8 frequency coefficients and discarding the rest.
  34. *
  35. * We must scale the output coefficients of the N-point FDCT appropriately
  36. * to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
  37. * is folded into the constant multipliers (pass 2) and/or final/initial
  38. * shifting.
  39. *
  40. * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
  41. * since there would be too many additional constants to pre-calculate.
  42. */
  43. #define JPEG_INTERNALS
  44. #include "jinclude.h"
  45. #include "jpeglib.h"
  46. #include "jdct.h" /* Private declarations for DCT subsystem */
  47. #ifdef DCT_ISLOW_SUPPORTED
  48. /*
  49. * This module is specialized to the case DCTSIZE = 8.
  50. */
  51. #if DCTSIZE != 8
  52. Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
  53. #endif
  54. /*
  55. * The poop on this scaling stuff is as follows:
  56. *
  57. * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
  58. * larger than the true DCT outputs. The final outputs are therefore
  59. * a factor of N larger than desired; since N=8 this can be cured by
  60. * a simple right shift at the end of the algorithm. The advantage of
  61. * this arrangement is that we save two multiplications per 1-D DCT,
  62. * because the y0 and y4 outputs need not be divided by sqrt(N).
  63. * In the IJG code, this factor of 8 is removed by the quantization step
  64. * (in jcdctmgr.c), NOT in this module.
  65. *
  66. * We have to do addition and subtraction of the integer inputs, which
  67. * is no problem, and multiplication by fractional constants, which is
  68. * a problem to do in integer arithmetic. We multiply all the constants
  69. * by CONST_SCALE and convert them to integer constants (thus retaining
  70. * CONST_BITS bits of precision in the constants). After doing a
  71. * multiplication we have to divide the product by CONST_SCALE, with proper
  72. * rounding, to produce the correct output. This division can be done
  73. * cheaply as a right shift of CONST_BITS bits. We postpone shifting
  74. * as long as possible so that partial sums can be added together with
  75. * full fractional precision.
  76. *
  77. * The outputs of the first pass are scaled up by PASS1_BITS bits so that
  78. * they are represented to better-than-integral precision. These outputs
  79. * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  80. * with the recommended scaling. (For 12-bit sample data, the intermediate
  81. * array is INT32 anyway.)
  82. *
  83. * To avoid overflow of the 32-bit intermediate results in pass 2, we must
  84. * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
  85. * shows that the values given below are the most effective.
  86. */
  /* Fixed-point precision settings.
   * CONST_BITS is the number of fraction bits kept in the multiplier
   * constants; it is 13 for every sample depth.  PASS1_BITS is the extra
   * scaling carried from pass 1 into pass 2: 2 for 8-bit samples, and 1
   * otherwise (lose a little precision to avoid overflow).
   */
  #define CONST_BITS  13
  #if BITS_IN_JSAMPLE == 8
  #define PASS1_BITS  2
  #else
  #define PASS1_BITS  1
  #endif

  /* Multiplier constants for the 8-point transform.
   * Some C compilers fail to reduce "FIX(constant)" at compile time, thus
   * causing a lot of useless floating-point operations at run time.
   * For CONST_BITS == 13 the values are therefore pre-calculated; for any
   * other setting we fall back on the FIX() macro.
   * If you change CONST_BITS you may want to add appropriate values.
   */
  #if CONST_BITS != 13
  #define FIX_0_298631336  FIX(0.298631336)
  #define FIX_0_390180644  FIX(0.390180644)
  #define FIX_0_541196100  FIX(0.541196100)
  #define FIX_0_765366865  FIX(0.765366865)
  #define FIX_0_899976223  FIX(0.899976223)
  #define FIX_1_175875602  FIX(1.175875602)
  #define FIX_1_501321110  FIX(1.501321110)
  #define FIX_1_847759065  FIX(1.847759065)
  #define FIX_1_961570560  FIX(1.961570560)
  #define FIX_2_053119869  FIX(2.053119869)
  #define FIX_2_562915447  FIX(2.562915447)
  #define FIX_3_072711026  FIX(3.072711026)
  #else
  #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
  #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
  #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
  #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
  #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
  #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
  #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
  #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
  #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
  #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
  #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
  #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
  #endif

  /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
   * For 12-bit samples, a full 32-bit multiplication is needed.
   * For 8-bit samples with the recommended scaling, all the variable
   * and constant values involved are no more than 16 bits wide, so a
   * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
   */
  #if BITS_IN_JSAMPLE != 8
  #define MULTIPLY(var,coef)  ((var) * (coef))
  #else
  #define MULTIPLY(var,coef)  MULTIPLY16C16(var,coef)
  #endif
  138. /*
  139. * Perform the forward DCT on one block of samples.
  140. */
  141. GLOBAL(void)
  142. jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  143. {
  144. INT32 tmp0, tmp1, tmp2, tmp3;
  145. INT32 tmp10, tmp11, tmp12, tmp13;
  146. INT32 z1;
  147. DCTELEM *dataptr;
  148. JSAMPROW elemptr;
  149. int ctr;
  150. SHIFT_TEMPS
  151. /* Pass 1: process rows. */
  152. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  153. /* furthermore, we scale the results by 2**PASS1_BITS. */
  154. dataptr = data;
  155. for (ctr = 0; ctr < DCTSIZE; ctr++) {
  156. elemptr = sample_data[ctr] + start_col;
  157. /* Even part per LL&M figure 1 --- note that published figure is faulty;
  158. * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  159. */
  160. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
  161. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
  162. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
  163. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
  164. tmp10 = tmp0 + tmp3;
  165. tmp12 = tmp0 - tmp3;
  166. tmp11 = tmp1 + tmp2;
  167. tmp13 = tmp1 - tmp2;
  168. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
  169. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
  170. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
  171. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
  172. /* Apply unsigned->signed conversion */
  173. dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
  174. dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
  175. z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  176. /* Add fudge factor here for final descale. */
  177. z1 += ONE << (CONST_BITS-PASS1_BITS-1);
  178. dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
  179. CONST_BITS-PASS1_BITS);
  180. dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
  181. CONST_BITS-PASS1_BITS);
  182. /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  183. * cK represents sqrt(2) * cos(K*pi/16).
  184. * i0..i3 in the paper are tmp0..tmp3 here.
  185. */
  186. tmp10 = tmp0 + tmp3;
  187. tmp11 = tmp1 + tmp2;
  188. tmp12 = tmp0 + tmp2;
  189. tmp13 = tmp1 + tmp3;
  190. z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
  191. /* Add fudge factor here for final descale. */
  192. z1 += ONE << (CONST_BITS-PASS1_BITS-1);
  193. tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
  194. tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
  195. tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
  196. tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
  197. tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
  198. tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
  199. tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
  200. tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
  201. tmp12 += z1;
  202. tmp13 += z1;
  203. dataptr[1] = (DCTELEM)
  204. RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
  205. dataptr[3] = (DCTELEM)
  206. RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
  207. dataptr[5] = (DCTELEM)
  208. RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
  209. dataptr[7] = (DCTELEM)
  210. RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
  211. dataptr += DCTSIZE; /* advance pointer to next row */
  212. }
  213. /* Pass 2: process columns.
  214. * We remove the PASS1_BITS scaling, but leave the results scaled up
  215. * by an overall factor of 8.
  216. */
  217. dataptr = data;
  218. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  219. /* Even part per LL&M figure 1 --- note that published figure is faulty;
  220. * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  221. */
  222. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
  223. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
  224. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
  225. tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
  226. /* Add fudge factor here for final descale. */
  227. tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
  228. tmp12 = tmp0 - tmp3;
  229. tmp11 = tmp1 + tmp2;
  230. tmp13 = tmp1 - tmp2;
  231. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
  232. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
  233. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
  234. tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
  235. dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
  236. dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
  237. z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  238. /* Add fudge factor here for final descale. */
  239. z1 += ONE << (CONST_BITS+PASS1_BITS-1);
  240. dataptr[DCTSIZE*2] = (DCTELEM)
  241. RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
  242. dataptr[DCTSIZE*6] = (DCTELEM)
  243. RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
  244. /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  245. * cK represents sqrt(2) * cos(K*pi/16).
  246. * i0..i3 in the paper are tmp0..tmp3 here.
  247. */
  248. tmp10 = tmp0 + tmp3;
  249. tmp11 = tmp1 + tmp2;
  250. tmp12 = tmp0 + tmp2;
  251. tmp13 = tmp1 + tmp3;
  252. z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
  253. /* Add fudge factor here for final descale. */
  254. z1 += ONE << (CONST_BITS+PASS1_BITS-1);
  255. tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
  256. tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
  257. tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
  258. tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
  259. tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
  260. tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
  261. tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
  262. tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
  263. tmp12 += z1;
  264. tmp13 += z1;
  265. dataptr[DCTSIZE*1] = (DCTELEM)
  266. RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
  267. dataptr[DCTSIZE*3] = (DCTELEM)
  268. RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
  269. dataptr[DCTSIZE*5] = (DCTELEM)
  270. RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
  271. dataptr[DCTSIZE*7] = (DCTELEM)
  272. RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
  273. dataptr++; /* advance pointer to next column */
  274. }
  275. }
  276. #ifdef DCT_SCALING_SUPPORTED
  277. /*
  278. * Perform the forward DCT on a 7x7 sample block.
  279. */
  280. GLOBAL(void)
  281. jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  282. {
  283. INT32 tmp0, tmp1, tmp2, tmp3;
  284. INT32 tmp10, tmp11, tmp12;
  285. INT32 z1, z2, z3;
  286. DCTELEM *dataptr;
  287. JSAMPROW elemptr;
  288. int ctr;
  289. SHIFT_TEMPS
  290. /* Pre-zero output coefficient block. */
  291. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  292. /* Pass 1: process rows. */
  293. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  294. /* furthermore, we scale the results by 2**PASS1_BITS. */
  295. /* cK represents sqrt(2) * cos(K*pi/14). */
  296. dataptr = data;
  297. for (ctr = 0; ctr < 7; ctr++) {
  298. elemptr = sample_data[ctr] + start_col;
  299. /* Even part */
  300. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
  301. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
  302. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
  303. tmp3 = GETJSAMPLE(elemptr[3]);
  304. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
  305. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
  306. tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
  307. z1 = tmp0 + tmp2;
  308. /* Apply unsigned->signed conversion */
  309. dataptr[0] = (DCTELEM)
  310. ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
  311. tmp3 += tmp3;
  312. z1 -= tmp3;
  313. z1 -= tmp3;
  314. z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
  315. z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
  316. z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
  317. dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
  318. z1 -= z2;
  319. z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
  320. dataptr[4] = (DCTELEM)
  321. DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
  322. CONST_BITS-PASS1_BITS);
  323. dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
  324. /* Odd part */
  325. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
  326. tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
  327. tmp0 = tmp1 - tmp2;
  328. tmp1 += tmp2;
  329. tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
  330. tmp1 += tmp2;
  331. tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
  332. tmp0 += tmp3;
  333. tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
  334. dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
  335. dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
  336. dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
  337. dataptr += DCTSIZE; /* advance pointer to next row */
  338. }
  339. /* Pass 2: process columns.
  340. * We remove the PASS1_BITS scaling, but leave the results scaled up
  341. * by an overall factor of 8.
  342. * We must also scale the output by (8/7)**2 = 64/49, which we fold
  343. * into the constant multipliers:
  344. * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
  345. */
  346. dataptr = data;
  347. for (ctr = 0; ctr < 7; ctr++) {
  348. /* Even part */
  349. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
  350. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
  351. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
  352. tmp3 = dataptr[DCTSIZE*3];
  353. tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
  354. tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
  355. tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
  356. z1 = tmp0 + tmp2;
  357. dataptr[DCTSIZE*0] = (DCTELEM)
  358. DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
  359. CONST_BITS+PASS1_BITS);
  360. tmp3 += tmp3;
  361. z1 -= tmp3;
  362. z1 -= tmp3;
  363. z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
  364. z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
  365. z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
  366. dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
  367. z1 -= z2;
  368. z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
  369. dataptr[DCTSIZE*4] = (DCTELEM)
  370. DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
  371. CONST_BITS+PASS1_BITS);
  372. dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
  373. /* Odd part */
  374. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
  375. tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
  376. tmp0 = tmp1 - tmp2;
  377. tmp1 += tmp2;
  378. tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
  379. tmp1 += tmp2;
  380. tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
  381. tmp0 += tmp3;
  382. tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
  383. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
  384. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
  385. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
  386. dataptr++; /* advance pointer to next column */
  387. }
  388. }
  389. /*
  390. * Perform the forward DCT on a 6x6 sample block.
  391. */
  392. GLOBAL(void)
  393. jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  394. {
  395. INT32 tmp0, tmp1, tmp2;
  396. INT32 tmp10, tmp11, tmp12;
  397. DCTELEM *dataptr;
  398. JSAMPROW elemptr;
  399. int ctr;
  400. SHIFT_TEMPS
  401. /* Pre-zero output coefficient block. */
  402. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  403. /* Pass 1: process rows. */
  404. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  405. /* furthermore, we scale the results by 2**PASS1_BITS. */
  406. /* cK represents sqrt(2) * cos(K*pi/12). */
  407. dataptr = data;
  408. for (ctr = 0; ctr < 6; ctr++) {
  409. elemptr = sample_data[ctr] + start_col;
  410. /* Even part */
  411. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
  412. tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
  413. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
  414. tmp10 = tmp0 + tmp2;
  415. tmp12 = tmp0 - tmp2;
  416. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
  417. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
  418. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
  419. /* Apply unsigned->signed conversion */
  420. dataptr[0] = (DCTELEM)
  421. ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
  422. dataptr[2] = (DCTELEM)
  423. DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
  424. CONST_BITS-PASS1_BITS);
  425. dataptr[4] = (DCTELEM)
  426. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
  427. CONST_BITS-PASS1_BITS);
  428. /* Odd part */
  429. tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
  430. CONST_BITS-PASS1_BITS);
  431. dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
  432. dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
  433. dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
  434. dataptr += DCTSIZE; /* advance pointer to next row */
  435. }
  436. /* Pass 2: process columns.
  437. * We remove the PASS1_BITS scaling, but leave the results scaled up
  438. * by an overall factor of 8.
  439. * We must also scale the output by (8/6)**2 = 16/9, which we fold
  440. * into the constant multipliers:
  441. * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
  442. */
  443. dataptr = data;
  444. for (ctr = 0; ctr < 6; ctr++) {
  445. /* Even part */
  446. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
  447. tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
  448. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
  449. tmp10 = tmp0 + tmp2;
  450. tmp12 = tmp0 - tmp2;
  451. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
  452. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
  453. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
  454. dataptr[DCTSIZE*0] = (DCTELEM)
  455. DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
  456. CONST_BITS+PASS1_BITS);
  457. dataptr[DCTSIZE*2] = (DCTELEM)
  458. DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
  459. CONST_BITS+PASS1_BITS);
  460. dataptr[DCTSIZE*4] = (DCTELEM)
  461. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
  462. CONST_BITS+PASS1_BITS);
  463. /* Odd part */
  464. tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
  465. dataptr[DCTSIZE*1] = (DCTELEM)
  466. DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
  467. CONST_BITS+PASS1_BITS);
  468. dataptr[DCTSIZE*3] = (DCTELEM)
  469. DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
  470. CONST_BITS+PASS1_BITS);
  471. dataptr[DCTSIZE*5] = (DCTELEM)
  472. DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
  473. CONST_BITS+PASS1_BITS);
  474. dataptr++; /* advance pointer to next column */
  475. }
  476. }
  477. /*
  478. * Perform the forward DCT on a 5x5 sample block.
  479. */
  480. GLOBAL(void)
  481. jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  482. {
  483. INT32 tmp0, tmp1, tmp2;
  484. INT32 tmp10, tmp11;
  485. DCTELEM *dataptr;
  486. JSAMPROW elemptr;
  487. int ctr;
  488. SHIFT_TEMPS
  489. /* Pre-zero output coefficient block. */
  490. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  491. /* Pass 1: process rows. */
  492. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  493. /* furthermore, we scale the results by 2**PASS1_BITS. */
  494. /* We scale the results further by 2 as part of output adaption */
  495. /* scaling for different DCT size. */
  496. /* cK represents sqrt(2) * cos(K*pi/10). */
  497. dataptr = data;
  498. for (ctr = 0; ctr < 5; ctr++) {
  499. elemptr = sample_data[ctr] + start_col;
  500. /* Even part */
  501. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
  502. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
  503. tmp2 = GETJSAMPLE(elemptr[2]);
  504. tmp10 = tmp0 + tmp1;
  505. tmp11 = tmp0 - tmp1;
  506. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
  507. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
  508. /* Apply unsigned->signed conversion */
  509. dataptr[0] = (DCTELEM)
  510. ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
  511. tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
  512. tmp10 -= tmp2 << 2;
  513. tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
  514. dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
  515. dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
  516. /* Odd part */
  517. tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
  518. dataptr[1] = (DCTELEM)
  519. DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
  520. CONST_BITS-PASS1_BITS-1);
  521. dataptr[3] = (DCTELEM)
  522. DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
  523. CONST_BITS-PASS1_BITS-1);
  524. dataptr += DCTSIZE; /* advance pointer to next row */
  525. }
  526. /* Pass 2: process columns.
  527. * We remove the PASS1_BITS scaling, but leave the results scaled up
  528. * by an overall factor of 8.
  529. * We must also scale the output by (8/5)**2 = 64/25, which we partially
  530. * fold into the constant multipliers (other part was done in pass 1):
  531. * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
  532. */
  533. dataptr = data;
  534. for (ctr = 0; ctr < 5; ctr++) {
  535. /* Even part */
  536. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
  537. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
  538. tmp2 = dataptr[DCTSIZE*2];
  539. tmp10 = tmp0 + tmp1;
  540. tmp11 = tmp0 - tmp1;
  541. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
  542. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
  543. dataptr[DCTSIZE*0] = (DCTELEM)
  544. DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
  545. CONST_BITS+PASS1_BITS);
  546. tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
  547. tmp10 -= tmp2 << 2;
  548. tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
  549. dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
  550. dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
  551. /* Odd part */
  552. tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
  553. dataptr[DCTSIZE*1] = (DCTELEM)
  554. DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
  555. CONST_BITS+PASS1_BITS);
  556. dataptr[DCTSIZE*3] = (DCTELEM)
  557. DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
  558. CONST_BITS+PASS1_BITS);
  559. dataptr++; /* advance pointer to next column */
  560. }
  561. }
  562. /*
  563. * Perform the forward DCT on a 4x4 sample block.
  564. */
  565. GLOBAL(void)
  566. jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  567. {
  568. INT32 tmp0, tmp1;
  569. INT32 tmp10, tmp11;
  570. DCTELEM *dataptr;
  571. JSAMPROW elemptr;
  572. int ctr;
  573. SHIFT_TEMPS
  574. /* Pre-zero output coefficient block. */
  575. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  576. /* Pass 1: process rows. */
  577. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  578. /* furthermore, we scale the results by 2**PASS1_BITS. */
  579. /* We must also scale the output by (8/4)**2 = 2**2, which we add here. */
  580. /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
  581. dataptr = data;
  582. for (ctr = 0; ctr < 4; ctr++) {
  583. elemptr = sample_data[ctr] + start_col;
  584. /* Even part */
  585. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
  586. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
  587. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
  588. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
  589. /* Apply unsigned->signed conversion */
  590. dataptr[0] = (DCTELEM)
  591. ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
  592. dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
  593. /* Odd part */
  594. tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
  595. /* Add fudge factor here for final descale. */
  596. tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
  597. dataptr[1] = (DCTELEM)
  598. RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
  599. CONST_BITS-PASS1_BITS-2);
  600. dataptr[3] = (DCTELEM)
  601. RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
  602. CONST_BITS-PASS1_BITS-2);
  603. dataptr += DCTSIZE; /* advance pointer to next row */
  604. }
  605. /* Pass 2: process columns.
  606. * We remove the PASS1_BITS scaling, but leave the results scaled up
  607. * by an overall factor of 8.
  608. */
  609. dataptr = data;
  610. for (ctr = 0; ctr < 4; ctr++) {
  611. /* Even part */
  612. /* Add fudge factor here for final descale. */
  613. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
  614. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
  615. tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
  616. tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
  617. dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
  618. dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
  619. /* Odd part */
  620. tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
  621. /* Add fudge factor here for final descale. */
  622. tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
  623. dataptr[DCTSIZE*1] = (DCTELEM)
  624. RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
  625. CONST_BITS+PASS1_BITS);
  626. dataptr[DCTSIZE*3] = (DCTELEM)
  627. RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
  628. CONST_BITS+PASS1_BITS);
  629. dataptr++; /* advance pointer to next column */
  630. }
  631. }
  632. /*
  633. * Perform the forward DCT on a 3x3 sample block.
  634. */
  635. GLOBAL(void)
  636. jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  637. {
  638. INT32 tmp0, tmp1, tmp2;
  639. DCTELEM *dataptr;
  640. JSAMPROW elemptr;
  641. int ctr;
  642. SHIFT_TEMPS
  643. /* Pre-zero output coefficient block. */
  644. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  645. /* Pass 1: process rows. */
  646. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  647. /* furthermore, we scale the results by 2**PASS1_BITS. */
  648. /* We scale the results further by 2**2 as part of output adaption */
  649. /* scaling for different DCT size. */
  650. /* cK represents sqrt(2) * cos(K*pi/6). */
  651. dataptr = data;
  652. for (ctr = 0; ctr < 3; ctr++) {
  653. elemptr = sample_data[ctr] + start_col;
  654. /* Even part */
  655. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
  656. tmp1 = GETJSAMPLE(elemptr[1]);
  657. tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
  658. /* Apply unsigned->signed conversion */
  659. dataptr[0] = (DCTELEM)
  660. ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
  661. dataptr[2] = (DCTELEM)
  662. DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
  663. CONST_BITS-PASS1_BITS-2);
  664. /* Odd part */
  665. dataptr[1] = (DCTELEM)
  666. DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
  667. CONST_BITS-PASS1_BITS-2);
  668. dataptr += DCTSIZE; /* advance pointer to next row */
  669. }
  670. /* Pass 2: process columns.
  671. * We remove the PASS1_BITS scaling, but leave the results scaled up
  672. * by an overall factor of 8.
  673. * We must also scale the output by (8/3)**2 = 64/9, which we partially
  674. * fold into the constant multipliers (other part was done in pass 1):
  675. * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
  676. */
  677. dataptr = data;
  678. for (ctr = 0; ctr < 3; ctr++) {
  679. /* Even part */
  680. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
  681. tmp1 = dataptr[DCTSIZE*1];
  682. tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
  683. dataptr[DCTSIZE*0] = (DCTELEM)
  684. DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
  685. CONST_BITS+PASS1_BITS);
  686. dataptr[DCTSIZE*2] = (DCTELEM)
  687. DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
  688. CONST_BITS+PASS1_BITS);
  689. /* Odd part */
  690. dataptr[DCTSIZE*1] = (DCTELEM)
  691. DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
  692. CONST_BITS+PASS1_BITS);
  693. dataptr++; /* advance pointer to next column */
  694. }
  695. }
  696. /*
  697. * Perform the forward DCT on a 2x2 sample block.
  698. */
  699. GLOBAL(void)
  700. jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  701. {
  702. INT32 tmp0, tmp1, tmp2, tmp3;
  703. JSAMPROW elemptr;
  704. /* Pre-zero output coefficient block. */
  705. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  706. /* Pass 1: process rows. */
  707. /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  708. /* Row 0 */
  709. elemptr = sample_data[0] + start_col;
  710. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
  711. tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
  712. /* Row 1 */
  713. elemptr = sample_data[1] + start_col;
  714. tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
  715. tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
  716. /* Pass 2: process columns.
  717. * We leave the results scaled up by an overall factor of 8.
  718. * We must also scale the output by (8/2)**2 = 2**4.
  719. */
  720. /* Column 0 */
  721. /* Apply unsigned->signed conversion */
  722. data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4);
  723. data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
  724. /* Column 1 */
  725. data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
  726. data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
  727. }
  728. /*
  729. * Perform the forward DCT on a 1x1 sample block.
  730. */
  731. GLOBAL(void)
  732. jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  733. {
  734. /* Pre-zero output coefficient block. */
  735. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  736. /* We leave the result scaled up by an overall factor of 8. */
  737. /* We must also scale the output by (8/1)**2 = 2**6. */
  738. /* Apply unsigned->signed conversion */
  739. data[0] = (DCTELEM)
  740. ((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
  741. }
  742. /*
  743. * Perform the forward DCT on a 9x9 sample block.
  744. */
  745. GLOBAL(void)
  746. jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  747. {
  748. INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  749. INT32 tmp10, tmp11, tmp12, tmp13;
  750. INT32 z1, z2;
  751. DCTELEM workspace[8];
  752. DCTELEM *dataptr;
  753. DCTELEM *wsptr;
  754. JSAMPROW elemptr;
  755. int ctr;
  756. SHIFT_TEMPS
  757. /* Pass 1: process rows. */
  758. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  759. /* we scale the results further by 2 as part of output adaption */
  760. /* scaling for different DCT size. */
  761. /* cK represents sqrt(2) * cos(K*pi/18). */
  762. dataptr = data;
  763. ctr = 0;
  764. for (;;) {
  765. elemptr = sample_data[ctr] + start_col;
  766. /* Even part */
  767. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
  768. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
  769. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
  770. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
  771. tmp4 = GETJSAMPLE(elemptr[4]);
  772. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
  773. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
  774. tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
  775. tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
  776. z1 = tmp0 + tmp2 + tmp3;
  777. z2 = tmp1 + tmp4;
  778. /* Apply unsigned->signed conversion */
  779. dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
  780. dataptr[6] = (DCTELEM)
  781. DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
  782. CONST_BITS-1);
  783. z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
  784. z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
  785. dataptr[2] = (DCTELEM)
  786. DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
  787. + z1 + z2, CONST_BITS-1);
  788. dataptr[4] = (DCTELEM)
  789. DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
  790. + z1 - z2, CONST_BITS-1);
  791. /* Odd part */
  792. dataptr[3] = (DCTELEM)
  793. DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
  794. CONST_BITS-1);
  795. tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
  796. tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
  797. tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
  798. dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
  799. tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
  800. dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
  801. dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
  802. ctr++;
  803. if (ctr != DCTSIZE) {
  804. if (ctr == 9)
  805. break; /* Done. */
  806. dataptr += DCTSIZE; /* advance pointer to next row */
  807. } else
  808. dataptr = workspace; /* switch pointer to extended workspace */
  809. }
  810. /* Pass 2: process columns.
  811. * We leave the results scaled up by an overall factor of 8.
  812. * We must also scale the output by (8/9)**2 = 64/81, which we partially
  813. * fold into the constant multipliers and final/initial shifting:
  814. * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
  815. */
  816. dataptr = data;
  817. wsptr = workspace;
  818. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  819. /* Even part */
  820. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
  821. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
  822. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
  823. tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
  824. tmp4 = dataptr[DCTSIZE*4];
  825. tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
  826. tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
  827. tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
  828. tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
  829. z1 = tmp0 + tmp2 + tmp3;
  830. z2 = tmp1 + tmp4;
  831. dataptr[DCTSIZE*0] = (DCTELEM)
  832. DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
  833. CONST_BITS+2);
  834. dataptr[DCTSIZE*6] = (DCTELEM)
  835. DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
  836. CONST_BITS+2);
  837. z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
  838. z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
  839. dataptr[DCTSIZE*2] = (DCTELEM)
  840. DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
  841. + z1 + z2, CONST_BITS+2);
  842. dataptr[DCTSIZE*4] = (DCTELEM)
  843. DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
  844. + z1 - z2, CONST_BITS+2);
  845. /* Odd part */
  846. dataptr[DCTSIZE*3] = (DCTELEM)
  847. DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
  848. CONST_BITS+2);
  849. tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
  850. tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
  851. tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
  852. dataptr[DCTSIZE*1] = (DCTELEM)
  853. DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
  854. tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
  855. dataptr[DCTSIZE*5] = (DCTELEM)
  856. DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
  857. dataptr[DCTSIZE*7] = (DCTELEM)
  858. DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
  859. dataptr++; /* advance pointer to next column */
  860. wsptr++; /* advance pointer to next column */
  861. }
  862. }
  863. /*
  864. * Perform the forward DCT on a 10x10 sample block.
  865. */
  866. GLOBAL(void)
  867. jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  868. {
  869. INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  870. INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  871. DCTELEM workspace[8*2];
  872. DCTELEM *dataptr;
  873. DCTELEM *wsptr;
  874. JSAMPROW elemptr;
  875. int ctr;
  876. SHIFT_TEMPS
  877. /* Pass 1: process rows. */
  878. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  879. /* we scale the results further by 2 as part of output adaption */
  880. /* scaling for different DCT size. */
  881. /* cK represents sqrt(2) * cos(K*pi/20). */
  882. dataptr = data;
  883. ctr = 0;
  884. for (;;) {
  885. elemptr = sample_data[ctr] + start_col;
  886. /* Even part */
  887. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
  888. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
  889. tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
  890. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
  891. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
  892. tmp10 = tmp0 + tmp4;
  893. tmp13 = tmp0 - tmp4;
  894. tmp11 = tmp1 + tmp3;
  895. tmp14 = tmp1 - tmp3;
  896. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
  897. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
  898. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
  899. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
  900. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
  901. /* Apply unsigned->signed conversion */
  902. dataptr[0] = (DCTELEM)
  903. ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
  904. tmp12 += tmp12;
  905. dataptr[4] = (DCTELEM)
  906. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
  907. MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
  908. CONST_BITS-1);
  909. tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
  910. dataptr[2] = (DCTELEM)
  911. DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
  912. CONST_BITS-1);
  913. dataptr[6] = (DCTELEM)
  914. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
  915. CONST_BITS-1);
  916. /* Odd part */
  917. tmp10 = tmp0 + tmp4;
  918. tmp11 = tmp1 - tmp3;
  919. dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
  920. tmp2 <<= CONST_BITS;
  921. dataptr[1] = (DCTELEM)
  922. DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
  923. MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
  924. MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
  925. MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
  926. CONST_BITS-1);
  927. tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
  928. MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
  929. tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
  930. (tmp11 << (CONST_BITS - 1)) - tmp2;
  931. dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
  932. dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
  933. ctr++;
  934. if (ctr != DCTSIZE) {
  935. if (ctr == 10)
  936. break; /* Done. */
  937. dataptr += DCTSIZE; /* advance pointer to next row */
  938. } else
  939. dataptr = workspace; /* switch pointer to extended workspace */
  940. }
  941. /* Pass 2: process columns.
  942. * We leave the results scaled up by an overall factor of 8.
  943. * We must also scale the output by (8/10)**2 = 16/25, which we partially
  944. * fold into the constant multipliers and final/initial shifting:
  945. * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
  946. */
  947. dataptr = data;
  948. wsptr = workspace;
  949. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  950. /* Even part */
  951. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
  952. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
  953. tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
  954. tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
  955. tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
  956. tmp10 = tmp0 + tmp4;
  957. tmp13 = tmp0 - tmp4;
  958. tmp11 = tmp1 + tmp3;
  959. tmp14 = tmp1 - tmp3;
  960. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
  961. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
  962. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
  963. tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
  964. tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
  965. dataptr[DCTSIZE*0] = (DCTELEM)
  966. DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
  967. CONST_BITS+2);
  968. tmp12 += tmp12;
  969. dataptr[DCTSIZE*4] = (DCTELEM)
  970. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
  971. MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
  972. CONST_BITS+2);
  973. tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
  974. dataptr[DCTSIZE*2] = (DCTELEM)
  975. DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
  976. CONST_BITS+2);
  977. dataptr[DCTSIZE*6] = (DCTELEM)
  978. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
  979. CONST_BITS+2);
  980. /* Odd part */
  981. tmp10 = tmp0 + tmp4;
  982. tmp11 = tmp1 - tmp3;
  983. dataptr[DCTSIZE*5] = (DCTELEM)
  984. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
  985. CONST_BITS+2);
  986. tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
  987. dataptr[DCTSIZE*1] = (DCTELEM)
  988. DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
  989. MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
  990. MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
  991. MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
  992. CONST_BITS+2);
  993. tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
  994. MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
  995. tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
  996. MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
  997. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
  998. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
  999. dataptr++; /* advance pointer to next column */
  1000. wsptr++; /* advance pointer to next column */
  1001. }
  1002. }
  1003. /*
  1004. * Perform the forward DCT on an 11x11 sample block.
  1005. */
  1006. GLOBAL(void)
  1007. jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  1008. {
  1009. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  1010. INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  1011. INT32 z1, z2, z3;
  1012. DCTELEM workspace[8*3];
  1013. DCTELEM *dataptr;
  1014. DCTELEM *wsptr;
  1015. JSAMPROW elemptr;
  1016. int ctr;
  1017. SHIFT_TEMPS
  1018. /* Pass 1: process rows. */
  1019. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  1020. /* we scale the results further by 2 as part of output adaption */
  1021. /* scaling for different DCT size. */
  1022. /* cK represents sqrt(2) * cos(K*pi/22). */
  1023. dataptr = data;
  1024. ctr = 0;
  1025. for (;;) {
  1026. elemptr = sample_data[ctr] + start_col;
  1027. /* Even part */
  1028. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
  1029. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
  1030. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
  1031. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
  1032. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
  1033. tmp5 = GETJSAMPLE(elemptr[5]);
  1034. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
  1035. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
  1036. tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
  1037. tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
  1038. tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
  1039. /* Apply unsigned->signed conversion */
  1040. dataptr[0] = (DCTELEM)
  1041. ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
  1042. tmp5 += tmp5;
  1043. tmp0 -= tmp5;
  1044. tmp1 -= tmp5;
  1045. tmp2 -= tmp5;
  1046. tmp3 -= tmp5;
  1047. tmp4 -= tmp5;
  1048. z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
  1049. MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
  1050. z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
  1051. z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
  1052. dataptr[2] = (DCTELEM)
  1053. DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
  1054. - MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
  1055. CONST_BITS-1);
  1056. dataptr[4] = (DCTELEM)
  1057. DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
  1058. - MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
  1059. + MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
  1060. CONST_BITS-1);
  1061. dataptr[6] = (DCTELEM)
  1062. DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
  1063. - MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
  1064. CONST_BITS-1);
  1065. /* Odd part */
  1066. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
  1067. tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
  1068. tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
  1069. tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
  1070. + MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
  1071. tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
  1072. tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
  1073. tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
  1074. - MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
  1075. tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
  1076. tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
  1077. + MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
  1078. tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
  1079. - MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
  1080. dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
  1081. dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
  1082. dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
  1083. dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
  1084. ctr++;
  1085. if (ctr != DCTSIZE) {
  1086. if (ctr == 11)
  1087. break; /* Done. */
  1088. dataptr += DCTSIZE; /* advance pointer to next row */
  1089. } else
  1090. dataptr = workspace; /* switch pointer to extended workspace */
  1091. }
  1092. /* Pass 2: process columns.
  1093. * We leave the results scaled up by an overall factor of 8.
  1094. * We must also scale the output by (8/11)**2 = 64/121, which we partially
  1095. * fold into the constant multipliers and final/initial shifting:
  1096. * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
  1097. */
  1098. dataptr = data;
  1099. wsptr = workspace;
  1100. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  1101. /* Even part */
  1102. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
  1103. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
  1104. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
  1105. tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
  1106. tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
  1107. tmp5 = dataptr[DCTSIZE*5];
  1108. tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
  1109. tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
  1110. tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
  1111. tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
  1112. tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
  1113. dataptr[DCTSIZE*0] = (DCTELEM)
  1114. DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
  1115. FIX(1.057851240)), /* 128/121 */
  1116. CONST_BITS+2);
  1117. tmp5 += tmp5;
  1118. tmp0 -= tmp5;
  1119. tmp1 -= tmp5;
  1120. tmp2 -= tmp5;
  1121. tmp3 -= tmp5;
  1122. tmp4 -= tmp5;
  1123. z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
  1124. MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
  1125. z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
  1126. z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
  1127. dataptr[DCTSIZE*2] = (DCTELEM)
  1128. DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
  1129. - MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
  1130. CONST_BITS+2);
  1131. dataptr[DCTSIZE*4] = (DCTELEM)
  1132. DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
  1133. - MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
  1134. + MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
  1135. CONST_BITS+2);
  1136. dataptr[DCTSIZE*6] = (DCTELEM)
  1137. DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
  1138. - MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
  1139. CONST_BITS+2);
  1140. /* Odd part */
  1141. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
  1142. tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
  1143. tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
  1144. tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
  1145. + MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
  1146. tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
  1147. tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
  1148. tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
  1149. - MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
  1150. tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
  1151. tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
  1152. + MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
  1153. tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
  1154. - MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
  1155. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
  1156. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
  1157. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
  1158. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
  1159. dataptr++; /* advance pointer to next column */
  1160. wsptr++; /* advance pointer to next column */
  1161. }
  1162. }
  1163. /*
  1164. * Perform the forward DCT on a 12x12 sample block.
  1165. */
  1166. GLOBAL(void)
  1167. jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  1168. {
  1169. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  1170. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  1171. DCTELEM workspace[8*4];
  1172. DCTELEM *dataptr;
  1173. DCTELEM *wsptr;
  1174. JSAMPROW elemptr;
  1175. int ctr;
  1176. SHIFT_TEMPS
  1177. /* Pass 1: process rows. */
  1178. /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  1179. /* cK represents sqrt(2) * cos(K*pi/24). */
  1180. dataptr = data;
  1181. ctr = 0;
  1182. for (;;) {
  1183. elemptr = sample_data[ctr] + start_col;
  1184. /* Even part */
  1185. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
  1186. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
  1187. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
  1188. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
  1189. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
  1190. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
  1191. tmp10 = tmp0 + tmp5;
  1192. tmp13 = tmp0 - tmp5;
  1193. tmp11 = tmp1 + tmp4;
  1194. tmp14 = tmp1 - tmp4;
  1195. tmp12 = tmp2 + tmp3;
  1196. tmp15 = tmp2 - tmp3;
  1197. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
  1198. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
  1199. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
  1200. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
  1201. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
  1202. tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
  1203. /* Apply unsigned->signed conversion */
  1204. dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
  1205. dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
  1206. dataptr[4] = (DCTELEM)
  1207. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
  1208. CONST_BITS);
  1209. dataptr[2] = (DCTELEM)
  1210. DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
  1211. CONST_BITS);
  1212. /* Odd part */
  1213. tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
  1214. tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
  1215. tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
  1216. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
  1217. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
  1218. tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
  1219. + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
  1220. tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
  1221. tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
  1222. + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
  1223. tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
  1224. - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
  1225. tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
  1226. - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
  1227. dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
  1228. dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
  1229. dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
  1230. dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
  1231. ctr++;
  1232. if (ctr != DCTSIZE) {
  1233. if (ctr == 12)
  1234. break; /* Done. */
  1235. dataptr += DCTSIZE; /* advance pointer to next row */
  1236. } else
  1237. dataptr = workspace; /* switch pointer to extended workspace */
  1238. }
  1239. /* Pass 2: process columns.
  1240. * We leave the results scaled up by an overall factor of 8.
  1241. * We must also scale the output by (8/12)**2 = 4/9, which we partially
  1242. * fold into the constant multipliers and final shifting:
  1243. * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
  1244. */
  1245. dataptr = data;
  1246. wsptr = workspace;
  1247. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  1248. /* Even part */
  1249. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
  1250. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
  1251. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
  1252. tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
  1253. tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
  1254. tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
  1255. tmp10 = tmp0 + tmp5;
  1256. tmp13 = tmp0 - tmp5;
  1257. tmp11 = tmp1 + tmp4;
  1258. tmp14 = tmp1 - tmp4;
  1259. tmp12 = tmp2 + tmp3;
  1260. tmp15 = tmp2 - tmp3;
  1261. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
  1262. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
  1263. tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
  1264. tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
  1265. tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
  1266. tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
  1267. dataptr[DCTSIZE*0] = (DCTELEM)
  1268. DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
  1269. CONST_BITS+1);
  1270. dataptr[DCTSIZE*6] = (DCTELEM)
  1271. DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
  1272. CONST_BITS+1);
  1273. dataptr[DCTSIZE*4] = (DCTELEM)
  1274. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
  1275. CONST_BITS+1);
  1276. dataptr[DCTSIZE*2] = (DCTELEM)
  1277. DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
  1278. MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
  1279. CONST_BITS+1);
  1280. /* Odd part */
  1281. tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
  1282. tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
  1283. tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
  1284. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
  1285. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
  1286. tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
  1287. + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
  1288. tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
  1289. tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
  1290. + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
  1291. tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
  1292. - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
  1293. tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
  1294. - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
  1295. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
  1296. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
  1297. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
  1298. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
  1299. dataptr++; /* advance pointer to next column */
  1300. wsptr++; /* advance pointer to next column */
  1301. }
  1302. }
  1303. /*
  1304. * Perform the forward DCT on a 13x13 sample block.
  1305. */
  1306. GLOBAL(void)
  1307. jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  1308. {
  1309. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  1310. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  1311. INT32 z1, z2;
  1312. DCTELEM workspace[8*5];
  1313. DCTELEM *dataptr;
  1314. DCTELEM *wsptr;
  1315. JSAMPROW elemptr;
  1316. int ctr;
  1317. SHIFT_TEMPS
  1318. /* Pass 1: process rows. */
  1319. /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  1320. /* cK represents sqrt(2) * cos(K*pi/26). */
  1321. dataptr = data;
  1322. ctr = 0;
  1323. for (;;) {
  1324. elemptr = sample_data[ctr] + start_col;
  1325. /* Even part */
  1326. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
  1327. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
  1328. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
  1329. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
  1330. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
  1331. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
  1332. tmp6 = GETJSAMPLE(elemptr[6]);
  1333. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
  1334. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
  1335. tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
  1336. tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
  1337. tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
  1338. tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
  1339. /* Apply unsigned->signed conversion */
  1340. dataptr[0] = (DCTELEM)
  1341. (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
  1342. tmp6 += tmp6;
  1343. tmp0 -= tmp6;
  1344. tmp1 -= tmp6;
  1345. tmp2 -= tmp6;
  1346. tmp3 -= tmp6;
  1347. tmp4 -= tmp6;
  1348. tmp5 -= tmp6;
  1349. dataptr[2] = (DCTELEM)
  1350. DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
  1351. MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
  1352. MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
  1353. MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
  1354. MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
  1355. MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
  1356. CONST_BITS);
  1357. z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
  1358. MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
  1359. MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
  1360. z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
  1361. MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
  1362. MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
  1363. dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
  1364. dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
  1365. /* Odd part */
  1366. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
  1367. tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
  1368. tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
  1369. MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
  1370. tmp0 = tmp1 + tmp2 + tmp3 -
  1371. MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
  1372. MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
  1373. tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
  1374. MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
  1375. tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
  1376. tmp1 += tmp4 + tmp5 +
  1377. MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
  1378. MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
  1379. tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
  1380. tmp2 += tmp4 + tmp6 -
  1381. MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
  1382. MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
  1383. tmp3 += tmp5 + tmp6 +
  1384. MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
  1385. MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
  1386. dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
  1387. dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
  1388. dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
  1389. dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
  1390. ctr++;
  1391. if (ctr != DCTSIZE) {
  1392. if (ctr == 13)
  1393. break; /* Done. */
  1394. dataptr += DCTSIZE; /* advance pointer to next row */
  1395. } else
  1396. dataptr = workspace; /* switch pointer to extended workspace */
  1397. }
  1398. /* Pass 2: process columns.
  1399. * We leave the results scaled up by an overall factor of 8.
  1400. * We must also scale the output by (8/13)**2 = 64/169, which we partially
  1401. * fold into the constant multipliers and final shifting:
  1402. * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
  1403. */
  1404. dataptr = data;
  1405. wsptr = workspace;
  1406. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  1407. /* Even part */
  1408. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
  1409. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
  1410. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
  1411. tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
  1412. tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
  1413. tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
  1414. tmp6 = dataptr[DCTSIZE*6];
  1415. tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
  1416. tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
  1417. tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
  1418. tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
  1419. tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
  1420. tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
  1421. dataptr[DCTSIZE*0] = (DCTELEM)
  1422. DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
  1423. FIX(0.757396450)), /* 128/169 */
  1424. CONST_BITS+1);
  1425. tmp6 += tmp6;
  1426. tmp0 -= tmp6;
  1427. tmp1 -= tmp6;
  1428. tmp2 -= tmp6;
  1429. tmp3 -= tmp6;
  1430. tmp4 -= tmp6;
  1431. tmp5 -= tmp6;
  1432. dataptr[DCTSIZE*2] = (DCTELEM)
  1433. DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
  1434. MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
  1435. MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
  1436. MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
  1437. MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
  1438. MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
  1439. CONST_BITS+1);
  1440. z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
  1441. MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
  1442. MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
  1443. z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
  1444. MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
  1445. MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
  1446. dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
  1447. dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
  1448. /* Odd part */
  1449. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
  1450. tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
  1451. tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
  1452. MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
  1453. tmp0 = tmp1 + tmp2 + tmp3 -
  1454. MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
  1455. MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
  1456. tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
  1457. MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
  1458. tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
  1459. tmp1 += tmp4 + tmp5 +
  1460. MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
  1461. MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
  1462. tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
  1463. tmp2 += tmp4 + tmp6 -
  1464. MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
  1465. MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
  1466. tmp3 += tmp5 + tmp6 +
  1467. MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
  1468. MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
  1469. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
  1470. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
  1471. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
  1472. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
  1473. dataptr++; /* advance pointer to next column */
  1474. wsptr++; /* advance pointer to next column */
  1475. }
  1476. }
  1477. /*
  1478. * Perform the forward DCT on a 14x14 sample block.
  1479. */
  1480. GLOBAL(void)
  1481. jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  1482. {
  1483. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  1484. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  1485. DCTELEM workspace[8*6];
  1486. DCTELEM *dataptr;
  1487. DCTELEM *wsptr;
  1488. JSAMPROW elemptr;
  1489. int ctr;
  1490. SHIFT_TEMPS
  1491. /* Pass 1: process rows. */
  1492. /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  1493. /* cK represents sqrt(2) * cos(K*pi/28). */
  1494. dataptr = data;
  1495. ctr = 0;
  1496. for (;;) {
  1497. elemptr = sample_data[ctr] + start_col;
  1498. /* Even part */
  1499. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
  1500. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
  1501. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
  1502. tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
  1503. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
  1504. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
  1505. tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
  1506. tmp10 = tmp0 + tmp6;
  1507. tmp14 = tmp0 - tmp6;
  1508. tmp11 = tmp1 + tmp5;
  1509. tmp15 = tmp1 - tmp5;
  1510. tmp12 = tmp2 + tmp4;
  1511. tmp16 = tmp2 - tmp4;
  1512. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
  1513. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
  1514. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
  1515. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
  1516. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
  1517. tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
  1518. tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
  1519. /* Apply unsigned->signed conversion */
  1520. dataptr[0] = (DCTELEM)
  1521. (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
  1522. tmp13 += tmp13;
  1523. dataptr[4] = (DCTELEM)
  1524. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
  1525. MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
  1526. MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
  1527. CONST_BITS);
  1528. tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
  1529. dataptr[2] = (DCTELEM)
  1530. DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
  1531. + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
  1532. CONST_BITS);
  1533. dataptr[6] = (DCTELEM)
  1534. DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
  1535. - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
  1536. CONST_BITS);
  1537. /* Odd part */
  1538. tmp10 = tmp1 + tmp2;
  1539. tmp11 = tmp5 - tmp4;
  1540. dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
  1541. tmp3 <<= CONST_BITS;
  1542. tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
  1543. tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
  1544. tmp10 += tmp11 - tmp3;
  1545. tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
  1546. MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
  1547. dataptr[5] = (DCTELEM)
  1548. DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
  1549. + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
  1550. CONST_BITS);
  1551. tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
  1552. MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
  1553. dataptr[3] = (DCTELEM)
  1554. DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
  1555. - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
  1556. CONST_BITS);
  1557. dataptr[1] = (DCTELEM)
  1558. DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
  1559. MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
  1560. CONST_BITS);
  1561. ctr++;
  1562. if (ctr != DCTSIZE) {
  1563. if (ctr == 14)
  1564. break; /* Done. */
  1565. dataptr += DCTSIZE; /* advance pointer to next row */
  1566. } else
  1567. dataptr = workspace; /* switch pointer to extended workspace */
  1568. }
  1569. /* Pass 2: process columns.
  1570. * We leave the results scaled up by an overall factor of 8.
  1571. * We must also scale the output by (8/14)**2 = 16/49, which we partially
  1572. * fold into the constant multipliers and final shifting:
  1573. * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
  1574. */
  1575. dataptr = data;
  1576. wsptr = workspace;
  1577. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  1578. /* Even part */
  1579. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
  1580. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
  1581. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
  1582. tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
  1583. tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
  1584. tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
  1585. tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
  1586. tmp10 = tmp0 + tmp6;
  1587. tmp14 = tmp0 - tmp6;
  1588. tmp11 = tmp1 + tmp5;
  1589. tmp15 = tmp1 - tmp5;
  1590. tmp12 = tmp2 + tmp4;
  1591. tmp16 = tmp2 - tmp4;
  1592. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
  1593. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
  1594. tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
  1595. tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
  1596. tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
  1597. tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
  1598. tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
  1599. dataptr[DCTSIZE*0] = (DCTELEM)
  1600. DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
  1601. FIX(0.653061224)), /* 32/49 */
  1602. CONST_BITS+1);
  1603. tmp13 += tmp13;
  1604. dataptr[DCTSIZE*4] = (DCTELEM)
  1605. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
  1606. MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
  1607. MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
  1608. CONST_BITS+1);
  1609. tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
  1610. dataptr[DCTSIZE*2] = (DCTELEM)
  1611. DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
  1612. + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
  1613. CONST_BITS+1);
  1614. dataptr[DCTSIZE*6] = (DCTELEM)
  1615. DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
  1616. - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
  1617. CONST_BITS+1);
  1618. /* Odd part */
  1619. tmp10 = tmp1 + tmp2;
  1620. tmp11 = tmp5 - tmp4;
  1621. dataptr[DCTSIZE*7] = (DCTELEM)
  1622. DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
  1623. FIX(0.653061224)), /* 32/49 */
  1624. CONST_BITS+1);
  1625. tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
  1626. tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
  1627. tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
  1628. tmp10 += tmp11 - tmp3;
  1629. tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
  1630. MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
  1631. dataptr[DCTSIZE*5] = (DCTELEM)
  1632. DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
  1633. + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
  1634. CONST_BITS+1);
  1635. tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
  1636. MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
  1637. dataptr[DCTSIZE*3] = (DCTELEM)
  1638. DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
  1639. - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
  1640. CONST_BITS+1);
  1641. dataptr[DCTSIZE*1] = (DCTELEM)
  1642. DESCALE(tmp11 + tmp12 + tmp3
  1643. - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
  1644. - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
  1645. CONST_BITS+1);
  1646. dataptr++; /* advance pointer to next column */
  1647. wsptr++; /* advance pointer to next column */
  1648. }
  1649. }
  1650. /*
  1651. * Perform the forward DCT on a 15x15 sample block.
  1652. */
  1653. GLOBAL(void)
  1654. jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  1655. {
  1656. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1657. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  1658. INT32 z1, z2, z3;
  1659. DCTELEM workspace[8*7];
  1660. DCTELEM *dataptr;
  1661. DCTELEM *wsptr;
  1662. JSAMPROW elemptr;
  1663. int ctr;
  1664. SHIFT_TEMPS
  1665. /* Pass 1: process rows. */
  1666. /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  1667. /* cK represents sqrt(2) * cos(K*pi/30). */
  1668. dataptr = data;
  1669. ctr = 0;
  1670. for (;;) {
  1671. elemptr = sample_data[ctr] + start_col;
  1672. /* Even part */
  1673. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
  1674. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
  1675. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
  1676. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
  1677. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
  1678. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
  1679. tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
  1680. tmp7 = GETJSAMPLE(elemptr[7]);
  1681. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
  1682. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
  1683. tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
  1684. tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
  1685. tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
  1686. tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
  1687. tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
  1688. z1 = tmp0 + tmp4 + tmp5;
  1689. z2 = tmp1 + tmp3 + tmp6;
  1690. z3 = tmp2 + tmp7;
  1691. /* Apply unsigned->signed conversion */
  1692. dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
  1693. z3 += z3;
  1694. dataptr[6] = (DCTELEM)
  1695. DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
  1696. MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
  1697. CONST_BITS);
  1698. tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
  1699. z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
  1700. MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
  1701. z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
  1702. MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
  1703. z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
  1704. MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
  1705. MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
  1706. dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
  1707. dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
  1708. /* Odd part */
  1709. tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
  1710. FIX(1.224744871)); /* c5 */
  1711. tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
  1712. MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
  1713. tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
  1714. tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
  1715. MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
  1716. MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
  1717. tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
  1718. MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
  1719. MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
  1720. tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
  1721. MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
  1722. MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
  1723. dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
  1724. dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
  1725. dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
  1726. dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
  1727. ctr++;
  1728. if (ctr != DCTSIZE) {
  1729. if (ctr == 15)
  1730. break; /* Done. */
  1731. dataptr += DCTSIZE; /* advance pointer to next row */
  1732. } else
  1733. dataptr = workspace; /* switch pointer to extended workspace */
  1734. }
  1735. /* Pass 2: process columns.
  1736. * We leave the results scaled up by an overall factor of 8.
  1737. * We must also scale the output by (8/15)**2 = 64/225, which we partially
  1738. * fold into the constant multipliers and final shifting:
  1739. * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
  1740. */
  1741. dataptr = data;
  1742. wsptr = workspace;
  1743. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  1744. /* Even part */
  1745. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
  1746. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
  1747. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
  1748. tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
  1749. tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
  1750. tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
  1751. tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
  1752. tmp7 = dataptr[DCTSIZE*7];
  1753. tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
  1754. tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
  1755. tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
  1756. tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
  1757. tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
  1758. tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
  1759. tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
  1760. z1 = tmp0 + tmp4 + tmp5;
  1761. z2 = tmp1 + tmp3 + tmp6;
  1762. z3 = tmp2 + tmp7;
  1763. dataptr[DCTSIZE*0] = (DCTELEM)
  1764. DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
  1765. CONST_BITS+2);
  1766. z3 += z3;
  1767. dataptr[DCTSIZE*6] = (DCTELEM)
  1768. DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
  1769. MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
  1770. CONST_BITS+2);
  1771. tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
  1772. z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
  1773. MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
  1774. z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
  1775. MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
  1776. z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
  1777. MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
  1778. MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
  1779. dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
  1780. dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
  1781. /* Odd part */
  1782. tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
  1783. FIX(1.393487498)); /* c5 */
  1784. tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
  1785. MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
  1786. tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
  1787. tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
  1788. MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
  1789. MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
  1790. tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
  1791. MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
  1792. MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
  1793. tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
  1794. MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
  1795. MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
  1796. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
  1797. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
  1798. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
  1799. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
  1800. dataptr++; /* advance pointer to next column */
  1801. wsptr++; /* advance pointer to next column */
  1802. }
  1803. }
  1804. /*
  1805. * Perform the forward DCT on a 16x16 sample block.
  1806. */
  1807. GLOBAL(void)
  1808. jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  1809. {
  1810. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1811. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
  1812. DCTELEM workspace[DCTSIZE2];
  1813. DCTELEM *dataptr;
  1814. DCTELEM *wsptr;
  1815. JSAMPROW elemptr;
  1816. int ctr;
  1817. SHIFT_TEMPS
  1818. /* Pass 1: process rows. */
  1819. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  1820. /* furthermore, we scale the results by 2**PASS1_BITS. */
  1821. /* cK represents sqrt(2) * cos(K*pi/32). */
  1822. dataptr = data;
  1823. ctr = 0;
  1824. for (;;) {
  1825. elemptr = sample_data[ctr] + start_col;
  1826. /* Even part */
  1827. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
  1828. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
  1829. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
  1830. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
  1831. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
  1832. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
  1833. tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
  1834. tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
  1835. tmp10 = tmp0 + tmp7;
  1836. tmp14 = tmp0 - tmp7;
  1837. tmp11 = tmp1 + tmp6;
  1838. tmp15 = tmp1 - tmp6;
  1839. tmp12 = tmp2 + tmp5;
  1840. tmp16 = tmp2 - tmp5;
  1841. tmp13 = tmp3 + tmp4;
  1842. tmp17 = tmp3 - tmp4;
  1843. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
  1844. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
  1845. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
  1846. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
  1847. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
  1848. tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
  1849. tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
  1850. tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
  1851. /* Apply unsigned->signed conversion */
  1852. dataptr[0] = (DCTELEM)
  1853. ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
  1854. dataptr[4] = (DCTELEM)
  1855. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
  1856. MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
  1857. CONST_BITS-PASS1_BITS);
  1858. tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
  1859. MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
  1860. dataptr[2] = (DCTELEM)
  1861. DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
  1862. + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
  1863. CONST_BITS-PASS1_BITS);
  1864. dataptr[6] = (DCTELEM)
  1865. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
  1866. - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
  1867. CONST_BITS-PASS1_BITS);
  1868. /* Odd part */
  1869. tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
  1870. MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
  1871. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
  1872. MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
  1873. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
  1874. MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
  1875. tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
  1876. MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
  1877. tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
  1878. MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
  1879. tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
  1880. MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
  1881. tmp10 = tmp11 + tmp12 + tmp13 -
  1882. MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
  1883. MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
  1884. tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
  1885. - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
  1886. tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
  1887. + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
  1888. tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
  1889. + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
  1890. dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
  1891. dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
  1892. dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
  1893. dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
  1894. ctr++;
  1895. if (ctr != DCTSIZE) {
  1896. if (ctr == DCTSIZE * 2)
  1897. break; /* Done. */
  1898. dataptr += DCTSIZE; /* advance pointer to next row */
  1899. } else
  1900. dataptr = workspace; /* switch pointer to extended workspace */
  1901. }
  1902. /* Pass 2: process columns.
  1903. * We remove the PASS1_BITS scaling, but leave the results scaled up
  1904. * by an overall factor of 8.
  1905. * We must also scale the output by (8/16)**2 = 1/2**2.
  1906. */
  1907. dataptr = data;
  1908. wsptr = workspace;
  1909. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  1910. /* Even part */
  1911. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
  1912. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
  1913. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
  1914. tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
  1915. tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
  1916. tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
  1917. tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
  1918. tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
  1919. tmp10 = tmp0 + tmp7;
  1920. tmp14 = tmp0 - tmp7;
  1921. tmp11 = tmp1 + tmp6;
  1922. tmp15 = tmp1 - tmp6;
  1923. tmp12 = tmp2 + tmp5;
  1924. tmp16 = tmp2 - tmp5;
  1925. tmp13 = tmp3 + tmp4;
  1926. tmp17 = tmp3 - tmp4;
  1927. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
  1928. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
  1929. tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
  1930. tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
  1931. tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
  1932. tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
  1933. tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
  1934. tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
  1935. dataptr[DCTSIZE*0] = (DCTELEM)
  1936. DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
  1937. dataptr[DCTSIZE*4] = (DCTELEM)
  1938. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
  1939. MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
  1940. CONST_BITS+PASS1_BITS+2);
  1941. tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
  1942. MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
  1943. dataptr[DCTSIZE*2] = (DCTELEM)
  1944. DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
  1945. + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */
  1946. CONST_BITS+PASS1_BITS+2);
  1947. dataptr[DCTSIZE*6] = (DCTELEM)
  1948. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
  1949. - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
  1950. CONST_BITS+PASS1_BITS+2);
  1951. /* Odd part */
  1952. tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
  1953. MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
  1954. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
  1955. MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
  1956. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
  1957. MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
  1958. tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
  1959. MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
  1960. tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
  1961. MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
  1962. tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
  1963. MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
  1964. tmp10 = tmp11 + tmp12 + tmp13 -
  1965. MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
  1966. MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
  1967. tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
  1968. - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
  1969. tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
  1970. + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
  1971. tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
  1972. + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
  1973. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
  1974. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
  1975. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
  1976. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
  1977. dataptr++; /* advance pointer to next column */
  1978. wsptr++; /* advance pointer to next column */
  1979. }
  1980. }
  1981. /*
  1982. * Perform the forward DCT on a 16x8 sample block.
  1983. *
  1984. * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
  1985. */
  1986. GLOBAL(void)
  1987. jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  1988. {
  1989. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1990. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
  1991. INT32 z1;
  1992. DCTELEM *dataptr;
  1993. JSAMPROW elemptr;
  1994. int ctr;
  1995. SHIFT_TEMPS
  1996. /* Pass 1: process rows. */
  1997. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  1998. /* furthermore, we scale the results by 2**PASS1_BITS. */
  1999. /* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32). */
  2000. dataptr = data;
  2001. ctr = 0;
  2002. for (ctr = 0; ctr < DCTSIZE; ctr++) {
  2003. elemptr = sample_data[ctr] + start_col;
  2004. /* Even part */
  2005. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
  2006. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
  2007. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
  2008. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
  2009. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
  2010. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
  2011. tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
  2012. tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
  2013. tmp10 = tmp0 + tmp7;
  2014. tmp14 = tmp0 - tmp7;
  2015. tmp11 = tmp1 + tmp6;
  2016. tmp15 = tmp1 - tmp6;
  2017. tmp12 = tmp2 + tmp5;
  2018. tmp16 = tmp2 - tmp5;
  2019. tmp13 = tmp3 + tmp4;
  2020. tmp17 = tmp3 - tmp4;
  2021. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
  2022. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
  2023. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
  2024. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
  2025. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
  2026. tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
  2027. tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
  2028. tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
  2029. /* Apply unsigned->signed conversion */
  2030. dataptr[0] = (DCTELEM)
  2031. ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
  2032. dataptr[4] = (DCTELEM)
  2033. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
  2034. MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
  2035. CONST_BITS-PASS1_BITS);
  2036. tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
  2037. MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
  2038. dataptr[2] = (DCTELEM)
  2039. DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
  2040. + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
  2041. CONST_BITS-PASS1_BITS);
  2042. dataptr[6] = (DCTELEM)
  2043. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
  2044. - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
  2045. CONST_BITS-PASS1_BITS);
  2046. /* Odd part */
  2047. tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
  2048. MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
  2049. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
  2050. MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
  2051. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
  2052. MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
  2053. tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
  2054. MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
  2055. tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
  2056. MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
  2057. tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
  2058. MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
  2059. tmp10 = tmp11 + tmp12 + tmp13 -
  2060. MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
  2061. MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
  2062. tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
  2063. - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
  2064. tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
  2065. + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
  2066. tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
  2067. + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
  2068. dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
  2069. dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
  2070. dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
  2071. dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
  2072. dataptr += DCTSIZE; /* advance pointer to next row */
  2073. }
  2074. /* Pass 2: process columns.
  2075. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2076. * by an overall factor of 8.
  2077. * We must also scale the output by 8/16 = 1/2.
  2078. */
  2079. dataptr = data;
  2080. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  2081. /* Even part per LL&M figure 1 --- note that published figure is faulty;
  2082. * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  2083. */
  2084. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
  2085. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
  2086. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
  2087. tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
  2088. tmp10 = tmp0 + tmp3;
  2089. tmp12 = tmp0 - tmp3;
  2090. tmp11 = tmp1 + tmp2;
  2091. tmp13 = tmp1 - tmp2;
  2092. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
  2093. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
  2094. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
  2095. tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
  2096. dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
  2097. dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
  2098. z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  2099. dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
  2100. CONST_BITS+PASS1_BITS+1);
  2101. dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
  2102. CONST_BITS+PASS1_BITS+1);
  2103. /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  2104. * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
  2105. * i0..i3 in the paper are tmp0..tmp3 here.
  2106. */
  2107. tmp10 = tmp0 + tmp3;
  2108. tmp11 = tmp1 + tmp2;
  2109. tmp12 = tmp0 + tmp2;
  2110. tmp13 = tmp1 + tmp3;
  2111. z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
  2112. tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
  2113. tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
  2114. tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
  2115. tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
  2116. tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
  2117. tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
  2118. tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
  2119. tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
  2120. tmp12 += z1;
  2121. tmp13 += z1;
  2122. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
  2123. CONST_BITS+PASS1_BITS+1);
  2124. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
  2125. CONST_BITS+PASS1_BITS+1);
  2126. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
  2127. CONST_BITS+PASS1_BITS+1);
  2128. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
  2129. CONST_BITS+PASS1_BITS+1);
  2130. dataptr++; /* advance pointer to next column */
  2131. }
  2132. }
  2133. /*
  2134. * Perform the forward DCT on a 14x7 sample block.
  2135. *
  2136. * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
  2137. */
  2138. GLOBAL(void)
  2139. jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2140. {
  2141. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  2142. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  2143. INT32 z1, z2, z3;
  2144. DCTELEM *dataptr;
  2145. JSAMPROW elemptr;
  2146. int ctr;
  2147. SHIFT_TEMPS
  2148. /* Zero bottom row of output coefficient block. */
  2149. MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
  2150. /* Pass 1: process rows. */
  2151. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2152. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2153. /* 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28). */
  2154. dataptr = data;
  2155. for (ctr = 0; ctr < 7; ctr++) {
  2156. elemptr = sample_data[ctr] + start_col;
  2157. /* Even part */
  2158. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
  2159. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
  2160. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
  2161. tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
  2162. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
  2163. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
  2164. tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
  2165. tmp10 = tmp0 + tmp6;
  2166. tmp14 = tmp0 - tmp6;
  2167. tmp11 = tmp1 + tmp5;
  2168. tmp15 = tmp1 - tmp5;
  2169. tmp12 = tmp2 + tmp4;
  2170. tmp16 = tmp2 - tmp4;
  2171. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
  2172. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
  2173. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
  2174. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
  2175. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
  2176. tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
  2177. tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
  2178. /* Apply unsigned->signed conversion */
  2179. dataptr[0] = (DCTELEM)
  2180. ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
  2181. tmp13 += tmp13;
  2182. dataptr[4] = (DCTELEM)
  2183. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
  2184. MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
  2185. MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
  2186. CONST_BITS-PASS1_BITS);
  2187. tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
  2188. dataptr[2] = (DCTELEM)
  2189. DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
  2190. + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
  2191. CONST_BITS-PASS1_BITS);
  2192. dataptr[6] = (DCTELEM)
  2193. DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
  2194. - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
  2195. CONST_BITS-PASS1_BITS);
  2196. /* Odd part */
  2197. tmp10 = tmp1 + tmp2;
  2198. tmp11 = tmp5 - tmp4;
  2199. dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
  2200. tmp3 <<= CONST_BITS;
  2201. tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
  2202. tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
  2203. tmp10 += tmp11 - tmp3;
  2204. tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
  2205. MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
  2206. dataptr[5] = (DCTELEM)
  2207. DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
  2208. + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
  2209. CONST_BITS-PASS1_BITS);
  2210. tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
  2211. MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
  2212. dataptr[3] = (DCTELEM)
  2213. DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
  2214. - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
  2215. CONST_BITS-PASS1_BITS);
  2216. dataptr[1] = (DCTELEM)
  2217. DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
  2218. MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
  2219. CONST_BITS-PASS1_BITS);
  2220. dataptr += DCTSIZE; /* advance pointer to next row */
  2221. }
  2222. /* Pass 2: process columns.
  2223. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2224. * by an overall factor of 8.
  2225. * We must also scale the output by (8/14)*(8/7) = 32/49, which we
  2226. * partially fold into the constant multipliers and final shifting:
  2227. * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
  2228. */
  2229. dataptr = data;
  2230. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  2231. /* Even part */
  2232. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
  2233. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
  2234. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
  2235. tmp3 = dataptr[DCTSIZE*3];
  2236. tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
  2237. tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
  2238. tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
  2239. z1 = tmp0 + tmp2;
  2240. dataptr[DCTSIZE*0] = (DCTELEM)
  2241. DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
  2242. CONST_BITS+PASS1_BITS+1);
  2243. tmp3 += tmp3;
  2244. z1 -= tmp3;
  2245. z1 -= tmp3;
  2246. z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
  2247. z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
  2248. z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
  2249. dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
  2250. z1 -= z2;
  2251. z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
  2252. dataptr[DCTSIZE*4] = (DCTELEM)
  2253. DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
  2254. CONST_BITS+PASS1_BITS+1);
  2255. dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
  2256. /* Odd part */
  2257. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
  2258. tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
  2259. tmp0 = tmp1 - tmp2;
  2260. tmp1 += tmp2;
  2261. tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
  2262. tmp1 += tmp2;
  2263. tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
  2264. tmp0 += tmp3;
  2265. tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
  2266. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
  2267. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
  2268. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
  2269. dataptr++; /* advance pointer to next column */
  2270. }
  2271. }
  2272. /*
  2273. * Perform the forward DCT on a 12x6 sample block.
  2274. *
  2275. * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
  2276. */
  2277. GLOBAL(void)
  2278. jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2279. {
  2280. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  2281. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  2282. DCTELEM *dataptr;
  2283. JSAMPROW elemptr;
  2284. int ctr;
  2285. SHIFT_TEMPS
  2286. /* Zero 2 bottom rows of output coefficient block. */
  2287. MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
  2288. /* Pass 1: process rows. */
  2289. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2290. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2291. /* 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24). */
  2292. dataptr = data;
  2293. for (ctr = 0; ctr < 6; ctr++) {
  2294. elemptr = sample_data[ctr] + start_col;
  2295. /* Even part */
  2296. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
  2297. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
  2298. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
  2299. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
  2300. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
  2301. tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
  2302. tmp10 = tmp0 + tmp5;
  2303. tmp13 = tmp0 - tmp5;
  2304. tmp11 = tmp1 + tmp4;
  2305. tmp14 = tmp1 - tmp4;
  2306. tmp12 = tmp2 + tmp3;
  2307. tmp15 = tmp2 - tmp3;
  2308. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
  2309. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
  2310. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
  2311. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
  2312. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
  2313. tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
  2314. /* Apply unsigned->signed conversion */
  2315. dataptr[0] = (DCTELEM)
  2316. ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
  2317. dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
  2318. dataptr[4] = (DCTELEM)
  2319. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
  2320. CONST_BITS-PASS1_BITS);
  2321. dataptr[2] = (DCTELEM)
  2322. DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
  2323. CONST_BITS-PASS1_BITS);
  2324. /* Odd part */
  2325. tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
  2326. tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
  2327. tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
  2328. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
  2329. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
  2330. tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
  2331. + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
  2332. tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
  2333. tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
  2334. + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
  2335. tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
  2336. - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
  2337. tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
  2338. - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
  2339. dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
  2340. dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
  2341. dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
  2342. dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
  2343. dataptr += DCTSIZE; /* advance pointer to next row */
  2344. }
  2345. /* Pass 2: process columns.
  2346. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2347. * by an overall factor of 8.
  2348. * We must also scale the output by (8/12)*(8/6) = 8/9, which we
  2349. * partially fold into the constant multipliers and final shifting:
  2350. * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
  2351. */
  2352. dataptr = data;
  2353. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  2354. /* Even part */
  2355. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
  2356. tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
  2357. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
  2358. tmp10 = tmp0 + tmp2;
  2359. tmp12 = tmp0 - tmp2;
  2360. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
  2361. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
  2362. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
  2363. dataptr[DCTSIZE*0] = (DCTELEM)
  2364. DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
  2365. CONST_BITS+PASS1_BITS+1);
  2366. dataptr[DCTSIZE*2] = (DCTELEM)
  2367. DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
  2368. CONST_BITS+PASS1_BITS+1);
  2369. dataptr[DCTSIZE*4] = (DCTELEM)
  2370. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
  2371. CONST_BITS+PASS1_BITS+1);
  2372. /* Odd part */
  2373. tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
  2374. dataptr[DCTSIZE*1] = (DCTELEM)
  2375. DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
  2376. CONST_BITS+PASS1_BITS+1);
  2377. dataptr[DCTSIZE*3] = (DCTELEM)
  2378. DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
  2379. CONST_BITS+PASS1_BITS+1);
  2380. dataptr[DCTSIZE*5] = (DCTELEM)
  2381. DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
  2382. CONST_BITS+PASS1_BITS+1);
  2383. dataptr++; /* advance pointer to next column */
  2384. }
  2385. }
  2386. /*
  2387. * Perform the forward DCT on a 10x5 sample block.
  2388. *
  2389. * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
  2390. */
  2391. GLOBAL(void)
  2392. jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2393. {
  2394. INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  2395. INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  2396. DCTELEM *dataptr;
  2397. JSAMPROW elemptr;
  2398. int ctr;
  2399. SHIFT_TEMPS
  2400. /* Zero 3 bottom rows of output coefficient block. */
  2401. MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
  2402. /* Pass 1: process rows. */
  2403. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2404. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2405. /* 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20). */
  2406. dataptr = data;
  2407. for (ctr = 0; ctr < 5; ctr++) {
  2408. elemptr = sample_data[ctr] + start_col;
  2409. /* Even part */
  2410. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
  2411. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
  2412. tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
  2413. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
  2414. tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
  2415. tmp10 = tmp0 + tmp4;
  2416. tmp13 = tmp0 - tmp4;
  2417. tmp11 = tmp1 + tmp3;
  2418. tmp14 = tmp1 - tmp3;
  2419. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
  2420. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
  2421. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
  2422. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
  2423. tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
  2424. /* Apply unsigned->signed conversion */
  2425. dataptr[0] = (DCTELEM)
  2426. ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
  2427. tmp12 += tmp12;
  2428. dataptr[4] = (DCTELEM)
  2429. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
  2430. MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
  2431. CONST_BITS-PASS1_BITS);
  2432. tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
  2433. dataptr[2] = (DCTELEM)
  2434. DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
  2435. CONST_BITS-PASS1_BITS);
  2436. dataptr[6] = (DCTELEM)
  2437. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
  2438. CONST_BITS-PASS1_BITS);
  2439. /* Odd part */
  2440. tmp10 = tmp0 + tmp4;
  2441. tmp11 = tmp1 - tmp3;
  2442. dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
  2443. tmp2 <<= CONST_BITS;
  2444. dataptr[1] = (DCTELEM)
  2445. DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
  2446. MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
  2447. MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
  2448. MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
  2449. CONST_BITS-PASS1_BITS);
  2450. tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
  2451. MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
  2452. tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
  2453. (tmp11 << (CONST_BITS - 1)) - tmp2;
  2454. dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
  2455. dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
  2456. dataptr += DCTSIZE; /* advance pointer to next row */
  2457. }
  2458. /* Pass 2: process columns.
  2459. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2460. * by an overall factor of 8.
  2461. * We must also scale the output by (8/10)*(8/5) = 32/25, which we
  2462. * fold into the constant multipliers:
  2463. * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
  2464. */
  2465. dataptr = data;
  2466. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  2467. /* Even part */
  2468. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
  2469. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
  2470. tmp2 = dataptr[DCTSIZE*2];
  2471. tmp10 = tmp0 + tmp1;
  2472. tmp11 = tmp0 - tmp1;
  2473. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
  2474. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
  2475. dataptr[DCTSIZE*0] = (DCTELEM)
  2476. DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
  2477. CONST_BITS+PASS1_BITS);
  2478. tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
  2479. tmp10 -= tmp2 << 2;
  2480. tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
  2481. dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
  2482. dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
  2483. /* Odd part */
  2484. tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
  2485. dataptr[DCTSIZE*1] = (DCTELEM)
  2486. DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
  2487. CONST_BITS+PASS1_BITS);
  2488. dataptr[DCTSIZE*3] = (DCTELEM)
  2489. DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
  2490. CONST_BITS+PASS1_BITS);
  2491. dataptr++; /* advance pointer to next column */
  2492. }
  2493. }
  2494. /*
  2495. * Perform the forward DCT on an 8x4 sample block.
  2496. *
  2497. * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
  2498. */
  2499. GLOBAL(void)
  2500. jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2501. {
  2502. INT32 tmp0, tmp1, tmp2, tmp3;
  2503. INT32 tmp10, tmp11, tmp12, tmp13;
  2504. INT32 z1;
  2505. DCTELEM *dataptr;
  2506. JSAMPROW elemptr;
  2507. int ctr;
  2508. SHIFT_TEMPS
  2509. /* Zero 4 bottom rows of output coefficient block. */
  2510. MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
  2511. /* Pass 1: process rows. */
  2512. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2513. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2514. /* We must also scale the output by 8/4 = 2, which we add here. */
  2515. dataptr = data;
  2516. for (ctr = 0; ctr < 4; ctr++) {
  2517. elemptr = sample_data[ctr] + start_col;
  2518. /* Even part per LL&M figure 1 --- note that published figure is faulty;
  2519. * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  2520. */
  2521. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
  2522. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
  2523. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
  2524. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
  2525. tmp10 = tmp0 + tmp3;
  2526. tmp12 = tmp0 - tmp3;
  2527. tmp11 = tmp1 + tmp2;
  2528. tmp13 = tmp1 - tmp2;
  2529. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
  2530. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
  2531. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
  2532. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
  2533. /* Apply unsigned->signed conversion */
  2534. dataptr[0] = (DCTELEM)
  2535. ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
  2536. dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
  2537. z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  2538. /* Add fudge factor here for final descale. */
  2539. z1 += ONE << (CONST_BITS-PASS1_BITS-2);
  2540. dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
  2541. CONST_BITS-PASS1_BITS-1);
  2542. dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
  2543. CONST_BITS-PASS1_BITS-1);
  2544. /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  2545. * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
  2546. * i0..i3 in the paper are tmp0..tmp3 here.
  2547. */
  2548. tmp10 = tmp0 + tmp3;
  2549. tmp11 = tmp1 + tmp2;
  2550. tmp12 = tmp0 + tmp2;
  2551. tmp13 = tmp1 + tmp3;
  2552. z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
  2553. /* Add fudge factor here for final descale. */
  2554. z1 += ONE << (CONST_BITS-PASS1_BITS-2);
  2555. tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
  2556. tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
  2557. tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
  2558. tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
  2559. tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
  2560. tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
  2561. tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
  2562. tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
  2563. tmp12 += z1;
  2564. tmp13 += z1;
  2565. dataptr[1] = (DCTELEM)
  2566. RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
  2567. dataptr[3] = (DCTELEM)
  2568. RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
  2569. dataptr[5] = (DCTELEM)
  2570. RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
  2571. dataptr[7] = (DCTELEM)
  2572. RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
  2573. dataptr += DCTSIZE; /* advance pointer to next row */
  2574. }
  2575. /* Pass 2: process columns.
  2576. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2577. * by an overall factor of 8.
  2578. * 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
  2579. */
  2580. dataptr = data;
  2581. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  2582. /* Even part */
  2583. /* Add fudge factor here for final descale. */
  2584. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
  2585. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
  2586. tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
  2587. tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
  2588. dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
  2589. dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
  2590. /* Odd part */
  2591. tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
  2592. /* Add fudge factor here for final descale. */
  2593. tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
  2594. dataptr[DCTSIZE*1] = (DCTELEM)
  2595. RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
  2596. CONST_BITS+PASS1_BITS);
  2597. dataptr[DCTSIZE*3] = (DCTELEM)
  2598. RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
  2599. CONST_BITS+PASS1_BITS);
  2600. dataptr++; /* advance pointer to next column */
  2601. }
  2602. }
  2603. /*
  2604. * Perform the forward DCT on a 6x3 sample block.
  2605. *
  2606. * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
  2607. */
  2608. GLOBAL(void)
  2609. jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2610. {
  2611. INT32 tmp0, tmp1, tmp2;
  2612. INT32 tmp10, tmp11, tmp12;
  2613. DCTELEM *dataptr;
  2614. JSAMPROW elemptr;
  2615. int ctr;
  2616. SHIFT_TEMPS
  2617. /* Pre-zero output coefficient block. */
  2618. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  2619. /* Pass 1: process rows. */
  2620. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2621. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2622. /* We scale the results further by 2 as part of output adaption */
  2623. /* scaling for different DCT size. */
  2624. /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
  2625. dataptr = data;
  2626. for (ctr = 0; ctr < 3; ctr++) {
  2627. elemptr = sample_data[ctr] + start_col;
  2628. /* Even part */
  2629. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
  2630. tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
  2631. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
  2632. tmp10 = tmp0 + tmp2;
  2633. tmp12 = tmp0 - tmp2;
  2634. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
  2635. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
  2636. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
  2637. /* Apply unsigned->signed conversion */
  2638. dataptr[0] = (DCTELEM)
  2639. ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
  2640. dataptr[2] = (DCTELEM)
  2641. DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
  2642. CONST_BITS-PASS1_BITS-1);
  2643. dataptr[4] = (DCTELEM)
  2644. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
  2645. CONST_BITS-PASS1_BITS-1);
  2646. /* Odd part */
  2647. tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
  2648. CONST_BITS-PASS1_BITS-1);
  2649. dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
  2650. dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
  2651. dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
  2652. dataptr += DCTSIZE; /* advance pointer to next row */
  2653. }
  2654. /* Pass 2: process columns.
  2655. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2656. * by an overall factor of 8.
  2657. * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
  2658. * fold into the constant multipliers (other part was done in pass 1):
  2659. * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
  2660. */
  2661. dataptr = data;
  2662. for (ctr = 0; ctr < 6; ctr++) {
  2663. /* Even part */
  2664. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
  2665. tmp1 = dataptr[DCTSIZE*1];
  2666. tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
  2667. dataptr[DCTSIZE*0] = (DCTELEM)
  2668. DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
  2669. CONST_BITS+PASS1_BITS);
  2670. dataptr[DCTSIZE*2] = (DCTELEM)
  2671. DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
  2672. CONST_BITS+PASS1_BITS);
  2673. /* Odd part */
  2674. dataptr[DCTSIZE*1] = (DCTELEM)
  2675. DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
  2676. CONST_BITS+PASS1_BITS);
  2677. dataptr++; /* advance pointer to next column */
  2678. }
  2679. }
  2680. /*
  2681. * Perform the forward DCT on a 4x2 sample block.
  2682. *
  2683. * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
  2684. */
  2685. GLOBAL(void)
  2686. jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2687. {
  2688. INT32 tmp0, tmp1;
  2689. INT32 tmp10, tmp11;
  2690. DCTELEM *dataptr;
  2691. JSAMPROW elemptr;
  2692. int ctr;
  2693. SHIFT_TEMPS
  2694. /* Pre-zero output coefficient block. */
  2695. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  2696. /* Pass 1: process rows. */
  2697. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2698. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2699. /* We must also scale the output by (8/4)*(8/2) = 2**3, which we add here. */
  2700. /* 4-point FDCT kernel, */
  2701. /* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT]. */
  2702. dataptr = data;
  2703. for (ctr = 0; ctr < 2; ctr++) {
  2704. elemptr = sample_data[ctr] + start_col;
  2705. /* Even part */
  2706. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
  2707. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
  2708. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
  2709. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
  2710. /* Apply unsigned->signed conversion */
  2711. dataptr[0] = (DCTELEM)
  2712. ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
  2713. dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
  2714. /* Odd part */
  2715. tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
  2716. /* Add fudge factor here for final descale. */
  2717. tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
  2718. dataptr[1] = (DCTELEM)
  2719. RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
  2720. CONST_BITS-PASS1_BITS-3);
  2721. dataptr[3] = (DCTELEM)
  2722. RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
  2723. CONST_BITS-PASS1_BITS-3);
  2724. dataptr += DCTSIZE; /* advance pointer to next row */
  2725. }
  2726. /* Pass 2: process columns.
  2727. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2728. * by an overall factor of 8.
  2729. */
  2730. dataptr = data;
  2731. for (ctr = 0; ctr < 4; ctr++) {
  2732. /* Even part */
  2733. /* Add fudge factor here for final descale. */
  2734. tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
  2735. tmp1 = dataptr[DCTSIZE*1];
  2736. dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
  2737. /* Odd part */
  2738. dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
  2739. dataptr++; /* advance pointer to next column */
  2740. }
  2741. }
  2742. /*
  2743. * Perform the forward DCT on a 2x1 sample block.
  2744. *
  2745. * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
  2746. */
  2747. GLOBAL(void)
  2748. jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2749. {
  2750. INT32 tmp0, tmp1;
  2751. JSAMPROW elemptr;
  2752. /* Pre-zero output coefficient block. */
  2753. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  2754. elemptr = sample_data[0] + start_col;
  2755. tmp0 = GETJSAMPLE(elemptr[0]);
  2756. tmp1 = GETJSAMPLE(elemptr[1]);
  2757. /* We leave the results scaled up by an overall factor of 8.
  2758. * We must also scale the output by (8/2)*(8/1) = 2**5.
  2759. */
  2760. /* Even part */
  2761. /* Apply unsigned->signed conversion */
  2762. data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
  2763. /* Odd part */
  2764. data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
  2765. }
  2766. /*
  2767. * Perform the forward DCT on an 8x16 sample block.
  2768. *
  2769. * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
  2770. */
  2771. GLOBAL(void)
  2772. jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2773. {
  2774. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  2775. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
  2776. INT32 z1;
  2777. DCTELEM workspace[DCTSIZE2];
  2778. DCTELEM *dataptr;
  2779. DCTELEM *wsptr;
  2780. JSAMPROW elemptr;
  2781. int ctr;
  2782. SHIFT_TEMPS
  2783. /* Pass 1: process rows. */
  2784. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2785. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2786. dataptr = data;
  2787. ctr = 0;
  2788. for (;;) {
  2789. elemptr = sample_data[ctr] + start_col;
  2790. /* Even part per LL&M figure 1 --- note that published figure is faulty;
  2791. * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  2792. */
  2793. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
  2794. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
  2795. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
  2796. tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
  2797. tmp10 = tmp0 + tmp3;
  2798. tmp12 = tmp0 - tmp3;
  2799. tmp11 = tmp1 + tmp2;
  2800. tmp13 = tmp1 - tmp2;
  2801. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
  2802. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
  2803. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
  2804. tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
  2805. /* Apply unsigned->signed conversion */
  2806. dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
  2807. dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
  2808. z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  2809. dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
  2810. CONST_BITS-PASS1_BITS);
  2811. dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
  2812. CONST_BITS-PASS1_BITS);
  2813. /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  2814. * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
  2815. * i0..i3 in the paper are tmp0..tmp3 here.
  2816. */
  2817. tmp10 = tmp0 + tmp3;
  2818. tmp11 = tmp1 + tmp2;
  2819. tmp12 = tmp0 + tmp2;
  2820. tmp13 = tmp1 + tmp3;
  2821. z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
  2822. tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
  2823. tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
  2824. tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
  2825. tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
  2826. tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
  2827. tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
  2828. tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
  2829. tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
  2830. tmp12 += z1;
  2831. tmp13 += z1;
  2832. dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
  2833. dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
  2834. dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
  2835. dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
  2836. ctr++;
  2837. if (ctr != DCTSIZE) {
  2838. if (ctr == DCTSIZE * 2)
  2839. break; /* Done. */
  2840. dataptr += DCTSIZE; /* advance pointer to next row */
  2841. } else
  2842. dataptr = workspace; /* switch pointer to extended workspace */
  2843. }
  2844. /* Pass 2: process columns.
  2845. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2846. * by an overall factor of 8.
  2847. * We must also scale the output by 8/16 = 1/2.
  2848. * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
  2849. */
  2850. dataptr = data;
  2851. wsptr = workspace;
  2852. for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  2853. /* Even part */
  2854. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
  2855. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
  2856. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
  2857. tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
  2858. tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
  2859. tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
  2860. tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
  2861. tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
  2862. tmp10 = tmp0 + tmp7;
  2863. tmp14 = tmp0 - tmp7;
  2864. tmp11 = tmp1 + tmp6;
  2865. tmp15 = tmp1 - tmp6;
  2866. tmp12 = tmp2 + tmp5;
  2867. tmp16 = tmp2 - tmp5;
  2868. tmp13 = tmp3 + tmp4;
  2869. tmp17 = tmp3 - tmp4;
  2870. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
  2871. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
  2872. tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
  2873. tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
  2874. tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
  2875. tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
  2876. tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
  2877. tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
  2878. dataptr[DCTSIZE*0] = (DCTELEM)
  2879. DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
  2880. dataptr[DCTSIZE*4] = (DCTELEM)
  2881. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
  2882. MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
  2883. CONST_BITS+PASS1_BITS+1);
  2884. tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
  2885. MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
  2886. dataptr[DCTSIZE*2] = (DCTELEM)
  2887. DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
  2888. + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
  2889. CONST_BITS+PASS1_BITS+1);
  2890. dataptr[DCTSIZE*6] = (DCTELEM)
  2891. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
  2892. - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
  2893. CONST_BITS+PASS1_BITS+1);
  2894. /* Odd part */
  2895. tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
  2896. MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
  2897. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
  2898. MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
  2899. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
  2900. MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
  2901. tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
  2902. MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
  2903. tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
  2904. MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
  2905. tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
  2906. MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
  2907. tmp10 = tmp11 + tmp12 + tmp13 -
  2908. MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
  2909. MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
  2910. tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
  2911. - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
  2912. tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
  2913. + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
  2914. tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
  2915. + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
  2916. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
  2917. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
  2918. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
  2919. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
  2920. dataptr++; /* advance pointer to next column */
  2921. wsptr++; /* advance pointer to next column */
  2922. }
  2923. }
  2924. /*
  2925. * Perform the forward DCT on a 7x14 sample block.
  2926. *
  2927. * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
  2928. */
  2929. GLOBAL(void)
  2930. jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  2931. {
  2932. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  2933. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
  2934. INT32 z1, z2, z3;
  2935. DCTELEM workspace[8*6];
  2936. DCTELEM *dataptr;
  2937. DCTELEM *wsptr;
  2938. JSAMPROW elemptr;
  2939. int ctr;
  2940. SHIFT_TEMPS
  2941. /* Pre-zero output coefficient block. */
  2942. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  2943. /* Pass 1: process rows. */
  2944. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  2945. /* furthermore, we scale the results by 2**PASS1_BITS. */
  2946. /* 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14). */
  2947. dataptr = data;
  2948. ctr = 0;
  2949. for (;;) {
  2950. elemptr = sample_data[ctr] + start_col;
  2951. /* Even part */
  2952. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
  2953. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
  2954. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
  2955. tmp3 = GETJSAMPLE(elemptr[3]);
  2956. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
  2957. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
  2958. tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
  2959. z1 = tmp0 + tmp2;
  2960. /* Apply unsigned->signed conversion */
  2961. dataptr[0] = (DCTELEM)
  2962. ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
  2963. tmp3 += tmp3;
  2964. z1 -= tmp3;
  2965. z1 -= tmp3;
  2966. z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
  2967. z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
  2968. z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
  2969. dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
  2970. z1 -= z2;
  2971. z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
  2972. dataptr[4] = (DCTELEM)
  2973. DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
  2974. CONST_BITS-PASS1_BITS);
  2975. dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
  2976. /* Odd part */
  2977. tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
  2978. tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
  2979. tmp0 = tmp1 - tmp2;
  2980. tmp1 += tmp2;
  2981. tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
  2982. tmp1 += tmp2;
  2983. tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
  2984. tmp0 += tmp3;
  2985. tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
  2986. dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
  2987. dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
  2988. dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
  2989. ctr++;
  2990. if (ctr != DCTSIZE) {
  2991. if (ctr == 14)
  2992. break; /* Done. */
  2993. dataptr += DCTSIZE; /* advance pointer to next row */
  2994. } else
  2995. dataptr = workspace; /* switch pointer to extended workspace */
  2996. }
  2997. /* Pass 2: process columns.
  2998. * We remove the PASS1_BITS scaling, but leave the results scaled up
  2999. * by an overall factor of 8.
  3000. * We must also scale the output by (8/7)*(8/14) = 32/49, which we
  3001. * fold into the constant multipliers:
  3002. * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
  3003. */
  3004. dataptr = data;
  3005. wsptr = workspace;
  3006. for (ctr = 0; ctr < 7; ctr++) {
  3007. /* Even part */
  3008. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
  3009. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
  3010. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
  3011. tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
  3012. tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
  3013. tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
  3014. tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
  3015. tmp10 = tmp0 + tmp6;
  3016. tmp14 = tmp0 - tmp6;
  3017. tmp11 = tmp1 + tmp5;
  3018. tmp15 = tmp1 - tmp5;
  3019. tmp12 = tmp2 + tmp4;
  3020. tmp16 = tmp2 - tmp4;
  3021. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
  3022. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
  3023. tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
  3024. tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
  3025. tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
  3026. tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
  3027. tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
  3028. dataptr[DCTSIZE*0] = (DCTELEM)
  3029. DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
  3030. FIX(0.653061224)), /* 32/49 */
  3031. CONST_BITS+PASS1_BITS);
  3032. tmp13 += tmp13;
  3033. dataptr[DCTSIZE*4] = (DCTELEM)
  3034. DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
  3035. MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
  3036. MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
  3037. CONST_BITS+PASS1_BITS);
  3038. tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
  3039. dataptr[DCTSIZE*2] = (DCTELEM)
  3040. DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
  3041. + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
  3042. CONST_BITS+PASS1_BITS);
  3043. dataptr[DCTSIZE*6] = (DCTELEM)
  3044. DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
  3045. - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
  3046. CONST_BITS+PASS1_BITS);
  3047. /* Odd part */
  3048. tmp10 = tmp1 + tmp2;
  3049. tmp11 = tmp5 - tmp4;
  3050. dataptr[DCTSIZE*7] = (DCTELEM)
  3051. DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
  3052. FIX(0.653061224)), /* 32/49 */
  3053. CONST_BITS+PASS1_BITS);
  3054. tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
  3055. tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
  3056. tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
  3057. tmp10 += tmp11 - tmp3;
  3058. tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
  3059. MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
  3060. dataptr[DCTSIZE*5] = (DCTELEM)
  3061. DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
  3062. + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
  3063. CONST_BITS+PASS1_BITS);
  3064. tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
  3065. MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
  3066. dataptr[DCTSIZE*3] = (DCTELEM)
  3067. DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
  3068. - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
  3069. CONST_BITS+PASS1_BITS);
  3070. dataptr[DCTSIZE*1] = (DCTELEM)
  3071. DESCALE(tmp11 + tmp12 + tmp3
  3072. - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
  3073. - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
  3074. CONST_BITS+PASS1_BITS);
  3075. dataptr++; /* advance pointer to next column */
  3076. wsptr++; /* advance pointer to next column */
  3077. }
  3078. }
  3079. /*
  3080. * Perform the forward DCT on a 6x12 sample block.
  3081. *
  3082. * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
  3083. */
  3084. GLOBAL(void)
  3085. jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  3086. {
  3087. INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  3088. INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  3089. DCTELEM workspace[8*4];
  3090. DCTELEM *dataptr;
  3091. DCTELEM *wsptr;
  3092. JSAMPROW elemptr;
  3093. int ctr;
  3094. SHIFT_TEMPS
  3095. /* Pre-zero output coefficient block. */
  3096. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  3097. /* Pass 1: process rows. */
  3098. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  3099. /* furthermore, we scale the results by 2**PASS1_BITS. */
  3100. /* 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12). */
  3101. dataptr = data;
  3102. ctr = 0;
  3103. for (;;) {
  3104. elemptr = sample_data[ctr] + start_col;
  3105. /* Even part */
  3106. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
  3107. tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
  3108. tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
  3109. tmp10 = tmp0 + tmp2;
  3110. tmp12 = tmp0 - tmp2;
  3111. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
  3112. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
  3113. tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
  3114. /* Apply unsigned->signed conversion */
  3115. dataptr[0] = (DCTELEM)
  3116. ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
  3117. dataptr[2] = (DCTELEM)
  3118. DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
  3119. CONST_BITS-PASS1_BITS);
  3120. dataptr[4] = (DCTELEM)
  3121. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
  3122. CONST_BITS-PASS1_BITS);
  3123. /* Odd part */
  3124. tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
  3125. CONST_BITS-PASS1_BITS);
  3126. dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
  3127. dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
  3128. dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
  3129. ctr++;
  3130. if (ctr != DCTSIZE) {
  3131. if (ctr == 12)
  3132. break; /* Done. */
  3133. dataptr += DCTSIZE; /* advance pointer to next row */
  3134. } else
  3135. dataptr = workspace; /* switch pointer to extended workspace */
  3136. }
  3137. /* Pass 2: process columns.
  3138. * We remove the PASS1_BITS scaling, but leave the results scaled up
  3139. * by an overall factor of 8.
  3140. * We must also scale the output by (8/6)*(8/12) = 8/9, which we
  3141. * fold into the constant multipliers:
  3142. * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
  3143. */
  3144. dataptr = data;
  3145. wsptr = workspace;
  3146. for (ctr = 0; ctr < 6; ctr++) {
  3147. /* Even part */
  3148. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
  3149. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
  3150. tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
  3151. tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
  3152. tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
  3153. tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
  3154. tmp10 = tmp0 + tmp5;
  3155. tmp13 = tmp0 - tmp5;
  3156. tmp11 = tmp1 + tmp4;
  3157. tmp14 = tmp1 - tmp4;
  3158. tmp12 = tmp2 + tmp3;
  3159. tmp15 = tmp2 - tmp3;
  3160. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
  3161. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
  3162. tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
  3163. tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
  3164. tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
  3165. tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
  3166. dataptr[DCTSIZE*0] = (DCTELEM)
  3167. DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
  3168. CONST_BITS+PASS1_BITS);
  3169. dataptr[DCTSIZE*6] = (DCTELEM)
  3170. DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
  3171. CONST_BITS+PASS1_BITS);
  3172. dataptr[DCTSIZE*4] = (DCTELEM)
  3173. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
  3174. CONST_BITS+PASS1_BITS);
  3175. dataptr[DCTSIZE*2] = (DCTELEM)
  3176. DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
  3177. MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
  3178. CONST_BITS+PASS1_BITS);
  3179. /* Odd part */
  3180. tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
  3181. tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
  3182. tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
  3183. tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
  3184. tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
  3185. tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
  3186. + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
  3187. tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
  3188. tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
  3189. + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
  3190. tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
  3191. - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
  3192. tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
  3193. - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
  3194. dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
  3195. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
  3196. dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
  3197. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
  3198. dataptr++; /* advance pointer to next column */
  3199. wsptr++; /* advance pointer to next column */
  3200. }
  3201. }
  3202. /*
  3203. * Perform the forward DCT on a 5x10 sample block.
  3204. *
  3205. * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
  3206. */
  3207. GLOBAL(void)
  3208. jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  3209. {
  3210. INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
  3211. INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  3212. DCTELEM workspace[8*2];
  3213. DCTELEM *dataptr;
  3214. DCTELEM *wsptr;
  3215. JSAMPROW elemptr;
  3216. int ctr;
  3217. SHIFT_TEMPS
  3218. /* Pre-zero output coefficient block. */
  3219. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  3220. /* Pass 1: process rows. */
  3221. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  3222. /* furthermore, we scale the results by 2**PASS1_BITS. */
  3223. /* 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10). */
  3224. dataptr = data;
  3225. ctr = 0;
  3226. for (;;) {
  3227. elemptr = sample_data[ctr] + start_col;
  3228. /* Even part */
  3229. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
  3230. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
  3231. tmp2 = GETJSAMPLE(elemptr[2]);
  3232. tmp10 = tmp0 + tmp1;
  3233. tmp11 = tmp0 - tmp1;
  3234. tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
  3235. tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
  3236. /* Apply unsigned->signed conversion */
  3237. dataptr[0] = (DCTELEM)
  3238. ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
  3239. tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
  3240. tmp10 -= tmp2 << 2;
  3241. tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
  3242. dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
  3243. dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
  3244. /* Odd part */
  3245. tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
  3246. dataptr[1] = (DCTELEM)
  3247. DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
  3248. CONST_BITS-PASS1_BITS);
  3249. dataptr[3] = (DCTELEM)
  3250. DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
  3251. CONST_BITS-PASS1_BITS);
  3252. ctr++;
  3253. if (ctr != DCTSIZE) {
  3254. if (ctr == 10)
  3255. break; /* Done. */
  3256. dataptr += DCTSIZE; /* advance pointer to next row */
  3257. } else
  3258. dataptr = workspace; /* switch pointer to extended workspace */
  3259. }
  3260. /* Pass 2: process columns.
  3261. * We remove the PASS1_BITS scaling, but leave the results scaled up
  3262. * by an overall factor of 8.
  3263. * We must also scale the output by (8/5)*(8/10) = 32/25, which we
  3264. * fold into the constant multipliers:
  3265. * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
  3266. */
  3267. dataptr = data;
  3268. wsptr = workspace;
  3269. for (ctr = 0; ctr < 5; ctr++) {
  3270. /* Even part */
  3271. tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
  3272. tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
  3273. tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
  3274. tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
  3275. tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
  3276. tmp10 = tmp0 + tmp4;
  3277. tmp13 = tmp0 - tmp4;
  3278. tmp11 = tmp1 + tmp3;
  3279. tmp14 = tmp1 - tmp3;
  3280. tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
  3281. tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
  3282. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
  3283. tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
  3284. tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
  3285. dataptr[DCTSIZE*0] = (DCTELEM)
  3286. DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
  3287. CONST_BITS+PASS1_BITS);
  3288. tmp12 += tmp12;
  3289. dataptr[DCTSIZE*4] = (DCTELEM)
  3290. DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
  3291. MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
  3292. CONST_BITS+PASS1_BITS);
  3293. tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
  3294. dataptr[DCTSIZE*2] = (DCTELEM)
  3295. DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
  3296. CONST_BITS+PASS1_BITS);
  3297. dataptr[DCTSIZE*6] = (DCTELEM)
  3298. DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
  3299. CONST_BITS+PASS1_BITS);
  3300. /* Odd part */
  3301. tmp10 = tmp0 + tmp4;
  3302. tmp11 = tmp1 - tmp3;
  3303. dataptr[DCTSIZE*5] = (DCTELEM)
  3304. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
  3305. CONST_BITS+PASS1_BITS);
  3306. tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
  3307. dataptr[DCTSIZE*1] = (DCTELEM)
  3308. DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
  3309. MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
  3310. MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
  3311. MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
  3312. CONST_BITS+PASS1_BITS);
  3313. tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
  3314. MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
  3315. tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
  3316. MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
  3317. dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
  3318. dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
  3319. dataptr++; /* advance pointer to next column */
  3320. wsptr++; /* advance pointer to next column */
  3321. }
  3322. }
  3323. /*
  3324. * Perform the forward DCT on a 4x8 sample block.
  3325. *
  3326. * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
  3327. */
  3328. GLOBAL(void)
  3329. jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  3330. {
  3331. INT32 tmp0, tmp1, tmp2, tmp3;
  3332. INT32 tmp10, tmp11, tmp12, tmp13;
  3333. INT32 z1;
  3334. DCTELEM *dataptr;
  3335. JSAMPROW elemptr;
  3336. int ctr;
  3337. SHIFT_TEMPS
  3338. /* Pre-zero output coefficient block. */
  3339. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  3340. /* Pass 1: process rows. */
  3341. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  3342. /* furthermore, we scale the results by 2**PASS1_BITS. */
  3343. /* We must also scale the output by 8/4 = 2, which we add here. */
  3344. /* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16). */
  3345. dataptr = data;
  3346. for (ctr = 0; ctr < DCTSIZE; ctr++) {
  3347. elemptr = sample_data[ctr] + start_col;
  3348. /* Even part */
  3349. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
  3350. tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
  3351. tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
  3352. tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
  3353. /* Apply unsigned->signed conversion */
  3354. dataptr[0] = (DCTELEM)
  3355. ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
  3356. dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
  3357. /* Odd part */
  3358. tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
  3359. /* Add fudge factor here for final descale. */
  3360. tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
  3361. dataptr[1] = (DCTELEM)
  3362. RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
  3363. CONST_BITS-PASS1_BITS-1);
  3364. dataptr[3] = (DCTELEM)
  3365. RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
  3366. CONST_BITS-PASS1_BITS-1);
  3367. dataptr += DCTSIZE; /* advance pointer to next row */
  3368. }
  3369. /* Pass 2: process columns.
  3370. * We remove the PASS1_BITS scaling, but leave the results scaled up
  3371. * by an overall factor of 8.
  3372. */
  3373. dataptr = data;
  3374. for (ctr = 0; ctr < 4; ctr++) {
  3375. /* Even part per LL&M figure 1 --- note that published figure is faulty;
  3376. * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
  3377. */
  3378. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
  3379. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
  3380. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
  3381. tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
  3382. /* Add fudge factor here for final descale. */
  3383. tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
  3384. tmp12 = tmp0 - tmp3;
  3385. tmp11 = tmp1 + tmp2;
  3386. tmp13 = tmp1 - tmp2;
  3387. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
  3388. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
  3389. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
  3390. tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
  3391. dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
  3392. dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
  3393. z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
  3394. /* Add fudge factor here for final descale. */
  3395. z1 += ONE << (CONST_BITS+PASS1_BITS-1);
  3396. dataptr[DCTSIZE*2] = (DCTELEM)
  3397. RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
  3398. dataptr[DCTSIZE*6] = (DCTELEM)
  3399. RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
  3400. /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
  3401. * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
  3402. * i0..i3 in the paper are tmp0..tmp3 here.
  3403. */
  3404. tmp10 = tmp0 + tmp3;
  3405. tmp11 = tmp1 + tmp2;
  3406. tmp12 = tmp0 + tmp2;
  3407. tmp13 = tmp1 + tmp3;
  3408. z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
  3409. /* Add fudge factor here for final descale. */
  3410. z1 += ONE << (CONST_BITS+PASS1_BITS-1);
  3411. tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
  3412. tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
  3413. tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
  3414. tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
  3415. tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
  3416. tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
  3417. tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
  3418. tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
  3419. tmp12 += z1;
  3420. tmp13 += z1;
  3421. dataptr[DCTSIZE*1] = (DCTELEM)
  3422. RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
  3423. dataptr[DCTSIZE*3] = (DCTELEM)
  3424. RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
  3425. dataptr[DCTSIZE*5] = (DCTELEM)
  3426. RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
  3427. dataptr[DCTSIZE*7] = (DCTELEM)
  3428. RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
  3429. dataptr++; /* advance pointer to next column */
  3430. }
  3431. }
  3432. /*
  3433. * Perform the forward DCT on a 3x6 sample block.
  3434. *
  3435. * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
  3436. */
  3437. GLOBAL(void)
  3438. jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  3439. {
  3440. INT32 tmp0, tmp1, tmp2;
  3441. INT32 tmp10, tmp11, tmp12;
  3442. DCTELEM *dataptr;
  3443. JSAMPROW elemptr;
  3444. int ctr;
  3445. SHIFT_TEMPS
  3446. /* Pre-zero output coefficient block. */
  3447. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  3448. /* Pass 1: process rows. */
  3449. /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  3450. /* furthermore, we scale the results by 2**PASS1_BITS. */
  3451. /* We scale the results further by 2 as part of output adaption */
  3452. /* scaling for different DCT size. */
  3453. /* 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6). */
  3454. dataptr = data;
  3455. for (ctr = 0; ctr < 6; ctr++) {
  3456. elemptr = sample_data[ctr] + start_col;
  3457. /* Even part */
  3458. tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
  3459. tmp1 = GETJSAMPLE(elemptr[1]);
  3460. tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
  3461. /* Apply unsigned->signed conversion */
  3462. dataptr[0] = (DCTELEM)
  3463. ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
  3464. dataptr[2] = (DCTELEM)
  3465. DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
  3466. CONST_BITS-PASS1_BITS-1);
  3467. /* Odd part */
  3468. dataptr[1] = (DCTELEM)
  3469. DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
  3470. CONST_BITS-PASS1_BITS-1);
  3471. dataptr += DCTSIZE; /* advance pointer to next row */
  3472. }
  3473. /* Pass 2: process columns.
  3474. * We remove the PASS1_BITS scaling, but leave the results scaled up
  3475. * by an overall factor of 8.
  3476. * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
  3477. * fold into the constant multipliers (other part was done in pass 1):
  3478. * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
  3479. */
  3480. dataptr = data;
  3481. for (ctr = 0; ctr < 3; ctr++) {
  3482. /* Even part */
  3483. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
  3484. tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
  3485. tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
  3486. tmp10 = tmp0 + tmp2;
  3487. tmp12 = tmp0 - tmp2;
  3488. tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
  3489. tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
  3490. tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
  3491. dataptr[DCTSIZE*0] = (DCTELEM)
  3492. DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
  3493. CONST_BITS+PASS1_BITS);
  3494. dataptr[DCTSIZE*2] = (DCTELEM)
  3495. DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
  3496. CONST_BITS+PASS1_BITS);
  3497. dataptr[DCTSIZE*4] = (DCTELEM)
  3498. DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
  3499. CONST_BITS+PASS1_BITS);
  3500. /* Odd part */
  3501. tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
  3502. dataptr[DCTSIZE*1] = (DCTELEM)
  3503. DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
  3504. CONST_BITS+PASS1_BITS);
  3505. dataptr[DCTSIZE*3] = (DCTELEM)
  3506. DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
  3507. CONST_BITS+PASS1_BITS);
  3508. dataptr[DCTSIZE*5] = (DCTELEM)
  3509. DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
  3510. CONST_BITS+PASS1_BITS);
  3511. dataptr++; /* advance pointer to next column */
  3512. }
  3513. }
  3514. /*
  3515. * Perform the forward DCT on a 2x4 sample block.
  3516. *
  3517. * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
  3518. */
  3519. GLOBAL(void)
  3520. jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  3521. {
  3522. INT32 tmp0, tmp1;
  3523. INT32 tmp10, tmp11;
  3524. DCTELEM *dataptr;
  3525. JSAMPROW elemptr;
  3526. int ctr;
  3527. SHIFT_TEMPS
  3528. /* Pre-zero output coefficient block. */
  3529. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  3530. /* Pass 1: process rows. */
  3531. /* Note results are scaled up by sqrt(8) compared to a true DCT. */
  3532. /* We must also scale the output by (8/2)*(8/4) = 2**3, which we add here. */
  3533. dataptr = data;
  3534. for (ctr = 0; ctr < 4; ctr++) {
  3535. elemptr = sample_data[ctr] + start_col;
  3536. /* Even part */
  3537. tmp0 = GETJSAMPLE(elemptr[0]);
  3538. tmp1 = GETJSAMPLE(elemptr[1]);
  3539. /* Apply unsigned->signed conversion */
  3540. dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
  3541. /* Odd part */
  3542. dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
  3543. dataptr += DCTSIZE; /* advance pointer to next row */
  3544. }
  3545. /* Pass 2: process columns.
  3546. * We leave the results scaled up by an overall factor of 8.
  3547. * 4-point FDCT kernel,
  3548. * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
  3549. */
  3550. dataptr = data;
  3551. for (ctr = 0; ctr < 2; ctr++) {
  3552. /* Even part */
  3553. tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
  3554. tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
  3555. tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
  3556. tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
  3557. dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
  3558. dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
  3559. /* Odd part */
  3560. tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
  3561. /* Add fudge factor here for final descale. */
  3562. tmp0 += ONE << (CONST_BITS-1);
  3563. dataptr[DCTSIZE*1] = (DCTELEM)
  3564. RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
  3565. CONST_BITS);
  3566. dataptr[DCTSIZE*3] = (DCTELEM)
  3567. RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
  3568. CONST_BITS);
  3569. dataptr++; /* advance pointer to next column */
  3570. }
  3571. }
  3572. /*
  3573. * Perform the forward DCT on a 1x2 sample block.
  3574. *
  3575. * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
  3576. */
  3577. GLOBAL(void)
  3578. jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
  3579. {
  3580. INT32 tmp0, tmp1;
  3581. /* Pre-zero output coefficient block. */
  3582. MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
  3583. tmp0 = GETJSAMPLE(sample_data[0][start_col]);
  3584. tmp1 = GETJSAMPLE(sample_data[1][start_col]);
  3585. /* We leave the results scaled up by an overall factor of 8.
  3586. * We must also scale the output by (8/1)*(8/2) = 2**5.
  3587. */
  3588. /* Even part */
  3589. /* Apply unsigned->signed conversion */
  3590. data[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
  3591. /* Odd part */
  3592. data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
  3593. }
  3594. #endif /* DCT_SCALING_SUPPORTED */
  3595. #endif /* DCT_ISLOW_SUPPORTED */