Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

509 lines
12 KiB

  1. #include "stdafx.h"
  2. #pragma hdrstop
  3. /***************************************************************************
  4. *
  5. * INTEL Corporation Proprietary Information
  6. *
  7. *
  8. * Copyright (c) 1996 Intel Corporation.
  9. * All rights reserved.
  10. *
  11. ***************************************************************************
  12. */
  13. /*
  14. * jfdctfst.c
  15. *
  16. * Copyright (C) 1994-1996, Thomas G. Lane.
  17. * This file is part of the Independent JPEG Group's software.
  18. * For conditions of distribution and use, see the accompanying README file.
  19. *
  20. * This file contains a fast, not so accurate integer implementation of the
  21. * forward DCT (Discrete Cosine Transform).
  22. *
  23. * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  24. * on each column. Direct algorithms are also available, but they are
  25. * much more complex and seem not to be any faster when reduced to code.
  26. *
  27. * This implementation is based on Arai, Agui, and Nakajima's algorithm for
  28. * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
  29. * Japanese, but the algorithm is described in the Pennebaker & Mitchell
  30. * JPEG textbook (see REFERENCES section in file README). The following code
  31. * is based directly on figure 4-8 in P&M.
  32. * While an 8-point DCT cannot be done in less than 11 multiplies, it is
  33. * possible to arrange the computation so that many of the multiplies are
  34. * simple scalings of the final outputs. These multiplies can then be
  35. * folded into the multiplications or divisions by the JPEG quantization
  36. * table entries. The AA&N method leaves only 5 multiplies and 29 adds
  37. * to be done in the DCT itself.
  38. * The primary disadvantage of this method is that with fixed-point math,
  39. * accuracy is lost due to imprecise representation of the scaled
  40. * quantization values. The smaller the quantization table entry, the less
  41. * precise the scaled value, so this implementation does worse with high-
  42. * quality-setting files than with low-quality ones.
  43. */
  44. #define JPEG_INTERNALS
  45. #include "jinclude.h"
  46. #include "jpeglib.h"
  47. #include "jdct.h" /* Private declarations for DCT subsystem */
  48. #ifdef DCT_IFAST_SUPPORTED
  49. /*
  50. * This module is specialized to the case DCTSIZE = 8.
  51. */
  52. #if DCTSIZE != 8
  53. Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
  54. #endif
  55. /* Scaling decisions are generally the same as in the LL&M algorithm;
  56. * see jfdctint.c for more details. However, we choose to descale
  57. * (right shift) multiplication products as soon as they are formed,
  58. * rather than carrying additional fractional bits into subsequent additions.
  59. * This compromises accuracy slightly, but it lets us save a few shifts.
  60. * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
  61. * everywhere except in the multiplications proper; this saves a good deal
  62. * of work on 16-bit-int machines.
  63. *
  64. * Again to save a few shifts, the intermediate results between pass 1 and
  65. * pass 2 are not upscaled, but are represented only to integral precision.
  66. *
  67. * A final compromise is to represent the multiplicative constants to only
  68. * 8 fractional bits, rather than 13. This saves some shifting work on some
  69. * machines, and may also reduce the cost of multiplication (since there
  70. * are fewer one-bits in the constants).
  71. */
  72. #define CONST_BITS 8
  73. /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
  74. * causing a lot of useless floating-point operations at run time.
  75. * To get around this we use the following pre-calculated constants.
  76. * If you change CONST_BITS you may want to add appropriate values.
  77. * (With a reasonable C compiler, you can just rely on the FIX() macro...)
  78. */
  79. #if CONST_BITS == 8
  80. #define FIX_0_382683433 98 /* FIX(0.382683433) */
  81. #define FIX_0_541196100 139 /* FIX(0.541196100) */
  82. #define FIX_0_707106781 181 /* FIX(0.707106781) */
  83. #define FIX_1_306562965 334 /* FIX(1.306562965) */
  84. #else
  85. #define FIX_0_382683433 FIX(0.382683433)
  86. #define FIX_0_541196100 FIX(0.541196100)
  87. #define FIX_0_707106781 FIX(0.707106781)
  88. #define FIX_1_306562965 FIX(1.306562965)
  89. #endif
  90. /* We can gain a little more speed, with a further compromise in accuracy,
  91. * by omitting the addition in a descaling shift. This yields an incorrectly
  92. * rounded result half the time...
  93. */
  94. // The assembly version makes this compromise.
  95. //#ifndef USE_ACCURATE_ROUNDING
  96. //#undef DESCALE
  97. //#define DESCALE(x,n) RIGHT_SHIFT(x, n)
  98. //#endif
  99. #define DCTWIDTH 32 /* byte offset from one row of the block to the next, as used by the column-pass addressing [esi][DCTWIDTH*k] below */
  100. #define DATASIZE 4 /* byte offset from one element to the next within a row, as used by the row-pass addressing [esi][DATASIZE*k] below */
  101. /* Multiply a DCTELEM variable by an INT32 constant, and immediately
  102. * descale to yield a DCTELEM result.
  103. */
  104. #define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
  105. #if _MSC_FULL_VER >= 13008827 && defined(_M_IX86)
  106. #pragma warning(push)
  107. #pragma warning(disable:4731) // EBP modified with inline asm
  108. #endif
  109. /*
  110. * Perform the forward DCT on one block of samples.
  111. */
  112. GLOBAL(void)
  113. pfdct8x8aan (DCTELEM * data)
  114. {
  115. DCTELEM tmp4, tmp6, tmp7;
  116. int counter;
  117. __asm{
  118. /* Pass 1: process rows. */
  119. // dataptr = data;
  120. mov esi, [data]
  121. mov counter, 8
  122. // for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  123. // tmp0 = dataptr[0] + dataptr[7];
  124. // tmp7 = dataptr[0] - dataptr[7];
  125. // tmp1 = dataptr[1] + dataptr[6];
  126. // tmp6 = dataptr[1] - dataptr[6];
  127. // tmp2 = dataptr[2] + dataptr[5];
  128. // tmp5 = dataptr[2] - dataptr[5];
  129. // tmp3 = dataptr[3] + dataptr[4];
  130. // tmp4 = dataptr[3] - dataptr[4];
  131. StartRow:
  132. mov eax, [esi][DATASIZE*0]
  133. mov ebx, [esi][DATASIZE*7]
  134. mov edx, eax
  135. add eax, ebx ; eax = tmp0
  136. sub edx, ebx ; edx = tmp7
  137. mov ebx, [esi][DATASIZE*3]
  138. mov ecx, [esi][DATASIZE*4]
  139. mov edi, ebx
  140. add ebx, ecx ; ebx = tmp3
  141. sub edi, ecx ; edi = tmp4
  142. mov tmp4, edi
  143. mov tmp7, edx
  144. /* Even part */
  145. // tmp10 = tmp0 + tmp3;
  146. // tmp13 = tmp0 - tmp3;
  147. // tmp11 = tmp1 + tmp2;
  148. // tmp12 = tmp1 - tmp2;
  149. mov ecx, eax
  150. add eax, ebx ; eax = tmp10
  151. sub ecx, ebx ; ecx = tmp13
  152. mov edx, [esi][DATASIZE*1]
  153. mov edi, [esi][DATASIZE*6]
  154. mov ebx, edx
  155. add edx, edi ; edx = tmp1
  156. sub ebx, edi ; ebx = tmp6
  157. mov tmp6, ebx
  158. push ebp
  159. mov edi, [esi][DATASIZE*2]
  160. mov ebp, [esi][DATASIZE*5]
  161. mov ebx, edi
  162. add edi, ebp ; edi = tmp2
  163. sub ebx, ebp ; ebx = tmp5
  164. mov ebp, edx
  165. add edx, edi ; edx = tmp11
  166. sub ebp, edi ; ebp = tmp12
  167. // dataptr[0] = tmp10 + tmp11; /* phase 3 */
  168. // dataptr[4] = tmp10 - tmp11;
  169. mov edi, eax
  170. add eax, edx ; eax = tmp10 + tmp11
  171. sub edi, edx ; edi = tmp10 - tmp11
  172. add ebp, ecx ; ebp = tmp12 + tmp13
  173. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  174. imul ebp, FIX_0_707106781 ; ebp = z1
  175. sar ebp, 8
  176. mov [esi][DATASIZE*0], eax
  177. // dataptr[2] = tmp13 + z1; /* phase 5 */
  178. // dataptr[6] = tmp13 - z1;
  179. mov eax, ecx
  180. add ecx, ebp
  181. sub eax, ebp
  182. pop ebp
  183. mov [esi][DATASIZE*4], edi
  184. mov [esi][DATASIZE*2], ecx
  185. mov [esi][DATASIZE*6], eax
  186. mov edi, tmp4
  187. /* Odd part */
  188. // tmp10 = tmp4 + tmp5; /* phase 2 */
  189. // tmp11 = tmp5 + tmp6;
  190. // tmp12 = tmp6 + tmp7;
  191. mov ecx, tmp6
  192. mov edx, tmp7
  193. add edi, ebx ; edi = tmp10
  194. add ebx, ecx ; ebx = tmp11
  195. // z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  196. // z11 = tmp7 + z3; /* phase 5 */
  197. // z13 = tmp7 - z3;
  198. imul ebx, FIX_0_707106781 ; ebx = z3
  199. sar ebx, 8
  200. add ecx, edx ; ecx = tmp12
  201. mov eax, edx
  202. add edx, ebx ; edx = z11
  203. sub eax, ebx ; eax = z13
  204. mov ebx, edi
  205. /* The rotator is modified from fig 4-8 to avoid extra negations. */
  206. // z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  207. // z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  208. // z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  209. imul ebx, FIX_0_541196100
  210. sar ebx, 8
  211. sub edi, ecx ; edi = tmp10 - tmp12
  212. imul edi, FIX_0_382683433 ; edi = z5
  213. sar edi, 8
  214. add esi, 32
  215. imul ecx, FIX_1_306562965
  216. sar ecx, 8
  217. add ebx, edi ; ebx = z2
  218. add ecx, edi ; ecx = z4
  219. mov edi, eax
  220. // dataptr[5] = z13 + z2; /* phase 6 */
  221. // dataptr[3] = z13 - z2;
  222. // dataptr[1] = z11 + z4;
  223. // dataptr[7] = z11 - z4;
  224. add eax, ebx ; eax = z13 + z2
  225. sub edi, ebx ; edi = z13 - z2
  226. mov [esi][DATASIZE*5-32], eax
  227. mov ebx, edx
  228. mov [esi][DATASIZE*3-32], edi
  229. add edx, ecx ; edx = z11 + z4
  230. mov [esi][DATASIZE*1-32], edx
  231. sub ebx, ecx ; ebx = z11 - z4
  232. mov ecx, counter
  233. mov [esi][DATASIZE*7-32], ebx
  234. dec ecx
  235. mov counter, ecx
  236. jnz StartRow
  237. // dataptr += DCTSIZE; /* advance pointer to next row */
  238. // }
  239. /* Pass 2: process columns.*/
  240. // dataptr = data;
  241. mov esi, [data]
  242. mov counter, 8
  243. // for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
  244. // tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
  245. // tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
  246. // tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
  247. // tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
  248. // tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
  249. // tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
  250. // tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
  251. // tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
  252. StartCol:
  253. mov eax, [esi][DCTWIDTH*0]
  254. mov ebx, [esi][DCTWIDTH*7]
  255. mov edx, eax
  256. add eax, ebx ; eax = tmp0
  257. sub edx, ebx ; edx = tmp7
  258. mov ebx, [esi][DCTWIDTH*3]
  259. mov ecx, [esi][DCTWIDTH*4]
  260. mov edi, ebx
  261. add ebx, ecx ; ebx = tmp3
  262. sub edi, ecx ; edi = tmp4
  263. mov tmp4, edi
  264. mov tmp7, edx
  265. /* Even part */
  266. // tmp10 = tmp0 + tmp3;
  267. // tmp13 = tmp0 - tmp3;
  268. // tmp11 = tmp1 + tmp2;
  269. // tmp12 = tmp1 - tmp2;
  270. mov ecx, eax
  271. add eax, ebx ; eax = tmp10
  272. sub ecx, ebx ; ecx = tmp13
  273. mov edx, [esi][DCTWIDTH*1]
  274. mov edi, [esi][DCTWIDTH*6]
  275. mov ebx, edx
  276. add edx, edi ; edx = tmp1
  277. sub ebx, edi ; ebx = tmp6
  278. mov tmp6, ebx
  279. push ebp
  280. mov edi, [esi][DCTWIDTH*2]
  281. mov ebp, [esi][DCTWIDTH*5]
  282. mov ebx, edi
  283. add edi, ebp ; edi = tmp2
  284. sub ebx, ebp ; ebx = tmp5
  285. mov ebp, edx
  286. add edx, edi ; edx = tmp11
  287. sub ebp, edi ; ebp = tmp12
  288. // dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
  289. // dataptr[DCTSIZE*4] = tmp10 - tmp11;
  290. mov edi, eax
  291. add eax, edx ; eax = tmp10 + tmp11
  292. sub edi, edx ; edi = tmp10 - tmp11
  293. add ebp, ecx ; ebp = tmp12 + tmp13
  294. // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
  295. imul ebp, FIX_0_707106781 ; ebp = z1
  296. sar ebp, 8
  297. mov [esi][DCTWIDTH*0], eax
  298. // dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
  299. // dataptr[DCTSIZE*6] = tmp13 - z1;
  300. mov eax, ecx
  301. add ecx, ebp
  302. sub eax, ebp
  303. pop ebp
  304. mov [esi][DCTWIDTH*4], edi
  305. mov [esi][DCTWIDTH*2], ecx
  306. mov [esi][DCTWIDTH*6], eax
  307. mov edi, tmp4
  308. /* Odd part */
  309. // tmp10 = tmp4 + tmp5; /* phase 2 */
  310. // tmp11 = tmp5 + tmp6;
  311. // tmp12 = tmp6 + tmp7;
  312. mov ecx, tmp6
  313. mov edx, tmp7
  314. add edi, ebx ; edi = tmp10
  315. add ebx, ecx ; ebx = tmp11
  316. // z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
  317. // z11 = tmp7 + z3; /* phase 5 */
  318. // z13 = tmp7 - z3;
  319. imul ebx, FIX_0_707106781 ; ebx = z3
  320. sar ebx, 8
  321. add ecx, edx ; ecx = tmp12
  322. mov eax, edx
  323. add edx, ebx ; edx = z11
  324. sub eax, ebx ; eax = z13
  325. mov ebx, edi
  326. /* The rotator is modified from fig 4-8 to avoid extra negations. */
  327. // z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
  328. // z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
  329. // z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
  330. imul ebx, FIX_0_541196100
  331. sar ebx, 8
  332. sub edi, ecx ; edi = tmp10 - tmp12
  333. imul edi, FIX_0_382683433 ; edi = z5
  334. sar edi, 8
  335. add esi, 4
  336. imul ecx, FIX_1_306562965
  337. sar ecx, 8
  338. add ebx, edi ; ebx = z2
  339. add ecx, edi ; ecx = z4
  340. mov edi, eax
  341. // dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
  342. // dataptr[DCTSIZE*3] = z13 - z2;
  343. // dataptr[DCTSIZE*1] = z11 + z4;
  344. // dataptr[DCTSIZE*7] = z11 - z4;
  345. add eax, ebx ; eax = z13 + z2
  346. sub edi, ebx ; edi = z13 - z2
  347. mov [esi][DCTWIDTH*5-4], eax
  348. mov ebx, edx
  349. mov [esi][DCTWIDTH*3-4], edi
  350. add edx, ecx ; edx = z11 + z4
  351. mov [esi][DCTWIDTH*1-4], edx
  352. sub ebx, ecx ; ebx = z11 - z4
  353. mov ecx, counter
  354. mov [esi][DCTWIDTH*7-4], ebx
  355. dec ecx
  356. mov counter, ecx
  357. jnz StartCol
  358. } //end asm
  359. // dataptr++; /* advance pointer to next column */
  360. // }
  361. }
  362. #if _MSC_FULL_VER >= 13008827
  363. #pragma warning(pop)
  364. #endif
  365. #endif /* DCT_IFAST_SUPPORTED */